from sklearn.datasets import fetch_mldata
from collections import Counter
import numpy as np
# Load the MNIST dataset
def load_mnist():
    mnist = fetch_mldata('MNIST original', data_home='./datasets')
    x, y = mnist["data"], mnist["target"]
    # Binarize the pixels: replace every value greater than 0 with 1
    x = np.where(x > 0, 1, x)
    x_train, x_test, y_train, y_test = x[:60000], x[60000:], y[:60000], y[60000:]
    return x_train, y_train, x_test, y_test
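
`fetch_mldata` was removed from scikit-learn in 0.20 after mldata.org went offline. A minimal alternative loader, assuming a recent scikit-learn (0.24 or later) where `fetch_openml` is the replacement and the OpenML mirror of the dataset is named 'mnist_784' (a sketch, not part of the original post):

from sklearn.datasets import fetch_openml

# Hedged sketch: equivalent loader built on fetch_openml for modern
# scikit-learn (assumption: the 'mnist_784' OpenML dataset is used in
# place of the defunct mldata.org copy).
def load_mnist_openml():
    x, y = fetch_openml('mnist_784', version=1, return_X_y=True,
                        as_frame=False, data_home='./datasets')
    y = y.astype(np.float64)   # OpenML returns the labels as strings
    x = np.where(x > 0, 1, x)  # binarize the pixels as above
    return x[:60000], y[:60000], x[60000:], y[60000:]
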
class NaiveBayes():
    prob_c = None        # prior probability of each class c
    prob_cj = None       # likelihood of dimension j given class c
    y_pred = None        # prediction result: class labels
    y_pred_prob = None   # prediction result: class probabilities

    # Train the model: compute the priors and the likelihoods
    def fit(self, x_train, y_train):
        # Prior probability P(y) of each class; iterate the classes in
        # sorted order so that list index i corresponds to digit i
        prob_c = []
        y_count = Counter(y_train)
        for key in sorted(y_count.keys()):
            prob_c.append(y_count[key] / len(y_train))
        # Conditional probability P(x_ij | y) of each dimension, where
        # x_ij is element j of sample i
        prob_cj = []   # entry [c][j]: conditional probability of class c, dimension j
        for c in sorted(y_count.keys()):
            c_train = x_train[y_train == c]   # training samples of class c
            prob_temp = []
            for j in range(x_train.shape[1]):
                c_train_j = c_train[:, j]                 # column j of the class-c samples
                c_train_j_1 = c_train_j[c_train_j == 1]   # entries of column j equal to 1
                # Add-one (Laplace) smoothing; a binary pixel has two
                # possible values, hence the +2 in the denominator
                prob_1_cj = (c_train_j_1.shape[0] + 1) / (c_train_j.shape[0] + 2)
                prob_temp.append(prob_1_cj)
            prob_cj.append(prob_temp)
        self.prob_c = prob_c
        self.prob_cj = prob_cj
        return prob_c, prob_cj
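
    # A vectorized sketch of the same computation (an addition, not part
    # of the original post): summing a binary column counts its ones, so
    # the inner loop over the 784 pixels collapses to one NumPy expression.
    def fit_vectorized(self, x_train, y_train):
        classes = sorted(Counter(y_train).keys())
        self.prob_c = [np.sum(y_train == c) / len(y_train) for c in classes]
        self.prob_cj = [
            (x_train[y_train == c].sum(axis=0) + 1) / (np.sum(y_train == c) + 2)
            for c in classes
        ]
        return self.prob_c, self.prob_cj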
    # Predict the class of each test sample
    def predict(self, x_test):
        y_pred_prob = []
        # for every test sample
        for x in x_test:
            temp_list = []
            # compute the posterior probability of every class
            for i in range(len(self.prob_c)):
                prob_cond = 1
                c_test_index = np.where(x == 1)[0]
                for z in c_test_index:
                    prob_cond *= self.prob_cj[i][z]
                post_prob = prob_cond * self.prob_c[i]
                temp_list.append(post_prob)
            y_pred_prob.append(temp_list)
        # the index of the largest posterior is the predicted class
        y_pred = np.argmax(y_pred_prob, axis=1)
        self.y_pred_prob = y_pred_prob
        self.y_pred = y_pred
        return y_pred
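
    # A log-space sketch (an addition, not part of the original post): the
    # product of hundreds of probabilities below 1 can underflow float64 to
    # exactly 0, making several posteriors tie; summing logarithms avoids
    # this while preserving the argmax.
    def predict_log(self, x_test):
        log_prob_cj = np.log(np.array(self.prob_cj))   # shape (classes, 784)
        log_prob_c = np.log(np.array(self.prob_c))
        y_pred = []
        for x in x_test:
            on = np.where(x == 1)[0]
            # log posterior of every class, up to the shared evidence term
            log_post = log_prob_c + log_prob_cj[:, on].sum(axis=1)
            y_pred.append(np.argmax(log_post))
        return np.array(y_pred)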
    # Accuracy: the fraction of predictions that match the true labels
    def lossfun(self, y_pred, y_test):
        p = 0
        for i in range(len(y_pred)):
            if y_pred[i] == y_test[i]:
                p += 1
        return p / len(y_pred)
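
    # Note (an aside, not in the original post): once both arguments are
    # NumPy arrays, the loop above is equivalent to the one-liner
    # np.mean(y_pred == y_test).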
    # Cumulative probabilities, cf. roulette-wheel selection
    def sum_prob(self, prob_list):
        sum_prob = []
        for i in range(len(prob_list)):
            temp_prob = 0
            for j in range(i + 1):
                temp_prob += prob_list[j]
            sum_prob.append(temp_prob)
        return sum_prob
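
    # Note (an aside, not in the original post): np.cumsum(prob_list)
    # computes the same running totals in a single call, and the whole
    # roulette-wheel draw in generatedata below can be replaced by
    # np.random.choice(len(prob_list), p=prob_list).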
    # Generate a new sample from the learned distributions
    def generatedata(self):
        # cumulative probabilities of P(y)
        sum_prob_c = self.sum_prob(self.prob_c)
        # draw a random number in [0, 1]
        rand_prob_c = np.random.rand()
        for i in range(len(sum_prob_c)):
            # if the draw is at most the cumulative probability of class i,
            # the new sample belongs to class i
            if rand_prob_c <= sum_prob_c[i]:
                new_c = i   # the class of the new sample
                break
        print('class of the randomly generated sample:', new_c)
        # generate the new sample's features the same way
        new_c_j = []
        sum_prob_c_j = self.prob_cj[new_c]
        for i in range(len(sum_prob_c_j)):
            rand_prob_c_j = np.random.rand()
            if rand_prob_c_j <= sum_prob_c_j[i]:
                new_c_j.append(1)
            else:
                new_c_j.append(0)
        new_c_j = np.array(new_c_j)
        print('data of the randomly generated sample:')
        print(new_c_j.reshape(28, 28))

if __name__ == '__main__':
    x_train, y_train, x_test, y_test = load_mnist()
    clf = NaiveBayes()
    prob_c, prob_cj = clf.fit(x_train, y_train)   # train the model
    y_pred = clf.predict(x_test)                  # test
    accurate = clf.lossfun(y_pred, y_test)        # compute the accuracy
    print('count per predicted class:', Counter(y_pred))
    print('accuracy:', accurate)
    clf.generatedata()   # randomly generate a new sample
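
As a sanity check (an addition, not part of the original post), scikit-learn ships this same model as `BernoulliNB`, whose default `alpha=1.0` corresponds to the add-one smoothing used above; comparing its score against `lossfun` is a quick way to validate the hand-rolled implementation:

from sklearn.naive_bayes import BernoulliNB

# Hedged sketch: compare the hand-rolled classifier with scikit-learn's
# Bernoulli naive Bayes (alpha=1.0 is add-one smoothing).
def sanity_check():
    x_train, y_train, x_test, y_test = load_mnist()
    skl = BernoulliNB(alpha=1.0)
    skl.fit(x_train, y_train)
    print('sklearn BernoulliNB accuracy:', skl.score(x_test, y_test))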