# -*- coding: utf-8 -*-
"""created on mon apr 15 12:52:24 2019
@author: lccfm
"""
import os
import struct
import sys
from collections import defaultdict

import numpy as np
def normalize(data):
    """Binarize a pixel array in place: every non-zero entry becomes 1.

    Args:
        data: NumPy array of pixel intensities (any shape). It is modified
            in place.

    Returns:
        The same array object that was passed in, now holding only 0 and 1.
    """
    # One boolean-mask assignment replaces the per-pixel Python double loop.
    # Entries that are already zero need no write, so only the non-zero ones
    # are touched.
    data[data != 0] = 1
    return data
def transforms(imgs, side=28):
    """Reshape a batch of flattened images into square 2-D images.

    NOTE(review): the previous body overwrote every row with ``range(l)`` and
    then tried to store a (28, 28) array into a flat row, which raised
    ``ValueError`` for any non-empty input. Reshaping the flattened rows is
    the presumed intent -- confirm before relying on this helper.

    Args:
        imgs: 2-D array of shape ``(count, side * side)``, one flattened
            image per row.
        side: Edge length of each square image. Defaults to 28, the value
            that was hard-coded before.

    Returns:
        An array of shape ``(count, side, side)`` holding the same pixel
        values as ``imgs``.

    Raises:
        ValueError: If the row length is not ``side * side``.
    """
    count, length = imgs.shape
    if length != side * side:
        raise ValueError(
            'expected rows of length %d, got %d' % (side * side, length)
        )
    return imgs.reshape(count, side, side)
def read_data_sets(dir, one_hot=True):
    """Load the MNIST IDX files found in ``dir`` into NumPy arrays.

    Args:
        dir: Directory that contains the four unpacked IDX files.
        one_hot: When true, labels are returned as a ``(size, max_label + 1)``
            array with a single 1 per row; otherwise as a 1-D integer array.

    Returns:
        A ``defaultdict(dict)`` with:
          * ``data_set['img_shape']`` -- ``(row, column, 1)`` taken from the
            header of the last image file read;
          * ``data_set[split]['images']`` -- float32 array of shape
            ``(size, row * column)``, binarized by ``normalize``;
          * ``data_set[split]['labels']`` -- labels as described above;
        where ``split`` is ``'test'`` or ``'train'``.
    """
    # NOTE(review): the file table was missing from the source. These are the
    # standard names of the unpacked MNIST IDX files -- confirm they match the
    # files on disk. In each list the image file comes first, the label file
    # second; the loop below relies on that order.
    files = {
        'test': ['t10k-images.idx3-ubyte', 't10k-labels.idx1-ubyte'],
        'train': ['train-images.idx3-ubyte', 'train-labels.idx1-ubyte'],
    }
    data_set = defaultdict(dict)
    for key, value in files.items():
        for i, fn in enumerate(value):
            # The context manager closes the handle even if read() raises.
            with open(os.path.join(dir, fn), 'rb') as file:
                f = file.read()
            if not i:
                # Image file: header of four big-endian unsigned 32-bit ints
                # (magic, image count, rows, columns), then one unsigned byte
                # per pixel.
                img_index = struct.calcsize('>IIII')
                _, size, row, column = struct.unpack_from('>IIII', f)
                # Pixels must be read as unsigned bytes; a signed read would
                # turn intensities above 127 into negative numbers.
                imgs = np.frombuffer(
                    f, dtype=np.uint8, count=size * row * column,
                    offset=img_index,
                )
                data_set['img_shape'] = (row, column, 1)
                # astype() makes a writable copy (frombuffer views are
                # read-only), which normalize() then binarizes in place.
                imgs = imgs.reshape(size, row * column).astype(np.float32)
                imgs = normalize(imgs)
                data_set[key]['images'] = imgs
            else:
                # Label file: header of two big-endian unsigned 32-bit ints
                # (magic, label count), then one unsigned byte per label.
                label_index = struct.calcsize('>II')
                _, size = struct.unpack_from('>II', f)
                labels = np.frombuffer(
                    f, dtype=np.uint8, count=size, offset=label_index,
                ).astype(np.int64)
                if one_hot:
                    tmp = np.zeros((size, np.max(labels) + 1))
                    tmp[np.arange(size), labels] = 1
                    labels = tmp
                data_set[key]['labels'] = labels
    return data_set
def train(data_set):
    """Estimate the naive Bayes parameters from the training split.

    Args:
        data_set: Mapping where ``data_set['train']['images']`` is a 2-D
            array ``(num_image, dimsnum)`` of pixel values and
            ``data_set['train']['labels']`` is a 2-D array
            ``(num_image, labelnum)``. The class of a sample is the argmax of
            its label row, so labels are expected to be one-hot encoded.

    Returns:
        A tuple ``(label_sum, label_shape)``:
          * ``label_sum`` -- shape ``(labelnum,)``, the fraction of training
            samples in each class, i.e. the prior p(w_i);
          * ``label_shape`` -- shape ``(labelnum, dimsnum)``, the per-class
            pixel sums smoothed as ``(sum + 1) / (class_count + 2)``.
    """
    imgs = data_set['train']['images']
    labels = data_set['train']['labels']
    num_image, dimsnum = imgs.shape
    _, labelnum = labels.shape

    # Class index of every sample, and how many samples each class has.
    classes = np.argmax(labels, axis=1)
    label_sum = np.bincount(classes, minlength=labelnum).astype(np.float64)

    # Per-class pixel sums via one matrix product: membership[i, c] is 1 when
    # sample i belongs to class c, so membership.T @ imgs adds up the rows of
    # imgs class by class (replaces the triple Python loop).
    membership = np.zeros((num_image, labelnum))
    membership[np.arange(num_image), classes] = 1
    label_shape = membership.T @ imgs

    # Add-one smoothing keeps every estimate strictly between 0 and 1.
    label_shape = (label_shape + 1) / (label_sum[:, np.newaxis] + 2)
    label_sum = label_sum / num_image  # class prior p(w_i)
    return label_sum, label_shape
def test(data_set, pyjk1, pyj):
    """Classify the test split with naive Bayes and count correct answers.

    Scores are computed in log space. Multiplying one probability per pixel
    underflows to 0.0 for long feature vectors, which makes every class tie
    and silently selects class 0.

    Args:
        data_set: Mapping where ``data_set['test']['images']`` is a 2-D array
            ``(num, dimsnum)`` and ``data_set['test']['labels']`` is a 2-D
            array ``(num, labelnum)``; the true class of a sample is the
            argmax of its label row.
        pyjk1: Array ``(labelnum, dimsnum)``; ``pyjk1[j][k]`` is the
            probability that pixel ``k`` equals 1 given class ``j``.
        pyj: Array ``(labelnum,)`` of class priors.

    Returns:
        A tuple ``(acc, num)``: the number of correctly classified samples
        and the total number of samples.
    """
    imgs = data_set['test']['images']
    labels = data_set['test']['labels']
    num, dimsnum = imgs.shape
    num1, labelnum = labels.shape

    # A pixel counts as "on" only when it equals exactly 1; anything else is
    # treated as "off".
    pixel_on = (imgs == 1).astype(np.float64)

    # Clip so the logarithms stay finite even if a caller passes an exact 0
    # or 1 as a conditional probability.
    eps = np.finfo(np.float64).eps
    cond = np.clip(np.asarray(pyjk1, dtype=np.float64)[:labelnum],
                   eps, 1 - eps)
    log_on = np.log(cond)
    log_off = np.log1p(-cond)
    with np.errstate(divide='ignore'):
        # A zero prior becomes -inf, so that class is never selected.
        log_prior = np.log(np.asarray(pyj, dtype=np.float64)[:labelnum])

    # log p(y_j) + sum_k [x_k log p_jk + (1 - x_k) log(1 - p_jk)], rearranged
    # so that a single matrix product scores every sample against every class.
    scores = pixel_on @ (log_on - log_off).T + log_off.sum(axis=1) + log_prior

    # argmax returns the first maximum, so ties go to the lowest class index.
    predicted = np.argmax(scores, axis=1)
    actual = np.argmax(labels, axis=1)
    acc = int(np.count_nonzero(predicted == actual))
    return acc, num


# The function name starts with "test"; mark it so pytest does not try to
# collect it as a test case when this module is imported into a test file.
test.__test__ = False
if __name__ == '__main__':
    # The data directory can be passed as the first command-line argument;
    # without one, the previously hard-coded location is used.
    data_dir = sys.argv[1] if len(sys.argv) > 1 else 'c:/users/lccfm/desktop/data/'
    data_set = read_data_sets(data_dir)
    # train() returns (class priors, per-class pixel probabilities); test()
    # takes them in the opposite order.
    label_sum, label_shape = train(data_set)
    acc, num = test(data_set, label_shape, label_sum)
    print(acc/num)
    print('test accuracy is: %f' % (acc/num))
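# 利用樸素貝葉斯分類器實現手寫數字的識別
# 條件:類別數一定,$\omega_i,\ i = 1, 2, \dots, c$;已知類先驗概率 $P(\omega_i)$ 和類條件概率密度 $p(x \mid \omega_i)$,$i = 1, 2, \dots, c$。兩類情況:if $P(\omega_1 \mid x) > P(\omega_2 \mid x)$ then $x \in \omega_1$;if $P(\omega_1 \mid x) < P(\omega_2 \mid x)$ then $x \in \omega_2$。多類情況:if $P(\omega_i \mid x) = \max_{j} P(\omega_j \mid x)$ then $x \in \omega_i$。資料集包括四部分:訓練影象、訓練標籤(表示影象為哪個數字)、測試影象、測試標籤。二值特徵提取:將影象進行分割處理,轉化為0、1數字資訊,方便...
# Python實現貝葉斯分類器
# 使用樸素貝葉斯分類器,對一篇文章進行分類處理:對中文進行分詞處理(jieba分詞);對分開的詞語進行處理,去除重複詞彙,去除標點和單個虛擬詞彙,如「你、我、他」。選擇特徵詞很重要,要總結出符合某一型別的關鍵特徵詞對分類器進行訓練,即傳入一些已經分好類的文章,讓分類器可以知道其中的一些特徵詞。計算出特徵詞...
# matlab 實現貝葉斯分類器
# 網上有很多文章介紹貝葉斯原理,這裡推薦個鏈結。這裡再說貝葉斯分類器的設計步驟:1. 對每個簇的資料求均值 mu 和協方差矩陣 sigma;2. 對測試資料,將其對每個簇用均值和協方差矩陣求相關性;3. 將資料分類到相關性大的簇中。分類器函式 bayesclassifer.m function labels b...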