參考文章: 樸素貝葉斯實現的文字分類原理
# coding=utf-8
"""created on 2017
@author: xyj
"""
import math
import os
import random

import jieba
def textprocessing(floder_path, train_size=0.8):
    """Tokenize every document under *floder_path* and split each category
    into a training part and a test part.

    Each sub-directory of *floder_path* is one category containing raw
    text files.

    :param floder_path: root directory of the corpus.
    :param train_size: fraction of each category's documents kept for
        training (default 0.8); the rest become test data.
    :return: ``(train_data_list, test_data_list, train_class_list,
        test_class_list)`` — per category: a flat training word list, a
        list of tokenized test documents, and the matching category names.
    """
    floder_list = os.listdir(floder_path)
    train_data_list = []
    train_class_list = []
    test_data_list = []
    test_class_list = []
    # Layout-noise tokens the tokenizer emits from the raw files.
    noise_tokens = ('\u3000', '\r\n', '\x00', '\n')
    for floder in floder_list:
        new_floder_path = os.path.join(floder_path, floder)
        new_floder_list = os.listdir(new_floder_path)
        word_list = []  # one entry per document: its token list
        for file in new_floder_list:
            with open(os.path.join(new_floder_path, file), 'rb') as f:
                # NOTE(review): the 'ansi' codec only exists on Windows
                # Python builds — confirm the target platform.
                raw = f.read().decode('ansi', 'ignore')
            txt_list = list(jieba.cut(raw, cut_all=False))
            # Drop whitespace/control tokens (same effect as the original
            # repeated while/remove loops, in one pass).
            txt_list = [tok for tok in txt_list if tok not in noise_tokens]
            word_list.append(txt_list)
        # Shuffle this category's documents so the train/test split is random.
        random.shuffle(word_list)
        size = int(len(word_list) * train_size)
        print(floder)
        print(size)
        tem_train_list = word_list[:size]
        tem_test_list = word_list[size:]
        # Training documents of one category are flattened into a single
        # word list; test documents stay separate so each can be classified.
        tem_train_word = []
        for a in tem_train_list:
            for b in a:
                tem_train_word.append(b)
        train_data_list.append(tem_train_word)
        train_class_list.append(floder)
        test_data_list.append(tem_test_list)
        test_class_list.append(floder)
    return train_data_list, test_data_list, train_class_list, test_class_list
def makestopwordsset(stopwords_file):
    """Read a stop-word file (one UTF-8 word per line) into a set.

    :param stopwords_file: path of the stop-word list file.
    :return: set of non-empty stop words.
    """
    words_set = set()
    with open(stopwords_file, 'rb') as f:
        for line in f.readlines():
            # Decode first, then strip the line terminator.  The original
            # line[:-2] assumed '\r\n' endings and silently ate the last
            # character of every word on '\n'-only files.
            word = line.decode('utf-8').rstrip('\r\n')
            if len(word) > 0 and word not in words_set:
                words_set.add(word)
    return words_set
def listtodict(data_list, stopwords_set=set()):
    """Count word frequencies, skipping stop words and pure-digit tokens.

    :param data_list: iterable of tokens.
    :param stopwords_set: words to ignore; never mutated, so the shared
        default set is safe here.
    :return: dict mapping word -> occurrence count.
    """
    data_dict = {}
    for word in data_list:
        if word not in stopwords_set and not word.isdigit():
            # dict.get replaces the original explicit membership branch.
            data_dict[word] = data_dict.get(word, 0) + 1
    return data_dict
def clearlist(test_list, stopwords_set=set()):
    """Filter a token list, dropping stop words and pure-digit tokens.

    :param test_list: iterable of tokens.
    :param stopwords_set: words to drop; never mutated.
    :return: list of surviving tokens, original order preserved.
    """
    # The scraped original lost the accumulation step ('test =' with no
    # append); rebuilt as a straightforward comprehension.
    return [word for word in test_list
            if word not in stopwords_set and not word.isdigit()]
def predicted(test_list, train_data_list_dict, train_class_list, train_data_count):
    """Classify one tokenized document with the naive-Bayes models.

    Sums the log10 Laplace-smoothed likelihood (via :func:`p`) of every
    word of *test_list* under each class and returns the class whose
    total log-probability is largest.

    :param test_list: tokens of the document to classify.
    :param train_data_list_dict: per-class word-frequency dicts.
    :param train_class_list: class names, aligned with the dicts.
    :param train_data_count: per-class total word counts.
    :return: the predicted class name.
    """
    predicte = []  # one summed log-likelihood per class
    for dic, count in zip(train_data_list_dict, train_data_count):
        laplace = 0
        for word in test_list:
            laplace += p(word, dic, count)
        predicte.append(laplace)
    ma = max(predicte)
    # predicte.index(ma) replaces the unidiomatic list.index(predicte, ma).
    return train_class_list[predicte.index(ma)]
def p(word, dic, count):
    """Log10 Laplace-smoothed probability of *word* under one class model.

    :param word: token to score.
    :param dic: word-frequency dict of the class.
    :param count: total word count of the class.
    :return: ``log10((freq + 1) / (count + len(dic)))`` where ``freq`` is
        the word's frequency in the class (0 if unseen); ``len(dic)`` is
        the class vocabulary size used by the add-one smoothing term.
    """
    freq = dic.get(word, 0)
    # math.log10 replaces the original math.log(x) / math.log(10);
    # dic.get collapses the two nearly identical branches into one.
    return math.log10((freq + 1) / (count + len(dic)))
def main():
    """Train a naive-Bayes text classifier on the ``reduced`` corpus and
    print each category's accuracy on the held-out 20% of documents."""
    abspath = os.path.abspath(os.path.dirname(os.getcwd()))
    # ----- stop-word set -------------------------------------------------
    # NOTE(review): this path uses a Windows separator while the corpus
    # path below uses '/' — confirm the intended platform.
    stopwords_file = abspath + '\\stopwords_cn.txt'
    stopwords_set = makestopwordsset(stopwords_file)
    # ----- data sets -----------------------------------------------------
    folder_path = abspath + '/reduced'
    train_data_list, test_data_list, train_class_list, test_class_list = \
        textprocessing(folder_path, train_size=0.8)
    # ----- training data: one word-frequency dict per class --------------
    train_data_list_dict = []
    for word_list in train_data_list:
        train_data_list_dict.append(listtodict(word_list, stopwords_set))
    print('訓練資料集處理完成')
    # ----- test data: filter every document ------------------------------
    # The scraped original rebound the loop variable ('test = clearlist(...)')
    # so the cleaned result was discarded; collect it properly instead.
    cleaned_test_data_list = []
    for test_list in test_data_list:
        cleaned_docs = []
        for test in test_list:
            cleaned_docs.append(clearlist(test, stopwords_set))
        cleaned_test_data_list.append(cleaned_docs)
    test_data_list = cleaned_test_data_list
    print('測試資料集處理完成')
    # Show each class's 200 most frequent words for a sanity check.
    for a in train_data_list_dict:
        internet_list = sorted(a.items(), key=lambda f: f[1], reverse=True)
        print(internet_list[:200])
    # Total word count per class, needed by the Laplace smoothing term.
    train_data_count = []
    for dic in train_data_list_dict:
        train_data_count.append(sum(dic.values()))
    # ----- evaluation ----------------------------------------------------
    for li, classtpye in zip(test_data_list, test_class_list):
        corr = 0
        count = 0
        for lis in li:
            name = predicted(lis, train_data_list_dict, train_class_list,
                             train_data_count)
            count += 1
            if name == classtpye:
                corr += 1
        print(classtpye + '類**成功率為 %.3f %%' % (corr * 100 / count))
樸素貝葉斯分類
1 貝葉斯分類是一類分類演算法的總稱,這類演算法均以貝葉斯定理為基礎,故統稱為貝葉斯分類。2 樸素貝葉斯的思想基礎是這樣的:對於給出的待分類項,求解在此項出現的條件下各個類別出現的概率,哪個最大,就認為此待分類項屬於哪個類別。通俗來說,就好比這麼個道理,你在街上看到一個黑人,我問你:你猜這哥們是哪來的,...
樸素貝葉斯分類
摘自寫在公司內部的wiki 要解決的問題 表中增加欄位classification,有四個取值 0 初始值,未分類 1 positive 2 normal 99 negative review submit前,由樸素貝葉斯分類器決定該條review的flag屬於negative還是positive ...
分類 樸素貝葉斯
原始的貝葉斯公式為 $P(B\mid A)=\dfrac{P(A\mid B)\,P(B)}{P(A)}$ …(1)。在分類問題中,$y$ 為類別,$x$ 為樣本特徵,則已知待分類的樣本特徵 $x$,它屬於類別 $y_i$ 的概率為 $P(y_i\mid x)=\dfrac{P(x\mid y_i)\,P(y_i)}{P(x)}=\dfrac{P(y_i)\prod_j P(x_j\mid y_i)}{P(x)}$ …(2),其中 $P(y_i)=$ 類別為 $y_i$ 的樣本數 / 總樣本...