# coding: utf-8
from numpy import *
def loaddataset():
    """Return a toy corpus of tokenized posts and their class labels.

    Returns:
        postinglist: list of 6 documents, each a list of word tokens.
        classvec: parallel list of labels, 1 = abusive, 0 = not abusive.
    """
    postinglist = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'i', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # The original fused this assignment onto the closing bracket
    # (']classvec = ...'), a syntax error; split onto its own line.
    classvec = [0, 1, 0, 1, 0, 1]
    return postinglist, classvec
# Build the deduplicated vocabulary over the whole corpus.
def createvocablist(dataset):
    """Return a list of every unique word appearing in *dataset*."""
    vocabset = set()
    for document in dataset:
        vocabset |= set(document)  # union in each document's words
    return list(vocabset)
# Set-of-words model: mark each vocabulary word as present (1) or absent (0).
def setofwords2vec(vocablist, inputset):
    """Convert *inputset* into a 0/1 presence vector over *vocablist*.

    Words not in the vocabulary are reported and skipped.
    """
    returnvec = [0] * len(vocablist)
    for word in inputset:
        try:
            returnvec[vocablist.index(word)] = 1
        except ValueError:
            print('the word: %s is not in my vocabulary!' % word)
    return returnvec
# Bag-of-words model: count how many times each vocabulary word occurs.
def bagofwords2vecmn(vocablist, inputset):
    """Convert *inputset* into a per-word count vector over *vocablist*.

    Unknown words are silently ignored.
    """
    returnvec = [0] * len(vocablist)
    known = [word for word in inputset if word in vocablist]
    for word in known:
        idx = vocablist.index(word)
        returnvec[idx] = returnvec[idx] + 1
    return returnvec
def trainnb(trainmatrix, traincategory):
    """Train a two-class naive Bayes model from word vectors.

    Args:
        trainmatrix: 2-D numpy array, one word-count/presence vector per doc.
        traincategory: 1-D numpy array of 0/1 labels (1 = abusive).

    Returns:
        p0vect: log P(word | class 0) for each vocabulary word.
        p1vect: log P(word | class 1) for each vocabulary word.
        pabusive: prior P(class 1).
    """
    numtraindocs = len(trainmatrix)
    numwords = len(trainmatrix[0])
    pabusive = sum(traincategory) / float(numtraindocs)
    p0num = zeros(numwords)
    p1num = zeros(numwords)
    p0denom = 0.0
    p1denom = 0.0
    for i in range(numtraindocs):
        if traincategory[i] == 1:
            p1num += trainmatrix[i]
            p1denom += sum(trainmatrix[i])
        else:
            p0num += trainmatrix[i]
            p0denom += sum(trainmatrix[i])
    # Per-word conditional probabilities under each class, with add-one
    # (Laplace) smoothing, in log space so products become sums.
    # BUG FIX: the original wrote log(p1num + 1 / (p1denom + 6)), which by
    # operator precedence adds a scalar to the counts instead of smoothing;
    # the magic constant 6 is replaced by the vocabulary size.
    p1vect = log((p1num + 1.0) / (p1denom + numwords))
    p0vect = log((p0num + 1.0) / (p0denom + numwords))
    return p0vect, p1vect, pabusive
def classifynb(vec2classify, p0vec, p1vec, pclass1):
    """Classify a word vector: return 1 for class 1, else 0.

    Works in log space, so the product of per-word probabilities
    becomes a sum of log-probabilities plus the log prior.
    """
    log_p1 = log(pclass1) + sum(vec2classify * p1vec)
    log_p0 = log(1.0 - pclass1) + sum(vec2classify * p0vec)
    return 1 if log_p1 > log_p0 else 0
def testingnb():
    """Train on the toy corpus and classify two sample posts, printing results."""
    listoposts, listclasses = loaddataset()
    myvocablist = createvocablist(listoposts)
    # Build the training matrix: one 0/1 presence vector per document.
    # (The original had an empty assignment 'trainmat =' and a loop with
    # no body — the append line was lost; reconstructed here.)
    trainmat = []
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist, postindoc))
    # Train once, after the matrix is complete (not inside the loop).
    p0v, p1v, pab = trainnb(array(trainmat), array(listclasses))
    testentry = ['love', 'my', 'dalmation']
    thisdoc = array(setofwords2vec(myvocablist, testentry))
    print(testentry, classifynb(thisdoc, p0v, p1v, pab))
    testentry = ['stupid', 'garbage']
    thisdoc = array(setofwords2vec(myvocablist, testentry))
    print(testentry, classifynb(thisdoc, p0v, p1v, pab))


if __name__ == '__main__':
    testingnb()
樸素貝葉斯模型
生成模型基於條件概率。若 $P(B)>0$,條件概率定義為 $P(A\mid B)=\dfrac{P(AB)}{P(B)}$。全概率公式:$P(A)=\sum_i P(A\mid B_i)\,P(B_i)$,即把所有包含 $A$ 的情況都加起來。由此得貝葉斯公式:$P(A_k\mid B)=\dfrac{P(B\mid A_k)\,P(A_k)}{\sum_{i=1}^{k} P(B\mid A_i)\,P(A_i)}$。
樸素貝葉斯
樸素貝葉斯演算法是一種基於概率統計的分類方法,它主要利用貝葉斯公式對樣本事件求概率,通過概率進行分類。以下先對貝葉斯公式做個了解。對於事件 $A$、$B$,若 $P(B)>0$,則事件 $A$ 在事件 $B$ 發生的條件下發生的概率為 $P(A\mid B)=\dfrac{P(AB)}{P(B)}$。將條件概率稍作轉化即可得到貝葉斯公式:$P(A\mid B)=\dfrac{P(B\mid A)\,P(A)}{P(B)}$。
樸素貝葉斯
實現步驟:1. 準備資料——從文字中構建詞向量;2. 訓練演算法——從詞向量計算概率;3. 測試演算法。程式碼儲存為 bayes.py 檔案。(此段後原為上方程式碼的截斷重複,不再贅述。)