樸素貝葉斯模型的簡單實現

2021-09-13 23:47:16 字數 2915 閱讀 4670

# coding: utf-8

from numpy import *

def loaddataset():
    """Return the toy training corpus and its labels.

    Returns:
        postinglist: list of tokenized documents (each a list of words).
        classvec: per-document class label (1 = abusive, 0 = normal).
    """
    postinglist = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'i', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # BUG FIX: the original fused the closing bracket and this assignment
    # onto one line (`]classvec = ...`), which is a syntax error.
    classvec = [0, 1, 0, 1, 0, 1]
    return postinglist, classvec

# 去重,選取所有詞彙

def createvocablist(dataset):
    """Build a deduplicated vocabulary from a corpus.

    Args:
        dataset: iterable of tokenized documents (lists of words).

    Returns:
        list of unique words; order is unspecified (set-derived).
    """
    vocab = set()
    for doc in dataset:
        # set.update folds each document's words into the running vocabulary.
        vocab.update(doc)
    return list(vocab)

# 給出乙個所有詞彙的狀態圖,如果輸入的序列在詞彙表中,輸出的詞彙表狀態中對應位置為1,就是詞向量

def setofwords2vec(vocablist, inputset):
    """Convert a document to a binary word-presence vector.

    Slots follow vocablist order: a slot is 1 when that word occurs in
    inputset, else 0. Words missing from the vocabulary are reported on
    stdout and otherwise ignored.
    """
    vec = [0] * len(vocablist)
    for word in inputset:
        # EAFP: list.index raises ValueError for out-of-vocabulary words.
        try:
            vec[vocablist.index(word)] = 1
        except ValueError:
            print('the word: %s is not in my vocabulary!' % word)
    return vec

# 詞袋模型,就是統計出每種詞出現的次數

def bagofwords2vecmn(vocablist, inputset):
    """Convert a document to a word-count (bag-of-words) vector.

    Unlike setofwords2vec, each slot holds the number of occurrences of
    the corresponding vocabulary word; unknown words are silently skipped.
    """
    counts = [0] * len(vocablist)
    # Filter to in-vocabulary words first, then tally each occurrence.
    for word in (w for w in inputset if w in vocablist):
        counts[vocablist.index(word)] += 1
    return counts

def trainnb(trainmatrix, traincategory):
    """Train a binary multinomial naive Bayes model.

    Args:
        trainmatrix: 2-D array, one word-count/presence vector per document.
        traincategory: 1-D array of 0/1 labels (1 = abusive class).

    Returns:
        (p0vect, p1vect, pabusive): per-word log conditional probabilities
        for class 0 and class 1, and the prior probability of class 1.
    """
    numtraindocs = len(trainmatrix)
    numwords = len(trainmatrix[0])
    # Prior: fraction of documents labelled class 1.
    pabusive = sum(traincategory) / float(numtraindocs)
    p0num = zeros(numwords)
    p1num = zeros(numwords)
    p0denom = 0.0
    p1denom = 0.0
    # Accumulate per-word counts and total word counts for each class.
    for i in range(numtraindocs):
        if traincategory[i] == 1:
            p1num += trainmatrix[i]
            p1denom += sum(trainmatrix[i])
        else:
            p0num += trainmatrix[i]
            p0denom += sum(trainmatrix[i])
    # Laplace (add-one) smoothing, then log so likelihood products become sums.
    # BUG FIX: the original computed log(p1num + 1 / (p1denom + 6)) — missing
    # parentheses meant the counts were never divided by the denominator at
    # all. Also smooth by the vocabulary size (numwords), as additive
    # smoothing requires, rather than the hard-coded constant 6.
    p1vect = log((p1num + 1.0) / (p1denom + numwords))
    p0vect = log((p0num + 1.0) / (p0denom + numwords))
    return p0vect, p1vect, pabusive

def classifynb(vec2classify, p0vec, p1vec, pclass1):
    """Classify a word vector with log-domain naive Bayes.

    Args:
        vec2classify: word vector for the document to classify.
        p0vec, p1vec: per-word log conditional probabilities per class.
        pclass1: prior probability of class 1.

    Returns:
        1 when the class-1 posterior score beats class 0, else 0.
    """
    # Dot the word vector with each class's log-likelihoods and add the log
    # prior: products of probabilities become sums of logs.
    score1 = sum(vec2classify * p1vec) + log(pclass1)
    score0 = sum(vec2classify * p0vec) + log(1.0 - pclass1)
    return 1 if score1 > score0 else 0

def testingnb():
    """End-to-end smoke test: train on the toy corpus, classify two posts.

    Prints each test entry together with its predicted class (1 = abusive).
    """
    listoposts, listclasses = loaddataset()
    myvocablist = createvocablist(listoposts)
    # BUG FIX: the original left `trainmat =` with no right-hand side, never
    # appended the document vectors inside the loop, and called trainnb on
    # every iteration. Build the full matrix first, then train once.
    trainmat = []
    for postindoc in listoposts:
        trainmat.append(setofwords2vec(myvocablist, postindoc))
    p0v, p1v, pab = trainnb(array(trainmat), array(listclasses))
    testentry = ['love', 'my', 'dalmation']
    thisdoc = array(setofwords2vec(myvocablist, testentry))
    print(testentry, classifynb(thisdoc, p0v, p1v, pab))
    testentry = ['stupid', 'garbage']
    thisdoc = array(setofwords2vec(myvocablist, testentry))
    print(testentry, classifynb(thisdoc, p0v, p1v, pab))

# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    testingnb()

樸素貝葉斯模型

生成模型。條件概率:$P(A \mid B) = \dfrac{P(AB)}{P(B)}$。全概率公式:$P(A) = \sum_i P(A \mid B_i)\,P(B_i)$,即把所有包含 $A$ 的情況都加起來。貝葉斯公式:$P(A_k \mid B) = \dfrac{P(B \mid A_k)\,P(A_k)}{\sum_{i=1}^{k} P(B \mid A_i)\,P(A_i)}$ ...

樸素貝葉斯

樸素貝葉斯演算法是一種基於概率統計的分類方法,它主要利用貝葉斯公式對樣本事件求概率,通過概率進行分類。以下先對貝葉斯公式做個了解。對於事件 $A$、$B$,若 $P(B) > 0$,則事件 $A$ 在事件 $B$ 發生的條件下發生的概率為 $P(A \mid B) = \dfrac{P(AB)}{P(B)}$。將條件概率稍作轉化即可得到貝葉斯公式如下:$P(A \mid B) = \dfrac{P(B \mid A)\,P(A)}{P(B)}$ ...

樸素貝葉斯

1.準備資料 從文字中構建詞向量 2.訓練演算法 從詞向量計算概率 3.測試演算法 儲存為 bayes.py 檔案 參考 coding utf 8 from numpy import 文字轉化為詞向量 def loaddataset postinglist my dog has flea probl...