4.5.1 Processing flow
The flow is: tokenize the documents, load the stopword list, compute tf-idf word weights, train a Naive Bayes classifier, predict on the test set, and compute the accuracy.
4.5.2 Tokenize the documents
# English text: tokenize with NLTK, then attach part-of-speech tags if needed
word_list = nltk.word_tokenize(text)
nltk.pos_tag(word_list)
# Chinese text: tokenize with jieba
word_list = jieba.cut(text)
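For instance (a minimal sketch with made-up sample sentences; the NLTK calls assume the punkt and averaged_perceptron_tagger data have already been downloaded):

import jieba
import nltk

# English: split into word tokens, then tag each token's part of speech
word_list = nltk.word_tokenize("The weather is nice today")
print(nltk.pos_tag(word_list))          # [('The', 'DT'), ('weather', 'NN'), ...]

# Chinese: jieba.cut returns a generator of tokens
print(list(jieba.cut("今天天氣真好")))  # segmentation such as ['今天', '天氣', '真好']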
4.5.3 Load the stopword list
stop_words = [line.strip() for line in io.open("stop_words.txt", encoding="utf-8").readlines()]
4.5.4 Compute word weights
# Create the TfidfVectorizer, load the stopword list, and skip words that appear in more than half of the documents
tfidf_vec = TfidfVectorizer(stop_words=stop_words, max_df=0.5)
# Fit on the documents to get the tf-idf value of every word in every document (the feature space used by the classifier)
features = tfidf_vec.fit_transform(documents)
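As a quick illustration of max_df (toy documents invented here): fit_transform returns a sparse document-term matrix, vocabulary_ maps each kept word to its column index, and any word occurring in more than half of the documents is dropped.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["apple banana cherry", "apple banana date", "apple egg fig"]
vec = TfidfVectorizer(max_df=0.5)
features = vec.fit_transform(docs)  # sparse tf-idf matrix, one row per document
print(sorted(vec.vocabulary_))      # ['cherry', 'date', 'egg', 'fig'] -- 'apple' (3/3 docs)
                                    # and 'banana' (2/3 docs) exceed max_df and are dropped
print(features.shape)               # (3, 4)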
4.5.5 Create the Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0.001).fit(features, labels)
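Here alpha=0.001 is the additive (Lidstone) smoothing parameter. MultinomialNB estimates the probability of word w in class c as

P(w|c) = (N_cw + alpha) / (N_c + alpha * |V|)

where N_cw is the (tf-idf weighted) count of w in class c, N_c is the total over all words in c, and |V| is the vocabulary size. A small positive alpha keeps a word that never appears in some class from forcing that class's probability to zero.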
4.5.6 Use the classifier to predict
# Build the test set's feature matrix, reusing the training vocabulary
test_tfidf_vec = TfidfVectorizer(stop_words=stop_words, max_df=0.5, vocabulary=train_vocabulary)
test_features = test_tfidf_vec.fit_transform(test_documents)
predict = clf.predict(test_features)
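The key point is vocabulary=train_vocabulary: it forces the test matrix to have exactly the same columns, in the same order, as the training matrix, so the classifier's features line up. A self-contained toy sketch (all names and data invented):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

train_docs = ["good movie great plot", "bad movie awful plot"]
train_vec = TfidfVectorizer()
train_x = train_vec.fit_transform(train_docs)
clf = MultinomialNB(alpha=0.001).fit(train_x, ["pos", "neg"])

# reuse the fitted vocabulary so "great movie" maps onto the same columns
test_vec = TfidfVectorizer(vocabulary=train_vec.vocabulary_)
test_x = test_vec.fit_transform(["great movie"])
print(clf.predict(test_x))          # ['pos']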
4.5.7 Compute the accuracy
from sklearn import metrics
accuracy = metrics.accuracy_score(test_labels, predict_labels)
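accuracy_score is simply the fraction of positions where the true and predicted labels agree; a toy check:

from sklearn import metrics
print(metrics.accuracy_score(["pos", "neg", "pos"], ["pos", "neg", "neg"]))  # 2 of 3 match: 0.666...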
# encoding=utf-8
import jieba
import os
import io
# Load the stopword list; read as utf-8 text so the entries match the tokens produced later
stop_words = [line.strip() for line in io.open("data/stop/stopword.txt", encoding="utf-8").readlines()]
label_dic = {}  # placeholder; not used below
# Load the documents under a directory; return the tokenized documents and their labels
def load_data(data_path):
    labels = []
    document = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            # the name of the folder a file sits in is its label
            label = root.split("\\")[-1]
            labels.append(label)
            filename = os.path.join(root, file)
            with open(filename, "rb") as f:
                content = f.read()
            # jieba accepts bytes and decodes them internally
            word_list = list(jieba.cut(content))
            words = [wd for wd in word_list]
            # the vectorizer expects whitespace-separated tokens
            document.append(' '.join(words))
    return document, labels
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# Given the tokenized documents and their labels, return the vocabulary that distinguishes the documents and the trained classifier
def train(documents, labels):
    # Create the TfidfVectorizer, load the stopword list, and skip words that appear in more than half of the documents
    tfidf_vec = TfidfVectorizer(stop_words=stop_words, max_df=0.5)
    # Fit on the documents to get the tf-idf value of every word in every document (the classifier's feature space)
    features = tfidf_vec.fit_transform(documents)
    train_vocabulary = tfidf_vec.vocabulary_
    clf = MultinomialNB(alpha=0.001).fit(features, labels)
    return train_vocabulary, clf
# Given the vocabulary, the classifier, and the documents to predict, return the predicted labels
def predict(train_vocabulary, clf, document):
    # Rebuild a vectorizer on the training vocabulary so the test features align with the training columns
    test_tfidf = TfidfVectorizer(stop_words=stop_words, max_df=0.5, vocabulary=train_vocabulary)
    test_features = test_tfidf.fit_transform(document)
    predict_labels = clf.predict(test_features)
    return predict_labels
train_document, train_labels = load_data("data/train")
test_document, test_labels = load_data("data/test")
train_vocabulary, clf = train(train_document, train_labels)
predict_labels = predict(train_vocabulary, clf, test_document)
print(predict_labels)
print(test_labels)
from sklearn import metrics
x = metrics.accuracy_score(test_labels, predict_labels)
print(x)
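For comparison, the same flow can be written more compactly with an sklearn Pipeline, which stores the fitted vectorizer and reuses its idf weights at prediction time instead of re-fitting on the test set (a sketch, not the original author's code; it reuses the variables defined above):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words=stop_words, max_df=0.5)),
    ("nb", MultinomialNB(alpha=0.001)),
])
pipe.fit(train_document, train_labels)        # fit the vectorizer and classifier together
predict_labels = pipe.predict(test_document)  # transform() reuses the training idf weights
print(metrics.accuracy_score(test_labels, predict_labels))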