針對上次爬取的爸爸、媽媽、老師和自己的作文,利用 sklearn.neighbors.KNeighborsClassifier 進行分類。
import itertools
import os

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Read the file contents
# Root folder of the essay corpus. Raw string so the backslash is taken
# literally (the resulting value is the same as before).
path = r'e:\作文'

# Collect one [filepath, text, kind] row per file and build the frame once,
# instead of growing it row by row with .loc.
rows = []
for root, dirs, files in os.walk(path):
    # The class label is the name of the directory that contains the file.
    kind = os.path.basename(root)
    for name in files:
        filepath = os.path.join(root, name)
        # The context manager guarantees the handle is closed after reading.
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        rows.append([filepath, text.strip(), kind])

corpos = pd.DataFrame(rows, columns=['filepath', 'text', 'kind'])
# Set up the stop words (used when the term-frequency matrix is built).
# Current pandas releases reject sep='\n' in read_csv, so the one-column file
# is read line by line instead: the first non-blank line is the column header
# (presumably 'stopword', the column indexed later in this script -- confirm
# against the file) and the remaining lines are the entries.
# 'utf-8-sig' decodes UTF-8 with or without a BOM, as read_csv would.
with open(r'stopwords.txt', encoding='utf-8-sig') as f:
    header, *entries = [line.strip() for line in f if line.strip()]
stopwords = pd.DataFrame({header: entries})


def tokenizer(s):
    """Segment the string *s* with jieba and return the tokens as a list."""
    # NOTE(review): the published listing lost the list initialiser and the
    # loop body; collecting every token jieba yields is the presumed intent.
    return list(jieba.cut(s))
# Build the term-frequency matrix with jieba tokenisation and the stop-word
# list taken from the 'stopword' column.
count = CountVectorizer(
    tokenizer=tokenizer,
    stop_words=list(stopwords['stopword']),
)
# Column 1 of corpos is the document text; the sparse result is densified.
countvector = count.fit_transform(corpos.iloc[:, 1]).toarray()
# Convert the class labels to numbers 1..len(kind).
# return_inverse yields, for every row, the position of its label within the
# sorted unique labels; this replaces the per-class loop and removes the
# hard-coded corpus size of 700.
kind, codes = np.unique(corpos['kind'].values, return_inverse=True)
nkind = (codes + 1).astype(float)  # float, matching the former np.zeros array
# Reduce the term-frequency matrix to two dimensions and draw a scatter plot.
pca = PCA(n_components=2)
newvector = pca.fit_transform(countvector)

plt.figure()
# One colour/marker pair per class. zip stops at the shortest sequence, so at
# most four classes are drawn.
for label, c, m in zip(kind, ['r', 'b', 'g', 'y'], ['o', '^', '>', '<']):
    # Row labels of the documents belonging to this class.
    index = corpos[corpos['kind'] == label].index
    x = newvector[index, 0]
    y = newvector[index, 1]
    plt.scatter(x, y, c=c, marker=m, label=label)
plt.legend()
plt.xlim(-5, 10)
plt.ylim(-20, 50)
plt.xlabel('x label')
plt.ylabel('y label')
# Randomly pick the test set: 200 row positions drawn with replacement.
# The upper bound follows the corpus size instead of a hard-coded 700.
index = np.random.randint(0, len(corpos), 200)
x_test = countvector[index]
y_test = corpos.iloc[index, 2]

# Classify with KNN using the estimator's default settings.
# NOTE(review): the model is fitted on the whole corpus, test rows included,
# so the score below is optimistic -- confirm whether a held-out split is
# wanted before relying on it.
knn = KNeighborsClassifier()
knn.fit(countvector, corpos.iloc[:, 2])
y_pred = knn.predict(x_test)
# A bare expression is discarded when run as a script, so print the accuracy.
print(knn.score(x_test, y_test))
# Draw the confusion matrix of the KNN predictions.
knn_confusion = confusion_matrix(y_test, y_pred)
# Sample output recorded in the original listing:
# array([[61,  1,  0,  3],
#        [ 8, 35,  0,  1],
#        [ 1,  0, 53,  1],
#        [ 9,  1,  2, 24]])
# (Article caption that had been fused into the interpolation argument:
#  "The data scatter plot is shown below:")
plt.imshow(knn_confusion, interpolation='nearest', cmap=plt.cm.Oranges)
plt.xlabel('y_pred')
plt.ylabel('y_true')
tick_marks = np.arange(len(kind))
plt.xticks(tick_marks, kind, rotation=90)
plt.yticks(tick_marks, kind)
plt.colorbar()
plt.title('confustion_matrix')
# Write each count into its cell: text x is the column index (predicted),
# text y is the row index (true), hence knn_confusion[j, i].
for i, j in itertools.product(range(len(knn_confusion)), repeat=2):
    plt.text(i, j, knn_confusion[j, i], horizontalalignment="center")
knn分類結果的混淆矩陣圖如下所示:
機器學習1 KNN文字分類
思想 1.找到與資料最相近k個資料 根據余弦相似度 2.分別找出k條資料的類別,同類別相加,得到最大值,則該類別為測試資料的所屬類。encoding utf 8 from pylab import reload sys defcreatedataset group 1.0,1.1 2.0,2.1 1...
使用python進行文字分類
coding utf 8 author lishuai importnumpy defloaddataset postinglist my dog has flea problems help please maybe not take him to dog park stupid my dalma...
fasttext文字分類(python)
fasttext是word2vec作者提出的文字分類演算法。它是一個用於高效學習單詞表示和文字分類的庫。本篇部落格主要介紹fasttext在python下的基本應用 pip install fasttext訓練樣本train data.txt的格式介紹 每一行是文字 分類標籤 分類標籤最好形如 la...