#coding:utf-8__author__ ='lishuai'
importnumpy
def loaddataset():
    """Return the toy training corpus.

    Returns:
        (postinglist, classvec): six tokenized posts and their labels,
        where 0 = non-abusive and 1 = contains abusive words.
    """
    postinglist = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'i', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # 0 means the post has no abusive words, 1 means it contains them.
    classvec = [0, 1, 0, 1, 0, 1]
    return postinglist, classvec
# Build the vocabulary from the selected documents.
def createvocablist(dataset):
    """Return a list of the unique words across all documents.

    Args:
        dataset: iterable of token lists.

    Returns:
        list of unique words (order is arbitrary, set-derived).
    """
    vocabset = set()
    for document in dataset:
        # Set union accumulates each document's distinct tokens.
        vocabset = vocabset | set(document)
    return list(vocabset)
# Vectorize one document against the vocabulary (set-of-words model).
def setofword2vec(vocablist, inputdata):
    """Convert a token list to a 0/1 presence vector over *vocablist*.

    Args:
        vocablist: list of vocabulary words (index defines position).
        inputdata: tokens of one document.

    Returns:
        list of ints, 1 where the vocab word appears in the document.
    """
    returnvec = [0] * len(vocablist)
    for document in inputdata:
        if document in vocablist:
            # Set-of-words: mark presence only, not frequency.
            returnvec[vocablist.index(document)] = 1
        else:
            # print() call form works under both Python 2 and 3;
            # message typo ("vocabyulary") fixed.
            print("the word: %s is not in my vocabulary!" % document)
    return returnvec
def makematrix(vocablist, inputdata):
    """Vectorize every document in *inputdata* against *vocablist*.

    NOTE(review): the original body was truncated (`trainmatr =` with no
    value and a dangling loop); reconstructed as the standard helper that
    stacks setofword2vec() vectors — confirm against the original intent.

    Args:
        vocablist: vocabulary word list.
        inputdata: iterable of tokenized documents.

    Returns:
        list of 0/1 presence vectors, one per document.
    """
    trainmatrix = []
    for document in inputdata:
        trainmatrix.append(setofword2vec(vocablist, document))
    return trainmatrix
# Train the naive Bayes model on the vectorized documents.
def trainnb(trainmatrix, traincategory):
    """Estimate naive Bayes parameters with Laplace smoothing.

    Args:
        trainmatrix: list/array of 0-1 word vectors, one per document.
        traincategory: parallel list of labels (1 = abusive, 0 = not).

    Returns:
        (pabusive, log_p1, log_p0): prior P(class=1) and the per-word
        log conditional probability vectors for class 1 and class 0.
        Log space avoids floating-point underflow when many small
        probabilities are later multiplied (summed as logs).
    """
    numtraindocs = len(trainmatrix)
    numwords = len(trainmatrix[0])
    # float() keeps the division true division under Python 2 as well.
    pabusive = sum(traincategory) / float(numtraindocs)
    # Laplace smoothing: counts start at 1, denominators start at 2,
    # so unseen words never produce a zero probability.
    p1vector = numpy.ones(numwords)
    p0vector = numpy.ones(numwords)
    p1documentwords = 2
    p0documentwords = 2
    for i in range(numtraindocs):
        if traincategory[i] == 1:
            p1vector += trainmatrix[i]
            p1documentwords += sum(trainmatrix[i])
        else:
            p0vector += trainmatrix[i]
            p0documentwords += sum(trainmatrix[i])
    p1 = p1vector / p1documentwords
    p0 = p0vector / p0documentwords
    return pabusive, numpy.log(p1), numpy.log(p0)
# Classify the document vector *testvector* with the trained model.
def classifynb(testvector, p1vector, p0vector, pclass):
    """Return 1 (abusive) or 0 (not) for a word-presence vector.

    Args:
        testvector: 0-1 word vector for the document to classify.
        p1vector: log P(word | class=1) vector from trainnb().
        p0vector: log P(word | class=0) vector from trainnb().
        pclass: prior probability P(class=1).

    Returns:
        1 if the class-1 log posterior dominates, else 0.
    """
    # Element-wise product selects the log-probs of present words;
    # sums of logs replace products of probabilities.
    p1 = sum(testvector * p1vector) + numpy.log(pclass)
    p0 = sum(testvector * p0vector) + numpy.log(1 - pclass)
    if p1 > p0:
        return 1
    else:
        return 0
使用awk sort uniq進行文字分析
問題 處理一下檔案內容,將網域名稱取出並根據網域名稱進行計數排序處理 root web01 cat access.log 此類問題是運維工作中最常見的問題。可以演變成分析日誌,檢視tcp各個狀態連線數,檢視單ip連線數排名等等。root web01 awk f access.log sort uni...
python使用KNN文字分類
上次爬取的爸爸 媽媽 老師和自己的作文,利用sklearn.neighbors.kneighborsclassifier進行分類。import jieba import pandas as pd import numpy as np import osimport itertools import ...
pyltp庫的使用進行文字分析
pyltp目前支援python3.6不支援3.7 用python3.7安裝了很久都沒有安裝成功 無奈換成了3.6 from pyltp import sentencesplitter sents sentencesplitter.split 元芳你怎麼看?我就趴視窗上看唄!分句 切割句子。分詞 im...