(1) There are m documents in total.
(2) The i-th document contains ki words.
(3) The number of times word j appears in document i is denoted ci,j.
(4) The number of documents containing word j is lj.
tf = ci,j / ki    (tf, term frequency: how often the word appears in that document)
idf (inverse document frequency): the logarithm of the ratio of the total document count m to the number of documents containing word j
idf = log(m / (1 + lj))    // the "+1" in the denominator guards against lj being 0
tf-idf = tf * idf
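As a quick sanity check of these formulas, here is a minimal worked example; the three toy documents and the chosen word are made up purely for illustration.

import math

# Three made-up, already-tokenized documents (illustration only)
docs = [["apple", "banana", "apple"],
        ["banana", "cherry"],
        ["apple", "cherry", "cherry", "durian"]]

i, word = 2, "durian"                       # look at the word "durian" in document i = 2
m = len(docs)                               # total number of documents: 3
k_i = len(docs[i])                          # number of words in document i: 4
c_ij = docs[i].count(word)                  # occurrences of the word in document i: 1
l_j = sum(1 for d in docs if word in d)     # number of documents containing the word: 1

tf = c_ij / k_i                             # 1 / 4
idf = math.log(m / (1 + l_j))               # log(3 / 2) ≈ 0.405
print(tf * idf)                             # tf-idf ≈ 0.101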
(1) Only a single file is used to walk through the tf-idf computation, so the way the tf and idf formulas are interpreted shifts slightly; a real application over multiple documents needs to adjust them.
(2) For tf, the denominator is the total number of words contained in this file.
This is only a rough sketch of how tf-idf is computed. For the data and code, see zexinyan's GitHub.
import jieba
import nltk
import math
# Read the file line by line; every non-empty line becomes one element of the list
def load_txt(filename):
    lists = []
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as f:
        for each in f.readlines():
            if each != '':
                lists.append(each.strip('\n'))
    return lists  # list[str1, str2, ...]
# Read the stopword file line by line; after stripping the newline and carriage return, each line becomes one list element
def load_stopwords(filename):
    stopwords = []
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as f:
        for each in f.readlines():
            each = each.strip('\n')
            each = each.strip('\r')
            stopwords.append(each)
    return stopwords  # list[str1, str2, ...]
# Return the sentences with stopwords and spaces removed
def get_seg_list(array, stopwords):
    seg_list = []
    for each in array:  # read one list element at a time, i.e. one line of the txt file
        local_list = jieba.cut(each, cut_all=False)  # word segmentation
        final_list = []
        for word in local_list:  # process the segmented tokens
            if word not in stopwords and word != ' ':  # keep words of this line that are neither stopwords nor spaces
                final_list.append(word)  # add them to the list
        seg_list.append(final_list)  # collect the per-line lists; each element is itself a list
        print(final_list)  # inspect the segmented, filtered line
    return seg_list  # list[list1, list2, ...]
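To make the segmentation step concrete, here is a tiny sketch of what jieba.cut yields; the sample sentence and one-word stopword list are invented, and the exact segmentation can vary with jieba's dictionary.

import jieba

sentence = "我爱自然语言处理"      # made-up sample sentence
stopwords = ["我"]                 # made-up stopword list

tokens = jieba.cut(sentence, cut_all=False)   # generator of tokens in precise mode
filtered = [w for w in tokens if w not in stopwords and w != ' ']
print(filtered)   # e.g. ['爱', '自然语言', '处理']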
# For each list inside seg_list, count how often every distinct word occurs;
# returns a list whose elements are the nltk.FreqDist objects built from the lists in seg_list
def get_freq_dist(seg_list):
    freq_dist = []
    for each in seg_list:  # visit every list inside seg_list in turn
        freq_dist.append(nltk.FreqDist(each))  # build a FreqDist for each list and collect it
    return freq_dist  # list[FreqDist1(), FreqDist2(), ...]
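nltk.FreqDist behaves like a Counter: indexing it with a word it has not seen returns 0 instead of raising a KeyError, which is what lets compute_tf_idf below simply accumulate each[word] over every line. A quick illustration with toy tokens:

import nltk

fd = nltk.FreqDist(["apple", "banana", "apple"])
print(fd["apple"])    # 2
print(fd["cherry"])   # 0 -- unseen words count as zero, no KeyError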
# Remove the duplicates across the different lists in seg_list
def get_words_set(seg_list):
    word_set = set()  # an unordered set, so its iteration order can change between runs
    for each in seg_list:  # visit every element of seg_list in order (each one is itself a list)
        for word in each:  # within one particular list
            word_set.add(word)  # add its words to word_set
    return word_set  # set
# Number of all words in seg_list (duplicate words included),
# i.e. the word count of the whole text after segmentation and filtering
def compute_words_num(seg_list):
    total = 0
    for each in seg_list:
        total += len(each)
    return total
def compute_tf_idf(word_set, freq_dist, words_num, total_seg_list):
    word_dist = {}
    for word in word_set:  # visit the deduplicated word set
        tf = 0
        for each in freq_dist:  # visit every FreqDist object in turn
            tf += each[word]  # count how often the word occurs
        tf /= words_num  # compute tf: with only one file, its total word count serves as the denominator
        total_num = len(total_seg_list)
        exist_num = 0
        for each in total_seg_list:  # count the lines ("documents") that contain the word
            if word in each:
                exist_num += 1
        # compute the idf value; the +1 keeps the denominator from being 0
        idf = math.log(total_num / (1 + exist_num))
        # compute the tf-idf value of the word and store it in the dict
        word_dist[word] = tf * idf
    return word_dist
# ---------------------------------- test part ----------------------------------
neg_list = load_txt('tf_idf_data.txt')                     # list[str1, str2, ...]
print(neg_list)
stops_list = load_stopwords('./corpus/stopwords.txt')      # list[str1, str2, ...]
print(stops_list)
seq_list = get_seg_list(neg_list, stops_list)              # list[list1, list2, ...]
print(seq_list)
freq_list = get_freq_dist(seq_list)                        # list[FreqDist1(), FreqDist2(), ...]
print(freq_list)
words_set = get_words_set(seq_list)                        # set
print(words_set)
word_num = compute_words_num(seq_list)
print(word_num)
words_tf_idf = compute_tf_idf(words_set, freq_list, word_num, seq_list)
print(words_tf_idf)
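words_tf_idf is a plain dict mapping each word to its score, so a natural follow-up (not part of the original code, just a usage sketch) is to sort it and look at the highest-scoring words:

# Hypothetical follow-up: show the ten words with the highest tf-idf scores
top_words = sorted(words_tf_idf.items(), key=lambda kv: kv[1], reverse=True)[:10]
for word, score in top_words:
    print(word, score)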