Python 實現 TF-IDF
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @time : 2020/7/30 20:38
# @author : fuguowen
# @site :
# @file : test05.py
# @software: pycharm
import math
from collections import Counter
# Toy corpus: each string is one document.
corpus = [
    'hello world hello ',
    'hello go',
]

# Tokenize on whitespace. str.split() with no argument also drops the
# trailing space of the first document instead of yielding an empty token.
word_list = [document.split() for document in corpus]
print(word_list)

# One Counter per document, mapping each word to its number of occurrences.
countlist = [Counter(words) for words in word_list]
def tf(word, count):
    """Return the term frequency of ``word`` within a single document.

    Args:
        word: The term to look up.
        count: Mapping of word -> number of occurrences for one document.

    Returns:
        The occurrences of ``word`` divided by the total number of words in
        the document, or ``0.0`` when the document contains no words.
    """
    total_words = sum(count.values())
    if not total_words:
        # An empty document has no terms; avoid dividing by zero.
        return 0.0
    # .get() keeps the lookup side-effect free and also works for plain dicts.
    return count.get(word, 0) / total_words
def n_containing(word, count_list):
    """Return the number of documents that contain ``word``.

    Args:
        word: The term to look for.
        count_list: Iterable of per-document word-count mappings.

    Returns:
        How many mappings in ``count_list`` have ``word`` as a key.
    """
    return sum(1 for count in count_list if word in count)
def idf(word, count_list):
    """Return the inverse document frequency of ``word`` over the corpus.

    Computes ``log(N / n)``, where ``N`` is the number of documents in
    ``count_list`` and ``n`` is the number of documents containing ``word``.
    No ``+ 1`` smoothing is applied to the denominator, so a word that occurs
    in every document scores exactly 0.

    Args:
        word: The term to score.
        count_list: Sequence of per-document word-count mappings.

    Returns:
        The natural-log IDF, or ``0.0`` when no document contains ``word``
        (which includes an empty corpus).
    """
    containing = n_containing(word, count_list)
    if not containing:
        # Without smoothing the denominator would be zero here; a term that
        # appears nowhere carries no weight.
        return 0.0
    return math.log(len(count_list) / containing)
def tfidf(word, count, count_list):
    """Return the tf-idf weight of ``word`` for one document.

    Args:
        word: The term to score.
        count: Word-count mapping of the document being scored.
        count_list: Word-count mappings of every document in the corpus.

    Returns:
        ``tf(word, count)`` multiplied by ``idf(word, count_list)``.
    """
    return tf(word, count) * idf(word, count_list)
# Example: rank every word of each document by its tf-idf score.
# for i, count in enumerate(countlist):
#     print("top words in document {}".format(i + 1))
#     scores = {word: tfidf(word, count, countlist) for word in count}
#     sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
#     for word, score in sorted_words[:]:
#         # print("\tword: {}, tf-idf: {}".format(word, round(score, 5)))
#         print("\tword: {}, tf-idf: {}".format(word, score))
# def simall(self, doc):
# """
# 找出訓練資料中所有相似的句子概率
# :param doc: 一句話的分詞list
# :return:
# """
# scores =
# for index in range(self.d):
# score = self.sim(doc, index)
# return scores
# Example: rank every word of each document by its tf-idf score.
# for i, count in enumerate(countlist):
#     print("top words in document {}".format(i + 1))
#     scores = {word: tfidf(word, count, countlist) for word in count}
#     sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
#     for word, score in sorted_words[:]:
#         # print("\tword: {}, tf-idf: {}".format(word, round(score, 5)))
#         print("\tword: {}, tf-idf: {}".format(word, score))
def simall(countlist, word):
    """Compute the tf-idf score of ``word`` in every document.

    Despite the name, this does not compare documents with each other; it
    only evaluates ``tfidf`` for ``word`` against each document in turn.

    Args:
        countlist: Per-document word-count mappings, in document order.
        word: The term to score.

    Returns:
        A list with one tf-idf score per document, in the order of
        ``countlist``. Each score is also printed as it is computed.
    """
    scores = []
    for count in countlist:
        # Compute once, then both report and collect the value.
        score = tfidf(word, count, countlist)
        print(score)
        scores.append(score)
    return scores
if __name__ == '__main__':
    # Score the word "hello" against every document of the sample corpus.
    scores = simall(countlist, "hello")
    print(scores)
TF IDF及其演算法
概念:TF-IDF(term frequency–inverse document frequency)是一種用於資訊檢索與資訊探勘的常用加權技術。TF-IDF 是一種統計方法,用以評估一字詞對於一個檔案集或一個語料庫中的其中一份檔案的重要程度。字詞的重要性隨著它在檔案中出現的次數成正比增加,但同時會...
TF IDF及其演算法
概念:TF-IDF(term frequency–inverse document frequency)是一種用於資訊檢索與資訊探勘的常用加權技術。TF-IDF 是一種統計方法,用以評估一字詞對於一個檔案集或一個語料庫中的其中一份檔案的重要程度。字詞的重要性隨著它在檔案中出現的次數成正比增加,但同時會...
TF IDF及其演算法
TF-IDF 及其演算法 概念:TF-IDF(term frequency–inverse document frequency)是一種用於資訊檢索與資訊探勘的常用加權技術。TF-IDF 是一種統計方法,用以評估一字詞對於一個檔案集或一個語料庫中的其中一份檔案的重要程度。字詞的重要性隨著它在檔案中出現的...