#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""利用simhash進行文字去重
"""from simhash import simhash, simhashindex
import jieba
import codecs
import datetime
import os


class dudup(object):
    def __init__(self, data_dir='../data/', model_dir='../model/', file_name='test_data.txt',
                 clean_file_name='test_data_clean.txt'):
        self.data_dir = data_dir
        self.model_dir = model_dir
        self.file_name = file_name
        self.clean_file_name = clean_file_name

    def stop_word_list(self, stop_words_path):
        """Load the stop-word list, one word per line."""
        with codecs.open(stop_words_path, 'r', encoding='utf-8') as f:
            stopwords = [x.strip() for x in f.readlines()]
        return stopwords

    def tokenization(self, line):
        """
        :param line: one line of raw data
        :return: the line's tokens after jieba segmentation and stop-word removal
        """
        result = []
        stopwords = self.stop_word_list(self.data_dir + 'stopwords.txt')
        words = jieba.lcut(line)
        for word in words:
            if word not in stopwords:
                result.append(word)
        return result

    def read_data(self, file):
        """Read a file under data_dir and return its non-empty lines."""
        data_list = []
        with open(self.data_dir + file, encoding='utf-8') as data:
            for line in data.readlines():
                if line.strip():
                    data_list.append(line.strip())
        return data_list

    def get_data_dict(self):
        """Build a {line id (as str): cleaned sentence} dict for indexing."""
        data_dic = {}
        index = 1
        clean_data = []
        if not os.path.exists(self.data_dir + self.clean_file_name):
            # First run: tokenize the raw data and cache the cleaned text.
            with open(self.data_dir + self.clean_file_name, 'w', encoding='utf-8') as cleaned_data:
                for sent in self.read_data(self.file_name):
                    clean_line = self.tokenization(sent)
                    clean_data.append(' '.join(clean_line))
                    cleaned_data.write(' '.join(clean_line) + '\n')
        else:
            clean_data = self.read_data(self.clean_file_name)
        for line in clean_data:
            data_dic[str(index)] = line
            index += 1
        # print(data_dic)
        return data_dic

    def get_index(self):
        """Fingerprint every cleaned line and build a SimhashIndex over them."""
        data_dic = self.get_data_dict()
        print(data_dic)  # print the cleaned-line dictionary
        line_score = [(id, Simhash(sent)) for id, sent in data_dic.items()]
        # k is the Hamming-distance tolerance used when querying for near duplicates.
        index = SimhashIndex(line_score, k=2)
        return index


if __name__ == '__main__':
    start_time = datetime.datetime.now()
    find_dup = dudup()
    sim_hash_index = find_dup.get_index()
    inp = '「全椒縣經開區汙水處理廠****提標改造裝置採購二次'
    inp_sim_hash = Simhash(' '.join(find_dup.tokenization(inp)))
    result_index = sim_hash_index.get_near_dups(inp_sim_hash)
    if len(result_index):
        print('duplicate line id\t', result_index[0])
        raw_data_list = find_dup.read_data(find_dup.file_name)
        print('duplicate title\t', raw_data_list[int(result_index[0]) - 1])
    else:
        print('no duplicate lines found')
    end_time = datetime.datetime.now()
    print("consume time is %f minutes." % ((end_time - start_time).seconds * 1.0 / 60))

How simhash-based text deduplication works

simhash is the algorithm Google uses to deduplicate text at web scale. It maps a document to a 64-bit fingerprint (a "feature signature"), so deciding whether two documents are duplicates reduces to measuring the Hamming distance between their fingerprints; empirically, a Hamming distance below 3 is usually enough to call two documents similar. A traditional hash function only tries to map content to a uniformly random signature, so two documents differing by a single character can end up with completely unrelated hashes, whereas simhash is locality-sensitive and keeps the fingerprints of similar documents close together.
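To make the fingerprint-and-distance idea concrete, the same simhash package can show both pieces directly; the two sentences below are arbitrary examples chosen for this illustration.

from simhash import Simhash

a = Simhash('the quick brown fox jumps over the lazy dog')
b = Simhash('the quick brown fox jumped over a lazy dog')

print(format(a.value, '064b'))  # the 64-bit fingerprint as a bit string
print(a.distance(b))            # Hamming distance between the two fingerprints

A small distance means the two texts share most of their weighted features, which is exactly the property SimhashIndex exploits when answering get_near_dups queries.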