需求:一個 CSV 檔案中有很多行,每行包含一個 id 和一個字串,這些字串可能兩兩相似(是相似,不是相同)。怎樣去重,只保留彼此相似度小於 0.8 的 id?
做法:用 difflib 庫計算兩兩相似度(這一步可以用程序/執行緒/協程加速),把相似度不小於 0.8 的每一對 id 都放進一個 set;最後從 set 中任取一個 id,再加上所有不在 set 中的 id,就得到去重結果。
import os
import sys

try:  # Python 3
    from html.parser import HTMLParser
except ImportError:  # Python 2
    from HTMLParser import HTMLParser
from multiprocessing import Queue
# Process-safe queue that csv.diff() fills with the ids of similar rows;
# main() reads it back once the comparison has finished.
wait_set = Queue()
class htmlstripper(HTMLParser):
    """HTML parser that keeps only the text found between tags.

    Feed markup with ``feed()``, then read the accumulated text with
    ``get_data()``.
    """

    def __init__(self):
        # Call the base initialiser explicitly (not via super(): the Python 2
        # base is an old-style class). It runs reset() and, on Python 3,
        # also sets up state that feed() relies on.
        HTMLParser.__init__(self)
        self.fed = []  # text fragments in document order

    def handle_data(self, d):
        """Collect one run of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined into a single string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text content of *html* with the markup removed.

    *html* may be a UTF-8 encoded byte string or an already decoded text
    string; only byte strings are decoded, so decoded text is not decoded
    a second time.
    """
    if isinstance(html, bytes):
        html = html.decode('utf-8')
    s = htmlstripper()
    s.feed(html)
    # close() makes the parser process anything it is still buffering, so
    # trailing text is not lost.
    s.close()
    return s.get_data()
def distance(s1, s2):
    """Return the similarity ratio of *s1* and *s2* as a float in [0, 1].

    Despite the name, a larger value means the strings are more alike:
    1.0 for identical sequences, 0.0 when they have nothing in common.
    """
    import difflib
    return difflib.SequenceMatcher(None, s1, s2).ratio()
class csv(object):
    """Rows of an (id, text) CSV file plus pairwise-similarity helpers.

    ``items`` holds every row that has at least two fields, in file order;
    field 0 is used as the row id and field 1 as the text to compare.
    """

    def __init__(self, csvfile, delimiter=',', quotechar='"'):
        """Load *csvfile* into ``self.items``.

        Rows with fewer than two fields (for example blank lines) are
        skipped because they have no text to compare.
        """
        # Local alias: inside this module the name ``csv`` is this class.
        import csv as csv_module

        self.csvfile = csvfile
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.items = []

        # Raise the per-field size cap as far as the platform accepts.
        # sys.maxsize does not fit the underlying C long everywhere (e.g.
        # Windows), where it raises OverflowError, so back off until it fits.
        limit = sys.maxsize
        while True:
            try:
                csv_module.field_size_limit(limit)
                break
            except OverflowError:
                limit //= 2

        # The csv module wants a binary file on Python 2 and a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] >= 3:
            handle = open(csvfile, newline='', encoding='utf-8')
        else:
            handle = open(csvfile, 'rb')
        with handle as f:
            reader = csv_module.reader(f, delimiter=delimiter,
                                       quotechar=quotechar)
            self.items = [row for row in reader if len(row) >= 2]

    def diff(self, threshold=None, strip=None, truncate=None, start=0, end=None):
        """Compare rows ``start:end`` against every later row.

        Each row in the slice is compared once with every row that follows
        it in the file. For every pair whose similarity ratio is not below
        *threshold*, both ids are put on the module-level ``wait_set``
        queue. A falsy *threshold* reports every pair.

        threshold -- minimum ratio for a pair to be reported.
        strip     -- when true, HTML tags are removed before comparing.
        truncate  -- when set, only the first *truncate* characters of each
                     (stripped) text are compared.
        start/end -- slice of ``self.items`` used as the left-hand side.
        """
        def prepare(raw):
            text = strip_tags(raw) if strip else raw
            return text[:truncate] if truncate else text

        # Strip and truncate each text once up front instead of once per
        # pair; the comparison itself is unchanged.
        candidates = [(row[0], prepare(row[1])) for row in self.items[start:]]
        left_count = len(self.items[start:end])

        for offset, (left_id, left_text) in enumerate(candidates[:left_count]):
            # Only later rows: earlier pairs were already compared, and a
            # row is never compared with itself.
            for right_id, right_text in candidates[offset + 1:]:
                score = distance(left_text, right_text)
                if threshold and score < threshold:
                    continue
                # Results travel through the queue because this method may
                # be running in a worker process.
                wait_set.put(left_id)
                wait_set.put(right_id)

    def output(self, data):
        """Print the strings in *data* as one comma-joined line and flush."""
        print(','.join(data))
        sys.stdout.flush()

    def analysis(self):
        """Print the row count, the file size and per-row text lengths."""
        print("%s items, %s bytes." % (len(self.items), os.path.getsize(self.csvfile)))
        print('%6s %32s %32s %32s' % ('no', 'uuid', 'length(after strip)', 'length(before strip)'))
        for index, item in enumerate(self.items):
            print('%6s %32s %32s %32s' % (index, item[0], len(strip_tags(item[1])), len(item[1])))
def _diff_worker(csv, args):
    """Pool task: run one slice of the pairwise comparison on *csv*."""
    csv.diff(*args)


def _adopt_queue(shared):
    """Pool initializer: make this worker report into the parent's queue."""
    global wait_set
    wait_set = shared


def _collect_reported(found):
    """Add every id queued on wait_set so far to the set *found*."""
    # Queue our own end marker and read up to it, so this works without
    # relying on qsize() and never blocks on an empty queue.
    wait_set.put(None)
    for value in iter(wait_set.get, None):
        found.add(value)


def main():
    """Command-line entry point.

    usage: script csvfile [threshold | -a] [threads] [strip] [truncate]

    csvfile   -- CSV file whose rows are (id, text).
    threshold -- similarity ratio from which two rows count as similar
                 (default 0.8); pass ``-a`` instead to print statistics
                 about the file and exit.
    threads   -- number of slices compared in parallel (default 4); with
                 1 or less the comparison runs in this process.
    strip     -- 1 to remove HTML tags before comparing, 0 to keep them
                 (default 1).
    truncate  -- compare only this many leading characters (default 256).

    In diff mode the surviving ids are printed and returned as a list; in
    analysis mode nothing is returned.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s csvfile [threshold|-a] [threads] [strip] [truncate]'
                 % sys.argv[0])
    csvfile = sys.argv[1]
    threshold = 0.8
    threads = 4
    strip = True
    truncate = 256
    action = 'diff'
    if len(sys.argv) > 2:
        if sys.argv[2] == '-a':
            action = 'analysis'
        else:
            threshold = float(sys.argv[2])
    if len(sys.argv) > 3:
        threads = int(sys.argv[3])
    if len(sys.argv) > 4:
        strip = bool(int(sys.argv[4]))
    if len(sys.argv) > 5:
        truncate = int(sys.argv[5])

    c = csv(csvfile)
    if action == 'analysis':
        c.analysis()
        return None
    if action != 'diff':
        return None

    similar = set()
    if threads > 1:
        import multiprocessing
        import time

        total = len(c.items)
        batch = total // threads  # integer division: used as slice bounds
        tail = total % threads
        # Hand the queue to the workers explicitly so they share it under
        # every multiprocessing start method, not only fork.
        pool = multiprocessing.Pool(initializer=_adopt_queue,
                                    initargs=(wait_set,))
        pending = []
        for i in range(threads):
            start = batch * i
            end = start + batch
            if i == threads - 1:
                # The last slice also takes the rows left over by the division.
                end += tail
            args = [threshold, strip, truncate, start, end]
            pending.append(pool.apply_async(_diff_worker, (c, args)))
        pool.close()
        # Keep emptying the queue while the workers run: a worker cannot
        # exit until everything it queued has been written to the pipe, so
        # joining first could block forever on a large result.
        while multiprocessing.active_children():
            while not wait_set.empty():
                similar.add(wait_set.get())
            time.sleep(0.05)
        pool.join()
        for result in pending:
            result.get()  # re-raise any exception from a worker
    else:
        c.diff(threshold, strip, truncate)
    _collect_reported(similar)

    # Row 0 is left out of the result.
    # NOTE(review): presumably it is a header row -- confirm.
    wait_list = [row[0] for row in c.items[1:]]
    if similar:
        # Keep one representative (the first in file order) of the similar
        # ids and drop the rest.
        # NOTE(review): this treats all similar ids as a single group, so
        # unrelated groups of near-duplicates collapse into one survivor.
        keeper = next((x for x in wait_list if x in similar), None)
        wait_list = [x for x in wait_list if x == keeper or x not in similar]
    print(wait_list)
    sys.stdout.flush()
    return wait_list
# Run only when executed as a script: multiprocessing workers may re-import
# this module and must not start the program again.
if __name__ == '__main__':
    main()
標題相似度演算法 一個簡單的計算文章相似度功能!
在做文章系統的時候,很多時候需要為這篇文章推薦最相近的文章。通過這樣進行匹配,查詢出來的結果是包含 茶 和 功效 的所有文章。在顯示上,第一篇排在第二篇的上面。那麼,如何做到最匹配的文章呢?words 茶,功效 otitle 用靈芝泡茶的功效 otitletwo 泡茶的功效與作用 echo strp...
js讓一個物件陣列去重
一個陣列中的每一項都是物件,現在需要將這個陣列中id相同的物件只保留一個,即根據id去重,通過以上方法可實現 var person var obj person person.reduce cur,next 設定cur預設型別為陣列,並且初始值為空的陣列 console.log person 同上的情況...
根據一個已存在的表,去建立一個相同的表
利用已存在的customer表來建立 customer2表 oracle create table customer2 as select from customer mssql select into customer3 from customer 本人親測可以用 mysql 同oracle cr...