# -*- coding: utf-8 -*-
# Characters treated as word separators by mkkey0, mapped to a space so a
# single str.split() can do the actual tokenising.
_MKKEY0_SEPARATORS = "~!@#$%^&*()_+`,.-;'\\:\"/?"
_MKKEY0_TABLE = str.maketrans(_MKKEY0_SEPARATORS, " " * len(_MKKEY0_SEPARATORS))


def mkkey0(s: str):
    """Split *s* into lowercase words, treating punctuation as separators.

    Plain ``s.split()`` only breaks on whitespace, so for the text

        source code: lib/posix'path.py (for posix), lib/ntpath.py (for windows nt)

    it produces tokens such as ``"lib/posix'path.py"`` and ``"(for"``.
    This function instead replaces every separator character with a space
    before splitting, which yields

        ['source', 'code', 'lib', 'posix', 'path', 'py', 'for', 'posix',
         'lib', 'ntpath', 'py', 'for', 'windows', 'nt']

    Args:
        s: The text to tokenise.

    Returns:
        A list of lowercase words (possibly empty).  Because a list is
        returned, ``map(mkkey0, tokens)`` produces a sequence of lists that
        the caller has to iterate a second time:

        >>> [mkkey0(w) for w in ["goggd`e", "qw,sh", "god"]]
        [['goggd', 'e'], ['qw', 'sh'], ['god']]
    """
    return s.lower().translate(_MKKEY0_TABLE).split()
def mkkey1(s: str):
    """Split *s* into lowercase words by scanning for separator characters.

    Unlike the replace-then-split approach, this walks the string once and
    slices out each run of non-separator characters, tracking the start of
    the current run in ``offset``.

    Args:
        s: The text to tokenise.

    Returns:
        A list of lowercase words (possibly empty).

        >>> [mkkey1(w) for w in ["`gog))gd`e", "qw,sh", "god"]]
        [['gog', 'gd', 'e'], ['qw', 'sh'], ['god']]
    """
    separators = frozenset("~!@#$%^&*()_+`,.-;'\\:\"/?")
    key = s.lower()
    pieces = []
    offset = 0
    for i, ch in enumerate(key):
        if ch not in separators:
            continue
        # offset == i means the string starts with a separator or two
        # separators are adjacent: there is no word to emit, just skip it.
        if offset < i:
            pieces.append(key[offset:i])
        offset = i + 1
    # Without this the final word (not followed by a separator) is lost.
    if offset < len(key):
        pieces.append(key[offset:])
    # Whitespace is not in the separator set, so join with a space and let
    # split() break up any pieces that still contain blanks.
    return " ".join(pieces).split()
# Common function words that are passed to countwd() as the ignore set.
iterwd = {
    'the', 'is', 'a', 'of', 'on', 'in',
    'or', 'to', 'and', 'if', 'an', 'for',
}
def countwd(words, iterwords=None):
    """Return the ten most frequent words in a UTF-8 text file.

    Each line is split on whitespace, every token is further broken into
    lowercase words by ``mkkey0``, and words found in *iterwords* are not
    counted.

    Args:
        words: Path of the text file to read.
        iterwords: Container of words to ignore; ``None`` (the default)
            ignores nothing.

    Returns:
        A list of at most ten ``(word, count)`` tuples ordered by
        descending count.

    Raises:
        OSError: If the file cannot be opened.
    """
    ignored = () if iterwords is None else iterwords
    wordict = {}
    # Read-only access is enough; 'r+' would needlessly require write
    # permission on the input file.
    with open(str(words), 'r', encoding='utf-8') as f:
        for line in f:
            for token in line.split():
                for word in mkkey0(token):
                    if word not in ignored:
                        wordict[word] = wordict.get(word, 0) + 1
    # NOTE: building the whole ranking at once is fine for small inputs; for
    # very large result sets a generator that yields entries one at a time
    # puts less pressure on whatever consumes/displays them.
    return sorted(wordict.items(), key=lambda x: x[1], reverse=True)[0:10]
# Raw string: '\s' is not a valid escape sequence in a normal literal.
print(countwd(r'd:\sample.txt', iterwords=iterwd))
# Output:
# [('path', 138), ('os', 49), ('return', 30), ('windows', 25), ('file', 24), ('pathname', 17), ('true', 17), ('drive', 17), ('this', 17), ('unix', 16)]
# -*- coding: utf-8 -*-
def mkkey1(key: str, keychar=frozenset("~!@#$%^&*()_+`\n,. -;'\\:\"/?")):
    """Lazily yield the words of *key*, split on any character in *keychar*.

    The string is scanned once; ``offset`` marks where the current word
    starts, and a word is emitted each time a separator is reached.  Case is
    left untouched - callers lowercase the results themselves if needed.

    Args:
        key: The text to tokenise.
        keychar: Container of separator characters.  The default covers
            common punctuation plus space and newline.

    Yields:
        Each non-empty run of characters between separators, in order.
    """
    offset = 0
    for i, v in enumerate(key):
        if v not in keychar:
            continue
        # offset == i: leading or consecutive separators, nothing to emit.
        if offset < i:
            yield key[offset:i]
        offset = i + 1
    # Emit the trailing word when the text does not end with a separator.
    if offset < len(key):
        yield key[offset:]
def countwd(words, encode, ignorwords):
    """Count how often each word occurs in a text file.

    Every line is tokenised with ``mkkey1``, each word is lowercased, and
    words contained in *ignorwords* are skipped.

    Args:
        words: Path of the text file to read.
        encode: Text encoding used to decode the file.
        ignorwords: Container of lowercase words to leave out of the count.

    Returns:
        A dict mapping each counted word to its number of occurrences.

    Raises:
        OSError: If the file cannot be opened.
    """
    wordict = {}
    # Read-only access is enough; 'r+' would needlessly require write
    # permission on the input file.
    with open(str(words), 'r', encoding=encode) as f:
        for line in f:
            for word in map(str.lower, mkkey1(line)):
                if word not in ignorwords:
                    wordict[word] = wordict.get(word, 0) + 1
    return wordict
# Words (plus a lone space) that are passed to countwd() as the ignore set.
iterwd = {
    'the', 'is', 'a', 'of', 'on', 'in', 'or',
    'to', ' ', 'and', 'if', 'an', 'for',
}
def top(wordict=None, n=10):
    """Yield the *n* highest-count ``(word, count)`` pairs of *wordict*.

    Args:
        wordict: Mapping of word to occurrence count.  When omitted, the
            counts are computed on first iteration by calling ``countwd`` on
            the sample file with the module-level ``iterwd`` ignore set
            (previously this happened once, at function-definition time).
        n: Maximum number of pairs to yield.

    Yields:
        ``(word, count)`` tuples in descending order of count; words with
        equal counts keep the mapping's iteration order.
    """
    if wordict is None:
        # Resolved lazily so that merely defining/importing this function
        # does not read a file.
        wordict = countwd(words=r'd:\sample.txt', encode='utf-8', ignorwords=iterwd)
    if n <= 0:
        return
    ranked = sorted(wordict.items(), key=lambda item: item[1], reverse=True)
    # Slice to exactly n entries (an ``i > n`` cut-off would let n + 1 through).
    yield from ranked[:n]
# Print the most frequent words, one "word count" pair per line.
for key, v in top():
    print(key, v)
python爬蟲入門 豆瓣電影排行榜top250
1.requests 2.re 正規表示式庫 請求頭 此處複製的火狐瀏覽器請求頭 myheader 標記電影次序 time,初始化為1 排行榜第i頁 link str i 25 正則匹配結果 matchobj import requests import re def get movies 請求頭 ...
設計模式Top10排行榜
在工作中,常常使用到設計模式,增強了軟體的靈活性,然而要為它們排排位置,還真是一件難事,因為每個人對設計模式的理解程度,每個人使用的程式語言,個人的習慣,工作性質等等都會影響排行榜。在這裡暫且列出自己心中的排行榜吧 僅限design pattern中提到的設計模式 冠軍寶座 strategy pat...
設計模式Top10排行榜
在工作中,常常使用到設計模式,增強了軟體的靈活性,然而要為它們排排位置,還真是一件難事,因為每個人對設計模式的理解程度,每個人使用的程式語言,個人的習慣,工作性質等等都會影響排行榜。在這裡暫且列出自己心中的排行榜吧 僅限 design pattern 中提到的設計模式 冠軍寶座 strategy p...