from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    # The base URL was stripped from the original post; "http://en.wikipedia.org"
    # is recovered from the /wiki/... links in the output below.
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    # Find links whose href begins with /wiki/
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # A page we have not encountered yet
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks("")
Output:
/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_permissions
/wiki/Wikipedia:User_access_levels
/wiki/Wikipedia:Requests_for_adminship
/wiki/Wikipedia:Protection_policy#extended
/wiki/Wikipedia:Lists_of_protected_pages
/wiki/Wikipedia:Protection_policy
/wiki/Wikipedia:Perennial_proposals
/wiki/Wikipedia:Project_namespace#How-to_and_information_pages
/wiki/Wikipedia:Protection_policy#move
/wiki/Wikipedia:WPPP
/wiki/File:People_icon.svg
/wiki/Special:WhatLinksHere/File:People_icon.svg
/wiki/Help:What_links_here
/wiki/Wikipedia:Policies_and_guidelines
/wiki/Wikipedia:Shortcut
/wiki/Wikipedia:Keyboard_shortcuts
/wiki/Wikipedia:WikiProject_Kansas
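
The recursive call above is the simplest way to express the crawl, but Python caps recursion at roughly 1,000 frames, so on a site as densely linked as Wikipedia it will eventually raise RecursionError. Below is a minimal iterative sketch of the same traversal; the explicit deque queue and the maxPages cap are my additions, not part of the original example.

from collections import deque
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

def crawl(startUrl, maxPages=50):
    # Breadth-first traversal with an explicit queue instead of recursion
    pages = set()
    queue = deque([startUrl])
    while queue and len(pages) < maxPages:
        pageUrl = queue.popleft()
        html = urlopen("http://en.wikipedia.org" + pageUrl)
        bsObj = BeautifulSoup(html, "html.parser")
        for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
            href = link.attrs.get('href')
            if href is not None and href not in pages:
                print(href)
                pages.add(href)
                queue.append(href)

crawl("")

Breadth-first order also discovers pages closest to the seed page first, which is usually preferable when sampling a large site.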
import requests

session = requests.Session()
# The login parameters and URLs were stripped from the original post.
# The username "tomatosir" is recovered from the output below; the URLs and
# the placeholder password are the demo values from Web Scraping with Python.
params = {'username': 'tomatosir', 'password': 'password'}
s = session.post("http://pythonscraping.com/pages/cookies/welcome.php", params)
print("Cookie is set to:")
print(s.cookies.get_dict())
print('------------------')
print("Going to profile page...")
s = session.get("http://pythonscraping.com/pages/cookies/profile.php")
print(s.text)
Output:
Cookie is set to:
------------------
Going to profile page...
Hey tomatosir! Looks like you're still logged into the site!
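
The Session object is what carries the login cookie from the POST over to the later GET; a plain requests.get call starts with an empty cookie jar, so the profile page would treat it as anonymous. A small sketch of the difference, assuming the same demo endpoints as above:

import requests

# A bare GET sends no cookies, so the demo site sees an anonymous visitor.
r = requests.post("http://pythonscraping.com/pages/cookies/welcome.php",
                  {'username': 'tomatosir', 'password': 'password'})
anonymous = requests.get("http://pythonscraping.com/pages/cookies/profile.php")
print(anonymous.text)  # not logged in: the login cookies were never sent

# A Session replays stored cookies automatically, and it can also be seeded
# by hand from an earlier response's cookie jar.
session = requests.Session()
session.cookies.update(r.cookies)
print(session.get("http://pythonscraping.com/pages/cookies/profile.php").text)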
from urllib.request import urlopen
from random import randint

def wordListSum(wordList):
    total = 0
    for word, value in wordList.items():
        total += value
    return total

# Pick a random word, weighted by how often it follows the previous word
def retrieveRandomWord(wordList):
    randIndex = randint(1, wordListSum(wordList))
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word

def buildWordDict(text):
    # Replace newlines with spaces (so words on adjacent lines stay separate)
    # and strip quotation marks
    text = text.replace("\n", " ").replace("\"", "")
    # Pad punctuation with spaces so it survives the split as its own token
    punctuation = [',', '.', ';', ':']
    for symbol in punctuation:
        text = text.replace(symbol, " " + symbol + " ")
    words = text.split(" ")
    # Filter out empty strings left over from the padding
    words = [word for word in words if word != ""]
    # Count every 2-gram: wordDict[first][second] = number of occurrences
    wordDict = {}
    for i in range(1, len(words)):
        if words[i-1] not in wordDict:
            wordDict[words[i-1]] = {}
        if words[i] not in wordDict[words[i-1]]:
            wordDict[words[i-1]][words[i]] = 0
        wordDict[words[i-1]][words[i]] += 1
    return wordDict
# The URL was stripped from the original post; this is the sample speech text
# used by Web Scraping with Python for this example.
text = str(urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt").read(), 'utf-8')
wordDict = buildWordDict(text)

# Generate a 100-word Markov chain, starting from the word "I"
length = 100
chain = ""
currentWord = "I"
for i in range(0, length):
    chain += currentWord + " "
    currentWord = retrieveRandomWord(wordDict[currentWord])
print(chain)
Output:
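
The generated chain itself is missing from the post, but the dictionary that drives it is easy to see on a toy input. A quick sketch (the sample sentence is made up) of what buildWordDict returns: each word maps to the words that follow it, with counts, and retrieveRandomWord samples from those counts.

# Assumes buildWordDict and retrieveRandomWord from above are in scope
sample = "I am what I am."
wordDict = buildWordDict(sample)
print(wordDict)
# {'I': {'am': 2}, 'am': {'what': 1, '.': 1}, 'what': {'I': 1}}

# After "am", the chain continues with "what" or "." with equal probability,
# since each follows "am" exactly once in the sample.
print(retrieveRandomWord(wordDict['am']))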