from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    # The base URL was stripped from the original post; "http://en.wikipedia.org"
    # is recovered from the /wiki/... links in the output below.
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    # Find links whose href begins with /wiki/
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # A page we have not encountered yet
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks("")
Output:
/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_permissions
/wiki/Wikipedia:User_access_levels
/wiki/Wikipedia:Requests_for_adminship
/wiki/Wikipedia:Protection_policy#extended
/wiki/Wikipedia:Lists_of_protected_pages
/wiki/Wikipedia:Protection_policy
/wiki/Wikipedia:Perennial_proposals
/wiki/Wikipedia:Project_namespace#How-to_and_information_pages
/wiki/Wikipedia:Protection_policy#move
/wiki/Wikipedia:WPPP
/wiki/File:People_icon.svg
/wiki/Special:WhatLinksHere/File:People_icon.svg
/wiki/Help:What_links_here
/wiki/Wikipedia:Policies_and_guidelines
/wiki/Wikipedia:Shortcut
/wiki/Wikipedia:Keyboard_shortcuts
/wiki/Wikipedia:WikiProject_Kansas
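
The recursive call above is the simplest way to express the crawl, but Python caps recursion at roughly 1,000 frames, so on a site as densely linked as Wikipedia it will eventually raise RecursionError. Below is a minimal iterative sketch of the same traversal; the explicit deque queue and the maxPages cap are my additions, not part of the original example.

from collections import deque
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

def crawl(startUrl, maxPages=50):
    # Breadth-first traversal with an explicit queue instead of recursion
    pages = set()
    queue = deque([startUrl])
    while queue and len(pages) < maxPages:
        pageUrl = queue.popleft()
        html = urlopen("http://en.wikipedia.org" + pageUrl)
        bsObj = BeautifulSoup(html, "html.parser")
        for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
            href = link.attrs.get('href')
            if href is not None and href not in pages:
                print(href)
                pages.add(href)
                queue.append(href)

crawl("")

Breadth-first order also discovers pages closest to the seed page first, which is usually preferable when sampling a large site.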
import requests

session = requests.Session()
# The login parameters and URLs were stripped from the original post.
# The username "tomatosir" is recovered from the output below; the URLs and
# the placeholder password are the demo values from Web Scraping with Python.
params = {'username': 'tomatosir', 'password': 'password'}
s = session.post("http://pythonscraping.com/pages/cookies/welcome.php", params)
print("Cookie is set to:")
print(s.cookies.get_dict())
print('------------------')
print("Going to profile page...")
s = session.get("http://pythonscraping.com/pages/cookies/profile.php")
print(s.text)
Output:
Cookie is set to:
------------------
Going to profile page...
Hey tomatosir! Looks like you're still logged into the site!
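
The Session object is what carries the login cookie from the POST over to the later GET; a plain requests.get call starts with an empty cookie jar, so the profile page would treat it as anonymous. A small sketch of the difference, assuming the same demo endpoints as above:

import requests

# A bare GET sends no cookies, so the demo site sees an anonymous visitor.
r = requests.post("http://pythonscraping.com/pages/cookies/welcome.php",
                  {'username': 'tomatosir', 'password': 'password'})
anonymous = requests.get("http://pythonscraping.com/pages/cookies/profile.php")
print(anonymous.text)  # not logged in: the login cookies were never sent

# A Session replays stored cookies automatically, and it can also be seeded
# by hand from an earlier response's cookie jar.
session = requests.Session()
session.cookies.update(r.cookies)
print(session.get("http://pythonscraping.com/pages/cookies/profile.php").text)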
from urllib.request import urlopen
from random import randint

def wordListSum(wordList):
    total = 0
    for word, value in wordList.items():
        total += value
    return total

# Pick a random word, weighted by how often it follows the previous word
def retrieveRandomWord(wordList):
    randIndex = randint(1, wordListSum(wordList))
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word

def buildWordDict(text):
    # Replace newlines with spaces (so words on adjacent lines stay separate)
    # and strip quotation marks
    text = text.replace("\n", " ").replace("\"", "")
    # Pad punctuation with spaces so it survives the split as its own token
    punctuation = [',', '.', ';', ':']
    for symbol in punctuation:
        text = text.replace(symbol, " " + symbol + " ")
    words = text.split(" ")
    # Filter out empty strings left over from the padding
    words = [word for word in words if word != ""]
    # Count every 2-gram: wordDict[first][second] = number of occurrences
    wordDict = {}
    for i in range(1, len(words)):
        if words[i-1] not in wordDict:
            wordDict[words[i-1]] = {}
        if words[i] not in wordDict[words[i-1]]:
            wordDict[words[i-1]][words[i]] = 0
        wordDict[words[i-1]][words[i]] += 1
    return wordDict
# The URL was stripped from the original post; this is the sample speech text
# used by Web Scraping with Python for this example.
text = str(urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt").read(), 'utf-8')
wordDict = buildWordDict(text)

# Generate a 100-word Markov chain, starting from the word "I"
length = 100
chain = ""
currentWord = "I"
for i in range(0, length):
    chain += currentWord + " "
    currentWord = retrieveRandomWord(wordDict[currentWord])
print(chain)
Output:
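
The generated chain itself is missing from the post, but the dictionary that drives it is easy to see on a toy input. A quick sketch (the sample sentence is made up) of what buildWordDict returns: each word maps to the words that follow it, with counts, and retrieveRandomWord samples from those counts.

# Assumes buildWordDict and retrieveRandomWord from above are in scope
sample = "I am what I am."
wordDict = buildWordDict(sample)
print(wordDict)
# {'I': {'am': 2}, 'am': {'what': 1, '.': 1}, 'what': {'I': 1}}

# After "am", the chain continues with "what" or "." with equal probability,
# since each follows "am" exactly once in the sample.
print(retrieveRandomWord(wordDict['am']))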