#python網路爬蟲與資訊提取
1.requests:自動爬取html頁面自動網路請求提交
2.robots.txt:網路爬蟲排除標準
3.projects實戰
3.1.小規模:(requests)爬取網頁,玩轉網頁
3.3.大規模(定製開發):爬取全網
#requests入門
#通用的爬蟲框架
# Generic crawler skeleton: fetch `url`, verify the HTTP status, and print
# the first 1000 characters of the page.
# NOTE(review): the try-body indentation was lost during extraction and has
# been restored; the bare `except:` was narrowed to requests.RequestException
# so programming errors are not silently swallowed.
import requests

url = ""
try:
    r = requests.get(url, timeout=30)
    r.raise_for_status()  # raise for 4xx/5xx instead of printing an error page
    print(r.text[:1000])
except requests.RequestException:
    print("爬取失敗!")
# Demo 1: fetch an Amazon product page. The first attempt uses the default
# python-requests User-Agent, which Amazon rejects; the second attempt sends
# a browser User-Agent and gets the real page.
import requests

r = requests.get("")
print(r.status_code)
print(r.encoding)
# The original printed r.encoding twice; the course demo inspects the
# guessed encoding here, which is the more informative second line.
print(r.apparent_encoding)
print(r.text)
print(r.request.headers)  # shows the default 'python-requests/...' UA

# Restored header dict (extraction stripped the literal after `kv =`):
# a minimal browser UA is enough for the site to serve the page.
kv = {'user-agent': 'Mozilla/5.0'}
url = ""
r = requests.get(url, headers=kv)
print(r.status_code)
print(r.request.headers)  # now shows the spoofed UA
print(r.text[:1000])
# Demo 1b: the same Amazon fetch wrapped in the standard error-handling frame.
# Restored the stripped header dict, the lost try-body indentation, and
# narrowed the bare `except:` to requests.RequestException.
import requests

url = "" \
      "9%a9%ac%e9%80%8a%e7%bd%91%e7%ab%99&keywords=deep+learning&qid=1557108835&s=gateway&sr=8-1"
try:
    kv = {'user-agent': 'Mozilla/5.0'}  # browser UA so Amazon serves the page
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()
    print(r.text[1000:2000])
except requests.RequestException:
    print("爬取失敗!")
# Demo 2: keyword search -- pass the keyword through the `params` argument,
# which requests URL-encodes into the query string (visible via r.request.url).
# NOTE(review): a stray unmatched ''' preceded this demo (extraction
# artifact) and has been removed so the file parses; the stripped dict
# after `kv =` is restored with Baidu's keyword parameter `wd`.
import requests

kv = {'wd': 'python'}
r = requests.get("", params=kv)
print(r.status_code)
print(r.request.url)  # shows the encoded query string appended to the URL
print(len(r.text))
# Demo 2b: 360 search -- same `params` technique inside the standard
# error-handling frame. 360 uses `q` as its keyword parameter (restored;
# the dict literal after `kv =` was stripped during extraction).
import requests

keyword = "python"
try:
    kv = {'q': keyword}
    r = requests.get("", params=kv, timeout=30)
    print(r.request.url)  # printed before raise_for_status so a failed URL is still visible
    r.raise_for_status()
    print(len(r.text))
except requests.RequestException:
    print("爬取失敗!")
# Demo 4: download a binary resource (an image) and save it to disk,
# creating the target directory on first use and skipping existing files.
# NOTE(review): the dangling `import requests` / `f.close()` pair that
# preceded this demo referenced an `f` that never existed (extraction
# artifact) and has been dropped.
import requests
import os

url = ""
root = "e:\pictures"
# os.path.join inserts the separator that the original string concatenation
# (`root + filename`) silently omitted.
path = os.path.join(root, url.split('/')[-1])
try:
    if not os.path.exists(root):
        os.mkdir(root)
    if not os.path.exists(path):
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # `with` closes the file automatically -- the original's explicit
        # f.close() inside the with-block was redundant and is removed.
        with open(path, 'wb') as f:
            f.write(r.content)  # .content is the raw bytes; .text would corrupt the image
        print("檔案儲存成功")
    else:
        print("檔案已經存在")
except (requests.RequestException, OSError):
    # network failure or filesystem failure (mkdir/open) -- both are the
    # "crawl failed" outcome this demo reports
    print("爬取失敗")
# Demo 5: IP-address geolocation lookup -- the service takes the IP appended
# directly to the query URL; the answer is at the end of the returned page.
import requests
url = ""
r = requests.get(url+"202.204.80.112")
print(r.status_code)
# The lookup result sits near the bottom of the HTML, so print the tail.
print(r.text[-500:])
# Demo 5b: the same IP lookup wrapped in the standard error-handling frame.
# Restored the lost try-body indentation and narrowed the bare `except:`
# to requests.RequestException.
import requests

url = ""
try:
    r = requests.get(url + '202.204.80.112', timeout=30)
    r.raise_for_status()
    print(r.text[-500:])  # the lookup result is near the bottom of the page
except requests.RequestException:
    print("爬取失敗")
五個簡單的 Requests 庫爬蟲例項
本文是中國大學mooc上的python網路爬蟲與資訊提取 課程中的筆記,是五個基本的爬蟲操作,個人覺得其中的方法比較常用,因此記錄下來了。import requests url try r requests.get url r.raise for status print r.text 1000 e...
爬蟲(3) Requests庫的5個小例項
1 獲取京東某個商品的資訊 import requests url try r requests.get url r.raise for status print r.text 1000 except print 爬取失敗!2 獲取亞馬遜某個商品的資訊 import requests url r r...
記一個遇到的requests庫編碼的小問題
這裡其實是存在2個問題的 requests庫的content與text 這個是開課8老師提過的 content返回型別是str text返回型別是unicode,所以需要指定一下編碼形式,不然容易顯示亂碼 還說了啥忘記了,反正他的結論是建議用content.decode 關於這個decode,mac...