import re,json
import requests
from urllib import request
import os
def b(url):
    """Download an article page and look for its embedded gallery payload.

    The page at ``url`` is fetched, its HTML is searched with the
    ``gallery: json.parse(...)`` pattern, and the ``load`` directory is
    created in the current working directory if it is missing.

    Args:
        url: Address of the article page to download.

    Returns:
        None. A message is printed when the pattern is not found.
    """
    # NOTE(review): the value assigned here is missing from the source as
    # received. An empty mapping lets requests fall back to its default
    # headers; presumably a browser-like User-Agent belongs here -- confirm.
    headers = {}
    # A timeout keeps one unresponsive server from blocking the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    html_str = response.text

    # NOTE(review): JavaScript spells its built-in ``JSON.parse``; confirm
    # that the lower-case spelling below really matches the target pages.
    pattern = r'gallery: json\.parse\((.*)\),'
    match_res = re.search(pattern, html_str)

    # Create the output folder. exist_ok avoids the check-then-create race
    # of testing os.path.exists() before calling os.mkdir().
    os.makedirs('load', exist_ok=True)

    if not match_res:
        # NOTE(review): indentation was lost in the source; the wording of
        # this message indicates it belongs to the "no match" path.
        print('你寫錯了, 不應該來我這')
        return

    # group(1) is already a str.
    json_origin = match_res.group(1)
    # NOTE(review): the original comment here mentions a first ``loads`` pass
    # that returns a str, but the decoding steps that would consume
    # ``json_origin`` are not present in the source as received.
def a(offset):
    """Walk the paged JSON listing and pass every article URL to ``b``.

    Starting at ``offset``, pages are requested 20 entries apart for as long
    as ``offset / 20 < 4``. For each entry of the response's ``data`` list
    that carries an ``article_url`` key, the URL is printed and handed to
    ``b``. If ``offset`` is already past the limit, nothing is requested.

    Args:
        offset: Result offset of the first page to request.

    Returns:
        None.
    """
    # NOTE(review): the URL template is empty in the source as received and
    # must be restored before this can run; it needs a ``{}`` placeholder
    # that receives the offset.
    url = ''

    # Checking the page limit before fetching (rather than after, as the
    # recursive original did) avoids one request whose result was discarded.
    while offset / 20 < 4:
        a_url = url.format(offset)
        # A timeout keeps one unresponsive server from blocking the crawl.
        response = requests.get(a_url, timeout=10)
        # response.json() returns the decoded object (a dict) directly.
        html_json_dict = response.json()
        # The list stored under the "data" key; a missing or null value is
        # treated as an empty page instead of crashing the loop below.
        data_list = html_json_dict.get('data') or []

        # Only entries that carry an article_url are followed.
        for data_item in data_list:
            if 'article_url' in data_item:
                article_url = data_item['article_url']
                print(article_url)
                b(article_url)

        offset += 20
# Script entry point: start the crawl at offset 0.
if __name__ == '__main__':
    a(0)
爬取今日頭條Ajax請求
搜尋頭條可以得到這個網址:search?keyword=%E8%A1%97%E6%8B%8D。用開發者工具檢視,我們在搜尋中並沒有發現上面的文字,那麼我們可以初步判定,這個是由 Ajax 載入,然後渲染出來的。此時切換到 XHR 過濾,可以看到確實是 Ajax 請求。觀察...
今日頭條站長平台 頭條搜尋爬蟲spider介紹
頭條搜尋的爬蟲 UA 為 Bytespider,首字母為大寫。例如:Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0....
python2 spider 今日頭條
requests json 直接上 usr bin python coding utf 8 import requests import json url wbdata requests.get url text data json.loads wbdata news data data pc fe...