import csv

import requests
# Import lxml so the page can be queried with XPath
from lxml import etree
def _csv_cell(value):
    """Flatten a list/tuple value into one ``/``-separated CSV cell."""
    if isinstance(value, (list, tuple)):
        return "/".join(str(item) for item in value)
    return value


def douban_movies(m_type, nums, key_word=None):
    """Fetch one Douban category chart and append its entries to a CSV file.

    Args:
        m_type: Query-string fragment selecting the category, as returned by
            ``get_type`` (e.g. ``type=11``).
        nums: Number of entries to request (sent as ``limit``); str or int.
        key_word: Category name used as the output file-name prefix.  When
            omitted, the module-level ``key_word`` set by the script entry
            point is used, falling back to ``m_type`` if it is not defined.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if a returned record lacks one of the expected fields.
    """
    if key_word is None:
        # Backward compatibility: the file name used to be built from a
        # global ``key_word``; keep honouring it for two-argument callers.
        key_word = globals().get("key_word", m_type)

    # NOTE(review): the endpoint prefix was missing from the source this was
    # recovered from; reconstructed from the query-string fragments -- confirm.
    url = (
        "https://movie.douban.com/j/chart/top_list?" + str(m_type)
        + "&interval_id=100%3a90&action=&start=0&limit=" + str(nums)
    )
    # NOTE(review): the original header values were also missing; a generic
    # browser User-Agent is presumably what the site expects -- confirm.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        )
    }
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    datas = response.json()

    columns = ("image", "types", "regions", "title", "url",
               "release_date", "score", "actors")
    # Output column -> key in the JSON record; only the cover image differs.
    json_key = {"image": "cover_url"}

    # Build every row before touching the file so a malformed record cannot
    # leave a half-written file behind.
    rows = [
        {col: _csv_cell(data[json_key.get(col, col)]) for col in columns}
        for data in datas
    ]
    if not rows:
        return

    path = './' + key_word + '豆瓣電影分類排行榜爬取.csv'
    # newline='' is required by the csv module to avoid blank lines on Windows.
    with open(path, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        f.seek(0, 2)  # move to end of file; position 0 there means it is empty
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(rows)
def get_type():
    """Scrape the Douban chart page and map category names to query fragments.

    Returns:
        dict: ``{category name: fragment}`` where the fragment is the second
        ``&``-separated piece of the category link's href (e.g. ``type=11``).

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # NOTE(review): the URL was missing from the source this was recovered
    # from; the XPath below matches the chart landing page -- confirm.
    url = "https://movie.douban.com/chart"
    # NOTE(review): the original header values were also missing; a generic
    # browser User-Agent is presumably what the site expects -- confirm.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        )
    }
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()

    # lxml's HTML parser entry point is ``etree.HTML`` (upper case).
    douban_html = etree.HTML(response.text)
    spans = douban_html.xpath(
        "//div[@class='article']/div[2]/div[@class='types']/span"
    )

    movies_type = {}
    for span in spans:
        hrefs = span.xpath(".//@href")
        anchors = span.xpath(".//a")
        if not hrefs or not anchors:
            continue  # span without a link: nothing to record
        parts = hrefs[0].split('&')
        if len(parts) < 2:
            continue  # href lacks the expected second query fragment
        movies_type[anchors[0].text] = parts[1]

    return movies_type
if __name__ == '__main__':
    # Analysis of the ranking page:
    #   - the page shows 20 entries at a time
    #   - the JSON request URLs look like:
    #       type=11&interval_id=100%3a90&action=&start=0&limit=20
    #       type=11&interval_id=100%3a90&action=&start=20&limit=20
    #       type=11&interval_id=100%3a90&action=&start=40&limit=20
    #   - start: offset of the first entry
    #   - limit: number of entries shown
    #   - type:  genre / category
    #
    # ``key_word`` is deliberately a module-level name: douban_movies uses the
    # global ``key_word`` when building the output file name.
    key_word = input('請輸入查詢分類排行榜》').strip()
    nums = input('請輸入查詢資料數量》').strip()

    # Fetch the available categories.
    movies_type = get_type()
    if key_word in movies_type:
        # Run the crawl for the chosen category.
        douban_movies(movies_type[key_word], nums)
    else:
        print('輸入電影分類不存在!!!')
豆瓣電影排行榜獲取
倉庫:開啟豆瓣的電影排行榜,隨機找一個分類,隨後對頁面進行分析。經過查找,找到了資訊請求的鏈結,返回的是一個 JSON 資料。根據請求鏈結,我們發現 url 所帶的引數有四個:type 為分類;start 為獲得元素的起始點,相當於頁數;limit 為限制,相當於獲取元素的終點;interval_id、action 意義不...
爬取豆瓣電影推薦排行榜
import requests from bs4 import beautifulsoup class dianying def html url self,url html requests.get url soup beautifulsoup html.text,lxml pai soup.se...
豆瓣讀書 豆瓣電影
1.獲取豆瓣讀書頁資訊,為 如下 coding utf 8 import requests from lxml import etree 1.獲取豆瓣讀書網頁內容 headers url response requests.get url,headers headers text response....