豆瓣電影資訊查詢

2022-07-28 20:51:17 字數 4374 閱讀 2857

電影資訊來源為豆瓣網,搜尋頁面位址為 https://search.douban.com/movie/subject_search?search_text=<電影名稱>。由於該頁面是通過 JS 渲染的,直接通過 requests 請求拿不到電影查詢結果,所以我先用 selenium 獲取該頁面的查詢結果,提取出前 10 條電影名和對應的詳情頁面 URL,然後再用 requests 請求所選電影的詳情頁面以獲取電影資訊。

在運用selenium的時候需要開啟無頭模式,具體操作如下:

# Run Chrome in headless mode so no browser window opens while scraping.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# executable_path must point at the real location of chromedriver.exe.
# NOTE(review): recent Selenium 4 releases replace `executable_path` with a
# Service object -- confirm against the installed Selenium version.
browser = webdriver.Chrome(options=chrome_options, executable_path='chromedriver.exe')

最終效果如下圖所示:

完整程式碼如下:

import re
from urllib.parse import quote

import bs4
import jieba
import requests
import wordcloud
from lxml.html import etree
from matplotlib import pyplot
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

class movie():
    """Search Douban for a film, print its details and draw a review word cloud.

    The search page is rendered by JavaScript, so the result list is read
    through a headless Chrome driven by selenium; the detail and review pages
    are fetched with plain ``requests`` calls.
    """

    # NOTE(review): the search URL, the leading part of the user-agent string
    # and the two ``attrs`` selectors in get_movie_info were stripped from the
    # published listing; the values used here are assumptions to be confirmed
    # against the live site.
    SEARCH_URL = 'https://search.douban.com/movie/subject_search?search_text='
    USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/78.0.3904.108 Safari/537.36')
    MAX_RESULTS = 10        # number of search hits offered to the user
    REVIEW_PAGES = 5        # number of review pages merged into the word cloud
    REVIEWS_PER_PAGE = 20   # presumably Douban's review page size -- confirm
    TIMEOUT = 10            # seconds, used for both selenium waits and HTTP calls

    def __init__(self, name):
        """Prepare the search URL and start a headless Chrome session.

        Args:
            name: Film title typed by the user; it is URL-encoded before
                being appended to the search URL.
        """
        self.url = f'{self.SEARCH_URL}{quote(name)}'
        # Passed to Chrome as a command-line switch, not as an HTTP header dict.
        self.headers = f'--user-agent={self.USER_AGENT}'
        self.chrome_options = webdriver.ChromeOptions()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.chrome_options.add_argument(self.headers)
        # NOTE(review): recent Selenium 4 releases replace `executable_path`
        # with a Service object -- confirm against the installed version.
        self.browser = webdriver.Chrome(options=self.chrome_options,
                                        executable_path='chromedriver.exe')
        self.wait = WebDriverWait(self.browser, self.TIMEOUT)

    def get_search(self):
        """Fetch the search results so the user can pick one.

        Prints a numbered menu of at most ``MAX_RESULTS`` titles.

        Returns:
            A list of ``(title, detail_url)`` tuples; empty when the page
            shows no result before the wait times out. The browser is shut
            down in every case, so this can be called once per instance.
        """
        movies = []
        try:
            self.browser.get(self.url)
            try:
                response = self.wait.until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.title > a')))
            except TimeoutException:
                # wait.until raises instead of returning an empty list.
                response = []
            if not response:
                print("沒有搜到您要的資訊,請重新輸入")
                return movies
            print('請選擇:')
            # Slice instead of range(10): there may be fewer than ten hits.
            for i, element in enumerate(response[:self.MAX_RESULTS]):
                name = element.text
                url = element.get_attribute('href')
                movies.append((name, url))
                print(f'{i}.{name}')
            return movies
        finally:
            # quit() also stops the chromedriver process, on every path.
            self.browser.quit()

    def get_movie_info(self, movie):
        """Print the basic information found on a film's detail page.

        Also collects review text and renders it as a word cloud.

        Args:
            movie: ``(title, detail_url)`` pair as returned by get_search.

        Returns:
            Always None; network errors, non-200 responses and pages
            without the expected nodes end the call early.
        """
        name = movie[0]
        url = movie[1]
        headers = {'User-Agent': self.USER_AGENT}
        try:
            resp = requests.get(url, headers=headers, timeout=self.TIMEOUT)
        except requests.exceptions.RequestException:
            return None
        if resp.status_code != 200:
            return None

        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        info = soup.find(name='div', attrs={'id': 'info'})
        rating = soup.find(name='div', attrs={'id': 'interest_sectl'})
        if info is None or rating is None or rating.strong is None or rating.a is None:
            # Page layout differs from what the selectors expect.
            return None
        rating_num = rating.strong.text
        rating_people = rating.a.text
        print(info.text)
        print(f'評分: {rating_num}')
        print(rating_people)

        text = self.get_reviews(url, headers)
        if text:
            # Skip the word cloud when no review text could be collected.
            self.word_cloud(name, text)
        return None

    @staticmethod
    def get_reviews(url, headers):
        """Collect review snippets from the first few review pages of a film.

        Args:
            url: Detail-page URL of the film.
            headers: HTTP headers dict sent with every request.

        Returns:
            All snippets joined into one string with whitespace and empty
            ``()`` markers removed; may be empty.
        """
        base = url if url.endswith('/') else f'{url}/'
        chunks = []
        for page in range(movie.REVIEW_PAGES):
            # Use a separate name so the base URL is not overwritten per page.
            page_url = f'{base}reviews?start={page * movie.REVIEWS_PER_PAGE}'
            try:
                response = requests.get(page_url, headers=headers, timeout=movie.TIMEOUT)
            except requests.exceptions.RequestException:
                # Best effort: keep whatever was gathered so far.
                break
            if not response.text.strip():
                continue
            html = etree.HTML(response.text)
            if html is None:
                continue
            reviews = html.xpath('//*[@class="short-content"]/text()')
            reviews = ''.join(''.join(reviews).split())
            chunks.append(reviews.replace('()', ''))
        return ''.join(chunks)

    @staticmethod
    def word_cloud(name, word):
        """Generate a word cloud, save it as ``<name>.png`` and display it.

        Args:
            name: Film title; characters not allowed in file names are removed.
            word: Text to segment with jieba and feed to the word cloud.
        """
        name = re.sub(r'[\\/:*?"<>|\r\n。,.?]+', '', name)
        ls = jieba.lcut(word)
        text = ' '.join(ls)
        w = wordcloud.WordCloud(font_path='simkai.ttf', width=800, height=600,
                                background_color='white')
        w.generate(text)
        w.to_file(f'{name}.png')
        pyplot.imshow(w)
        pyplot.axis('off')
        pyplot.show()

def main():
    """Ask for a film title, list the matches and show the chosen film's details."""
    movie_name = input("請輸入電影名稱,即可查詢對應的影片資訊:")
    m = movie(movie_name)
    movies = m.get_search()
    if not movies:
        # Nothing to choose from.
        return
    # Ask again until the answer is a valid index into the result list.
    while True:
        num = input('請輸入序號選擇:')
        try:
            num = int(num)
        except ValueError:
            continue
        if 0 <= num < len(movies):
            break
    m.get_movie_info(movies[num])

# Run the interactive lookup only when executed as a script, not on import.
if __name__ == '__main__':
    main()

爬取豆瓣網電影資訊

coding utf 8 import urllib2 import bs4 from bs4 import beautifulsoup 爬取豆瓣網電影簡介,包括電影名,導演,評分以及介紹等 class dbtop def init self self.usr agent mozilla 5.0 w...

python 抓取https豆瓣電影資訊

1 豆瓣 為https,python需要模擬瀏覽器行為,新增請求頭資訊,2 開啟開發者工具,對資訊進行提取 2.1定位到電影資訊頭,先把關注的資訊提取出來 table re.findall r 顯示全部影片 data,re.s print table firsttable table 0 2.2 提...

豆瓣讀書 豆瓣電影

1.獲取豆瓣讀書頁資訊,為 如下 coding utf 8 import requests from lxml import etree 1.獲取豆瓣讀書網頁內容 headers url response requests.get url,headers headers text response....