python爬蟲3(電影天堂)

2021-09-18 07:36:39 字數 3100 閱讀 6642

from lxml import etree

import requests

# Origin (scheme + host) prepended to the site-relative links found on listing pages.
# NOTE(review): empty in this listing — set it to the target site's root URL
# before running, otherwise the request URLs have no scheme or host.
base_domain = ''

# HTTP headers sent with every request made by this module.
# NOTE(review): the original value is missing from this listing; a minimal
# browser-like User-Agent is assumed here — replace it with the real one.
headers = {
    "User-Agent": "Mozilla/5.0",
}

def get_detal_urls(url):
    """Fetch one listing page and return the detail-page URLs it links to.

    Args:
        url: URL of a listing (index) page.

    Returns:
        A list with one entry per ``href`` found inside ``<table class="tbspan">``,
        each prefixed with the module-level ``base_domain``.
    """
    response = requests.get(url, headers=headers)
    # lxml's HTML parser entry point is ``etree.HTML``; ``etree.html`` does not exist.
    html = etree.HTML(response.text)
    # "//" matches the tag at any depth; "@href" selects the attribute value.
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # Return a concrete list (not a one-shot ``map`` iterator) so callers can
    # iterate it more than once or take its length.
    return [base_domain + href for href in hrefs]

def _first(items, default=None):
    """Return the first element of *items*, or *default* when it is empty."""
    return items[0] if items else default


def _collect_until_marker(infos, start):
    """Return stripped, non-empty text nodes from *start* up to the next "◎" field."""
    collected = []
    for raw in infos[start:]:
        line = raw.strip()
        if line.startswith("◎"):
            break
        if line:
            collected.append(line)
    return collected


def parge_detail_page(url):
    """Download a movie detail page and extract its fields into a dict.

    Args:
        url: URL of the movie detail page.

    Returns:
        A dict that always contains ``title``, ``cover``, ``screenshot`` and
        ``download_url`` (``None`` when the page lacks the element), plus
        whichever of ``info`` (the "◎年  代" field), ``contry``, ``category``,
        ``actors`` (list of str) and ``profile`` (str) are present in the
        page text.
    """
    movie = {}
    response = requests.get(url, headers=headers)
    text = response.content.decode('gbk')
    # lxml's HTML parser entry point is ``etree.HTML``; ``etree.html`` does not exist.
    html = etree.HTML(text)

    movie['title'] = _first(
        html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")
    )

    # Compare with ``is not None``: lxml element truthiness reflects child count.
    zoome = _first(html.xpath("//div[@id='zoom']"))
    imgs = zoome.xpath(".//img/@src") if zoome is not None else []
    infos = zoome.xpath(".//text()") if zoome is not None else []

    # First image is the cover, second the screenshot; either may be absent.
    movie['screenshot'] = imgs[1] if len(imgs) > 1 else None
    movie['cover'] = _first(imgs)

    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):
            movie['info'] = info.replace("◎年  代", "").strip()
        elif info.startswith("◎產  地"):
            movie['contry'] = info.replace("◎產  地", "").strip()
        elif info.startswith("◎類  別"):
            movie['category'] = info.replace("◎類  別", "").strip()
        elif info.startswith("◎主  演"):
            # The first actor shares the marker's line; the rest follow one per
            # text node until the next "◎" field starts.
            lead = info.replace("◎主  演", "").strip()
            movie['actors'] = [lead] + _collect_until_marker(infos, index + 1)
        elif info.startswith("◎簡  介"):
            # Join the whole synopsis instead of keeping only the last text node.
            # NOTE(review): collection stops only at the next "◎" marker, so any
            # trailing non-"◎" text in the zoom div is included — confirm
            # against the real page layout.
            lead = info.replace("◎簡  介", "").strip()
            parts = ([lead] if lead else []) + _collect_until_marker(infos, index + 1)
            movie["profile"] = "\n".join(parts)

    movie['download_url'] = _first(html.xpath("//td[@bgcolor='#fdfddf']/a/@href"))
    return movie

# for x in title:

# name = etree.tostring(x,encoding='utf-8').decode("utf-8")

# print(name)

def spider():
    """Crawl listing pages 1-7 and collect the details of every linked movie.

    Returns:
        The list of movie dicts produced by ``parge_detail_page``; the same
        list is also printed once the crawl finishes.
    """
    base_url = "/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for page in range(1, 8):
        # The listing path is site-relative, so resolve it against base_domain
        # the same way the detail URLs are resolved.
        url = base_domain + base_url.format(page)
        for detail_url in get_detal_urls(url):
            # Keep each parsed movie; previously the result was discarded.
            movies.append(parge_detail_page(detail_url))
    print(movies)
    return movies

# Script entry point: start the crawl only when run directly, not on import.
if __name__ == "__main__":
    spider()

電影天堂爬蟲

from lxml import etree import requests base domin url html gndy dyzz list 23 1.html headers def get detail urls url response requests.get url,headers ...

電影天堂爬蟲

import requests from lxml import etree imgs zoom.xpath img src 標籤分開時,注意勿忘 獲取封面圖和縮圖 cover img imgs 0 screenshot imgs 1 movie cover cover img movie scre...

python爬蟲 爬取電影天堂連線

import requests,re,chardet,pymysql from piaot import def shoye url headers req requests.get url,headers headers req.encoding gb2312 html req.text 正則 z...