電影天堂爬蟲

2021-09-16 18:37:15 字數 1806 閱讀 4701

import requests

from lxml import etree

imgs = zoom.xpath(".//img/@src")#標籤分開時,注意勿忘  (.)
#獲取封面圖和縮圖

cover_img = imgs[0]

screenshot = imgs[1]

movie[『cover』] = cover_img

movie[『screenshot』] = screenshot

def parse_info(info, rule):
    """Strip the leading label *rule* from *info* and return the remainder.

    Used to turn listing lines such as '◎年  代 2021' into '2021'.

    :param info: raw text line extracted from the detail page
    :param rule: label prefix to remove (e.g. '◎年  代')
    :return: *info* with *rule* removed and surrounding whitespace trimmed
    """
    # The original used CJK corner quotes 』』 here, which is a syntax
    # error; the intent is to delete the label and trim leftover spaces.
    return info.replace(rule, '').strip()

infos = zoom.xpath(".//text()")

for index,info in enumerate(infos):

if info.startswith('◎年  代'):

info = parse_info(info,'◎年  代')

movie['year'] = info

elif info.startswith('◎產  地'):

info = parse_info(info,'◎產  地')

movie['place'] = info

elif info.startswith('◎導  演'):

info = parse_info(info,'◎導  演')

movie['director'] = info

elif info.startswith('◎主  演'):

info = parse_info(info,'◎主  演')

actors = [info]

#演員不止一位

for x in range(index +1,len(infos)):

actor = infos[x].strip()

if actor.startswith('◎標  籤'):

break

movie['actors'] = actors

elif info.startswith('◎簡  介 '):

info = parse_info(info,'◎簡  介')

for x in range(index +1,len(infos)):

profile = infos[x].strip()

break

movie['profile'] = profile

download_url = html.xpath("//td[@bgcolor='#fdfddf']//a/@href")[0]

movie['download_url'] = download_url

return movie

#獲取7頁總的資料

def main():
    """Crawl the first seven listing pages and parse every movie detail page.

    For each listing page, collect the detail-page URLs with
    ``parse_tatol_data`` and parse each one with ``parse_branch_data``,
    accumulating the resulting movie dicts.
    """
    # NOTE(review): the original URL literal was destroyed in transcription
    # (only `basic_url = 『` survived). The surrounding excerpts mention
    # "html gndy dyzz list 23 1.html", which matches this well-known
    # pattern — confirm against the live site before running.
    basic_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    # Pages 1 through 7 of the listing.
    for page in range(1, 8):
        url = basic_url.format(page)
        detail_urls = parse_tatol_data(url)
        # Parse every detail page linked from this listing page.
        for detail_url in detail_urls:
            movie = parse_branch_data(detail_url)
            # The original assigned `movie` but never collected it;
            # accumulate results as the `movies` list clearly intends.
            movies.append(movie)

# Script entry point: the original line was garbled to `ifname== 『main』:`
# (lost underscores and CJK quotes); this is the standard guard it intends.
if __name__ == '__main__':
    main()

電影天堂爬蟲

from lxml import etree import requests base domain url html gndy dyzz list 23 1.html headers def get detail urls url response requests.get url,headers ...

python爬蟲3(電影天堂)

from lxml import etree import requests base domain headers def get detal urls url 通過生成頁碼url,獲取到每頁的目錄資訊 url html gndy dyzz list 23 2.html response requ...

Scrapy爬蟲爬取電影天堂

目標 建立專案 scrapy startproject 爬蟲專案檔案的名字 生成 crawlspider 命令 scrapy genspider t crawl 爬蟲名字 爬蟲網域名稱 終端執行 scrapy crawl 爬蟲的名字 python操作mysql資料庫操作 爬蟲檔案 coding ut...