import requests
from lxml import etree

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # minimal headers; the article's own are not shown in this excerpt

def parse_branch_data(detail_url):
    # Fetch and parse one movie detail page, returning its fields as a dict
    response = requests.get(detail_url, headers=HEADERS)
    html = etree.HTML(response.content)
    movie = {}
    zoom = html.xpath("//div[@id='Zoom']")[0]  # the detail content sits in the "Zoom" div (assumed from the variable name)
    imgs = zoom.xpath(".//img/@src")  # when querying under a sub-element, don't forget the leading dot (.)
    # Grab the cover image and the screenshot
    cover_img = imgs[0]
    screenshot = imgs[1]
    movie['cover'] = cover_img
    movie['screenshot'] = screenshot
    # Strip the label `rule` from the front of `info` and return the remainder
    def parse_info(info, rule):
        return info.replace(rule, '').strip()  # strip() removes leading/trailing whitespace
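    # Illustrative example (values hypothetical): parse_info('◎年 代 2019', '◎年 代') -> '2019'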
    infos = zoom.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith('◎年 代'):
            info = parse_info(info, '◎年 代')
            movie['year'] = info
        elif info.startswith('◎產 地'):
            info = parse_info(info, '◎產 地')
            movie['place'] = info
        elif info.startswith('◎導 演'):
            info = parse_info(info, '◎導 演')
            movie['director'] = info
        elif info.startswith('◎主 演'):
            info = parse_info(info, '◎主 演')
            actors = [info]
            # There can be more than one actor; they occupy the following text nodes
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith('◎標 籤'):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith('◎簡 介'):
            info = parse_info(info, '◎簡 介')
            # The synopsis itself is the next text node after the label
            for x in range(index + 1, len(infos)):
                profile = infos[x].strip()
                break
            movie['profile'] = profile
    # The download link lives in the table cell with this distinctive background colour
    download_url = html.xpath("//td[@bgcolor='#fdfddf']//a/@href")[0]
    movie['download_url'] = download_url
    return movie
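# main() below calls parse_tatol_data(), which this excerpt does not define.
# A minimal sketch under stated assumptions: BASE_DOMAIN and the XPath
# selector are guesses for illustration, not the article's exact code.
BASE_DOMAIN = 'https://www.dytt8.net'  # assumed site root for the relative links

def parse_tatol_data(url):
    # Fetch one list page and collect the detail-page URLs it links to
    response = requests.get(url, headers=HEADERS)
    html = etree.HTML(response.content)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")  # assumed selector for list entries
    return [BASE_DOMAIN + href for href in hrefs]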
# Crawl the data for all 7 list pages
def main():
    basic_url = ''  # URL template truncated in the original; a list-page URL containing '{}' for the page number
    movies = []
    # Build the URL for each of the first seven pages
    for x in range(1, 8):
        url = basic_url.format(x)
        detail_urls = parse_tatol_data(url)
        # Parse every detail page found on this list page
        for detail_url in detail_urls:
            movie = parse_branch_data(detail_url)
            movies.append(movie)

if __name__ == '__main__':
    main()
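# After a run, each entry in `movies` has this shape (field values illustrative only):
# {'cover': '...', 'screenshot': '...', 'year': '2019', 'place': '...',
#  'director': '...', 'actors': ['...'], 'profile': '...', 'download_url': '...'}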