from lxml import etree
import requests
# Site root prepended to the relative detail-page links found on listing
# pages.  NOTE(review): empty in the original scrape — fill in the target
# site's domain (e.g. "https://www.dytt8.net") before running.
base_domin = ""

# First listing page (relative path); page number is the trailing "_1".
url = "/html/gndy/dyzz/list_23_1.html"

# HTTP headers sent with every request.  The original line was the bare
# statement `headers =` (a syntax error — its value was lost when the post
# was scraped); restored as a dict with a browser-like User-Agent, which
# this kind of site commonly requires to avoid trivial bot blocking.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
    ),
}
def get_detail_urls(url):
    """Fetch one listing page and return the absolute detail-page URLs on it.

    Args:
        url: absolute URL of a listing page.

    Returns:
        list[str]: one absolute URL per movie detail page linked from the
        listing table.
    """
    response = requests.get(url, headers=headers)
    # requests guesses the page encoding and decodes it into response.text.
    # For this site the guess is wrong (pages are GBK), so the author was
    # inspecting the guessed encoding here; kept for parity with the
    # original's debug output.
    print(response.encoding)
    text = response.text
    # BUG FIX: lxml's HTML parser is etree.HTML (capitalised).  The original
    # called etree.html, which raises AttributeError at runtime.
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # Links in the listing are site-relative; prefix the domain.  Return a
    # real list rather than the original's lazy `map` object, which could be
    # iterated only once by callers.
    return [base_domin + href for href in detail_urls]
def parse_detail_page(url):
    """Fetch one movie detail page and extract its metadata.

    Args:
        url: absolute URL of the detail page.

    Returns:
        dict: movie fields — ``title``, ``cover``, ``screenshot``, ``year``,
        ``country``, ``category``, ``douban_rating``, ``duration``,
        ``director``, ``actors`` (list[str]), ``profile``, ``download_url``.
        A field is absent when the page does not carry it.
    """
    movie = {}
    response = requests.get(url, headers=headers)
    # The site serves GBK-encoded pages; decode explicitly instead of
    # trusting requests' (wrong) guessed encoding.
    text = response.content.decode("gbk")
    # BUG FIX: etree.HTML (capitalised) is the lxml HTML parser; the
    # original's etree.html raises AttributeError.
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title
    zoome = html.xpath("//div[@id='zoom']")[0]
    imgs = zoome.xpath(".//img/@src")
    # Page convention: first image is the poster, second the screenshot.
    # Guard the indexing — the original raised IndexError on pages with
    # fewer than two images.
    if len(imgs) > 0:
        movie['cover'] = imgs[0]
    if len(imgs) > 1:
        movie['screenshot'] = imgs[1]
    infos = zoome.xpath(".//text()")

    def pars_info(info, relu):
        # Strip the "◎…" field label and surrounding whitespace.
        return info.replace(relu, "").strip()

    for index, info in enumerate(infos):
        if info.startswith("◎年 代"):
            movie["year"] = pars_info(info, "◎年 代")
        elif info.startswith("◎產 地"):
            movie["country"] = pars_info(info, "◎產 地")
        elif info.startswith("◎類 別"):
            movie["category"] = pars_info(info, "◎類 別")
        elif info.startswith("◎豆瓣評分"):
            movie["douban_rating"] = pars_info(info, "◎豆瓣評分")
        elif info.startswith("◎片 長"):
            movie["duration"] = pars_info(info, "◎片 長")
        elif info.startswith("◎導 演"):
            movie["director"] = pars_info(info, "◎導 演")
        elif info.startswith("◎主 演"):
            # The first actor shares the label line; each following line
            # holds one more actor until the next "◎" field begins.
            actors = [pars_info(info, "◎主 演")]
            for i in range(index + 1, len(infos)):
                actor = infos[i].strip()
                if actor.startswith("◎"):
                    break
                # BUG FIX: the original computed `actor` but never appended
                # it, so every actor after the first was silently dropped.
                actors.append(actor)
            movie["actors"] = actors
        elif info.startswith("◎簡 介"):
            # Synopsis spans the lines after the label up to the awards
            # section.  BUG FIX: the original overwrote movie["profile"] on
            # every iteration, keeping only the last line — collect and join
            # all lines instead.
            profile_lines = []
            for i in range(index + 1, len(infos)):
                line = infos[i].strip()
                if line.startswith("◎獲獎情況"):
                    break
                profile_lines.append(line)
            movie["profile"] = "\n".join(profile_lines)
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/text()")[0]
    movie["download_url"] = download_url
    return movie
def spider():
    """Crawl listing pages 1–7, scrape every movie detail page, and print
    the accumulated results.

    Side effects only (network requests and prints); returns nothing.
    """
    base_url = "/html/gndy/dyzz/list_23_{}.html"
    # BUG FIX: the original read the bare statement `movies =` (a syntax
    # error — its value was lost in the scrape); restore the accumulator.
    movies = []
    for page in range(1, 8):
        # Visual separator so each page's output is easy to spot in the log.
        print("*" * 20)
        print(page)
        print("*" * 20)
        # Prefix the domain so get_detail_urls receives an absolute URL
        # (base_domin is currently "" so this is a no-op until it is set).
        url = base_domin + base_url.format(page)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            movie = parse_detail_page(detail_url)
            # BUG FIX: the original never appended the scraped movie, so
            # the print below showed an unchanging value.
            movies.append(movie)
            print(movies)
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    spider()
結果:
電影天堂爬蟲
import requests from lxml import etree imgs zoom.xpath img src 標籤分開時,注意勿忘 獲取封面圖和縮圖 cover img imgs 0 screenshot imgs 1 movie cover cover img movie scre...
python爬蟲3(電影天堂)
from lxml import etree import requests base domain headers def get detal urls url 通過生成頁碼url,獲取到每頁的目錄資訊 url html gndy dyzz list 23 2.html response requ...
Scrapy爬蟲爬取電影天堂
目標 建立專案 scrapy startproject 爬蟲專案檔案的名字 生成 crawlspider 命令 scrapy genspider t crawl 爬蟲名字 爬蟲網域名稱 終端執行 scrapy crawl 爬蟲的名字 python操作mysql資料庫操作 爬蟲檔案 coding ut...