import requests
from lxml import etree

# Site root prepended to the relative hrefs scraped from listing pages.
# NOTE(review): value was scrubbed in the original paste (empty string) —
# fill in the real domain (e.g. the dytt site root) before running.
base_domain = ''

# Request headers sent with every HTTP call.
# NOTE(review): the original assignment was truncated ("headers =");
# reconstructed minimally with a desktop User-Agent so the site does not
# reject the scraper — confirm against the original tutorial.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}
def get_detal_urls(url):
    """Fetch one listing page and return an iterable of absolute detail-page URLs.

    Args:
        url: URL of a paginated listing page,
             e.g. "/html/gndy/dyzz/list_23_2.html".

    Returns:
        A map object yielding ``base_domain + href`` for every movie link
        found in the listing table.
    """
    response = requests.get(url, headers=headers)
    # response.text lets requests guess the encoding; the site is GBK, but
    # href attributes are ASCII so this is sufficient for URL extraction.
    text = response.text
    # BUGFIX: the module is lxml.etree and the parser entry point is
    # etree.HTML (uppercase); etree.html raised AttributeError.
    html = etree.HTML(text)
    # "//" selects anywhere in the document; "@href" extracts the attribute.
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # Hrefs on the listing page are relative; prefix the site root.
    return map(lambda href: base_domain + href, detail_urls)
def parge_detail_page(url):
    """Fetch one movie detail page and extract its metadata.

    (Name kept as-is — "parge" is a typo for "parse" — so existing callers
    keep working.)

    Args:
        url: Absolute URL of a movie detail page.

    Returns:
        dict with keys: 'title', 'cover', 'screenshot', 'info' (year),
        'contry' (sic — country; key kept for backward compatibility),
        'category', 'actors' (list), 'profile' (synopsis), 'download_url'.
    """
    movie = {}
    response = requests.get(url, headers=headers)
    # Detail pages are served in GBK; decode explicitly.
    text = response.content.decode('gbk')
    # BUGFIX: etree.HTML (uppercase), not etree.html.
    html = etree.HTML(text)

    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title

    # The whole description block lives in <div id="zoom">.
    zoome = html.xpath("//div[@id='zoom']")[0]
    imgs = zoome.xpath(".//img/@src")
    # First image is the poster, second is a screenshot — assumes the page
    # always carries at least two images; TODO confirm on sparse pages.
    movie['cover'] = imgs[0]
    movie['screenshot'] = imgs[1]

    # All text nodes of the block; metadata lines are "◎<label>...<value>".
    infos = zoome.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年 代"):
            # Strip the label, keep the value (release year).
            movie['info'] = info.replace("◎年 代", "").strip()
        elif info.startswith("◎產 地"):
            # Key 'contry' (sic) kept to preserve the original output schema.
            movie['contry'] = info.replace("◎產 地", "").strip()
        elif info.startswith("◎類 別"):
            movie['category'] = info.replace("◎類 別", "").strip()
        elif info.startswith("◎主 演"):
            # First actor is on the label line; the rest follow one per
            # text node until the next "◎" label.
            actors = [info.replace("◎主 演", "").strip()]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                # BUGFIX: the original computed `actor` but never appended
                # it, so only the first actor was ever recorded.
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎簡 介"):
            # The synopsis is the first non-empty text node after the
            # label. BUGFIX: the original overwrote movie["profile"] with
            # every remaining node, ending up with unrelated trailing text.
            for x in range(index + 1, len(infos)):
                profile = infos[x].strip()
                if profile.startswith("◎"):
                    break
                if profile:
                    movie["profile"] = profile
                    break

    # The magnet/ftp link sits in the uniquely-colored table cell.
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
    movie['download_url'] = download_url
    return movie
def spider():
    """Crawl listing pages 1-7, scrape every movie detail page, print results.

    Side effects: network requests via get_detal_urls / parge_detail_page,
    and a final print of the collected movie dicts.
    """
    # Relative listing-page template; {} is the 1-based page number.
    base_url = "/html/gndy/dyzz/list_23_{}.html"
    # BUGFIX: the original assignment was truncated ("movies =") and the
    # scraped movie was never collected — initialize the accumulator here.
    movies = []
    for page in range(1, 8):
        # Build the listing URL for this page.
        url = base_url.format(page)
        # Each listing page yields the URLs of its movie detail pages.
        detail_urls = get_detal_urls(url)
        for detail_url in detail_urls:
            # Scrape one detail page into a metadata dict and collect it.
            movie = parge_detail_page(detail_url)
            movies.append(movie)
    # BUGFIX: print once after the crawl, not on every inner iteration.
    print(movies)


if __name__ == '__main__':
    spider()
電影天堂爬蟲
from lxml import etree import requests base domin url html gndy dyzz list 23 1.html headers def get detail urls url response requests.get url,headers ...
電影天堂爬蟲
import requests from lxml import etree imgs zoom.xpath img src 標籤分開時,注意勿忘 獲取封面圖和縮圖 cover img imgs 0 screenshot imgs 1 movie cover cover img movie scre...
python爬蟲 爬取電影天堂連線
import requests,re,chardet,pymysql from piaot import def shoye url headers req requests.get url,headers headers req.encoding gb2312 html req.text 正則 z...