電影天堂爬蟲

2021-09-11 03:23:07 字數 3807 閱讀 6533

from lxml import etree

import requests

# Site root prepended to the relative detail-page links scraped from the
# list pages. NOTE(review): left empty in the scraped source — fill in the
# real domain, e.g. "https://www.dytt8.net", before running.
base_domin = ""

# First list page of the "最新电影" (latest movies) section.
url = "/html/gndy/dyzz/list_23_1.html"

# Minimal request headers. A browser-like User-Agent avoids trivial bot
# blocking. (Bug fix: the original line `headers =` had no value at all,
# which is a SyntaxError.)
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36"
    ),
}

def get_detail_urls(url):
    """Fetch one list page and return the absolute URLs of its detail pages.

    Args:
        url: absolute URL of a list page
            (e.g. base_domin + "/html/gndy/dyzz/list_23_1.html").

    Returns:
        list[str]: detail-page URLs with ``base_domin`` prepended.
    """
    response = requests.get(url, headers=headers)
    # requests guesses the encoding and stores the decoded page on
    # response.text; for this site the guess is wrong (pages are GBK),
    # producing mojibake. Decode the raw bytes explicitly instead.
    # errors="ignore" keeps an occasional bad byte from aborting the crawl.
    text = response.content.decode("gbk", errors="ignore")
    # Bug fix: lxml's HTML parser is etree.HTML (capitalized);
    # etree.html raised AttributeError in the original.
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # Links on the list page are relative; make them absolute. Return a
    # real list rather than a one-shot map iterator so callers may
    # iterate or print it more than once.
    return [base_domin + href for href in detail_urls]

def parse_detail_page(url):
    """Scrape a single movie detail page into a dict.

    Args:
        url: absolute URL of a movie detail page.

    Returns:
        dict with keys ``title``, ``cover``, ``screenshot``,
        ``download_url`` (assumed present on every page) and, when the
        page contains the matching "◎" labels: ``year``, ``country``,
        ``category``, ``douban_rating``, ``duration``, ``director``,
        ``actors`` (list[str]) and ``profile``.
    """
    movie = {}
    response = requests.get(url, headers=headers)
    # Detail pages are GBK-encoded; requests' guessed encoding yields
    # mojibake, so decode the raw bytes explicitly.
    text = response.content.decode("gbk")
    # Bug fix: etree.HTML, not etree.html (which does not exist).
    html = etree.HTML(text)

    title = html.xpath(
        "//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title

    # All metadata lives inside the div#zoom container.
    zoome = html.xpath("//div[@id='zoom']")[0]
    imgs = zoome.xpath(".//img/@src")
    # By page convention the first image is the poster, the second a
    # screenshot — TODO confirm this holds for every listing.
    cover = imgs[0]
    screenshot = imgs[1]
    movie['cover'] = cover
    movie['screenshot'] = screenshot

    infos = zoome.xpath(".//text()")

    def pars_info(info, relu):
        # Strip the "◎xx" label prefix and surrounding whitespace.
        return info.replace(relu, "").strip()

    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):
            movie["year"] = pars_info(info, "◎年  代")
        elif info.startswith("◎產  地"):
            movie["country"] = pars_info(info, "◎產  地")
        elif info.startswith("◎類  別"):
            movie["category"] = pars_info(info, "◎類  別")
        elif info.startswith("◎豆瓣評分"):
            movie["douban_rating"] = pars_info(info, "◎豆瓣評分")
        elif info.startswith("◎片  長"):
            movie["duration"] = pars_info(info, "◎片  長")
        elif info.startswith("◎導  演"):
            movie["director"] = pars_info(info, "◎導  演")
        elif info.startswith("◎主  演"):
            # First actor shares the label line; the rest appear one per
            # text node until the next "◎" label.
            info = pars_info(info, "◎主  演")
            actors = [info]
            for i in range(index + 1, len(infos)):
                actor = infos[i].strip()
                if actor.startswith("◎"):
                    break
                # Bug fix: the original computed `actor` but never
                # appended it, so only the first actor was kept.
                actors.append(actor)
            movie["actors"] = actors
        elif info.startswith("◎簡  介"):
            # The synopsis spans several text nodes, ending at the
            # awards section ("◎獲獎情況").
            profile_parts = []
            for i in range(index + 1, len(infos)):
                profile = infos[i].strip()
                if profile.startswith("◎獲獎情況"):
                    break
                profile_parts.append(profile)
            # Bug fix: the original overwrote movie["profile"] on every
            # iteration, keeping only the last line of the synopsis.
            movie["profile"] = "".join(profile_parts)

    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/text()")[0]
    movie["download_url"] = download_url
    return movie

def spider():
    """Crawl list pages 1-7 and print the accumulated movie dicts.

    For each list page, fetches every detail-page URL, parses it into a
    dict and collects the results; prints the running list after each
    movie (as the original did).
    """
    base_url = "/html/gndy/dyzz/list_23_{}.html"
    # Bug fix: the original line `movies =` had no value (SyntaxError).
    movies = []
    for page in range(1, 8):
        # Progress banner so crawl position is visible in the console.
        print("*" * 20)
        print(page)
        print("*" * 20)
        list_url = base_url.format(page)
        detail_urls = get_detail_urls(list_url)
        for detail_url in detail_urls:
            movie = parse_detail_page(detail_url)
            # Bug fix: the original never appended `movie`, so `movies`
            # stayed empty and nothing was actually collected.
            movies.append(movie)
            print(movies)


if __name__ == "__main__":
    spider()

結果:

電影天堂爬蟲

import requests from lxml import etree imgs zoom.xpath img src 標籤分開時,注意勿忘 獲取封面圖和縮圖 cover img imgs 0 screenshot imgs 1 movie cover cover img movie scre...

python爬蟲3(電影天堂)

from lxml import etree import requests base domain headers def get detal urls url 通過生成頁碼url,獲取到每頁的目錄資訊 url html gndy dyzz list 23 2.html response requ...

Scrapy爬蟲爬取電影天堂

目標 建立專案 scrapy startproject 爬蟲專案檔案的名字 生成 crawlspider 命令 scrapy genspider t crawl 爬蟲名字 爬蟲網域名稱 終端執行 scrapy crawl 爬蟲的名字 python操作mysql資料庫操作 爬蟲檔案 coding ut...