1. 獲取豆瓣讀書頁資訊(網址已省略),程式碼如下:
2. 獲取豆瓣電影頁資訊(網址已省略),程式碼如下:
# coding:utf-8
import requests
from lxml import etree
# 1. Fetch the Douban Books page.
# NOTE(review): the header values and the target URL are missing from this
# file. An empty dict keeps the script syntactically valid; fill in both
# before running (requests rejects an empty URL).
headers = {}
url = ""
response = requests.get(url, headers=headers)
text = response.text
# Optional: dump the raw page for offline inspection.
# with open("book.html", "wb") as fp:
#     fp.write(response.content)

# 2. Extract the content from the HTML with XPath rules.
# lxml's HTML parser entry point is etree.HTML (upper case).
html = etree.HTML(text)
ul = html.xpath("//ul[@class='list-col list-col5 list-express slide-item']")[0]
# Optional: inspect the matched <ul>, or save it as an HTML file.
# print(etree.tostring(ul, encoding="utf-8").decode("utf-8"))
# with open("ul.html", "wb") as fp:
#     fp.write(etree.tostring(ul, encoding="utf-8"))
lis = ul.xpath(".//li")
# print(etree.tostring(lis[0], encoding="utf-8").decode("utf-8"))

# Loop over every <li> and collect its elements.
books = []
for li in lis:
    meta = li.xpath(".//div[@class='more-meta']")[0]
    # /text() selects the text nodes of the element;
    # strip() removes the surrounding whitespace.
    title = meta.xpath(".//h4[@class='title']/text()")[0].strip()
    author = meta.xpath(".//span[@class='author']/text()")[0].strip()
    year = meta.xpath(".//span[@class='year']/text()")[0].strip()
    publisher = meta.xpath(".//span[@class='publisher']/text()")[0].strip()
    abstract = meta.xpath(".//p[@class='abstract']/text()")[0].strip()
    # NOTE(review): the original dict literal is missing from this file;
    # the keys below simply mirror the variable names.
    book = {
        "title": title,
        "author": author,
        "year": year,
        "publisher": publisher,
        "abstract": abstract,
    }
    books.append(book)

# 3. Output the scraped book records.
print(books)
3. 電影天堂網站的爬取,程式碼如下:
# -*- coding:utf-8 -*-
import requests
from lxml import etree
# 1. Download the page from the target site.
# NOTE(review): the header values and the target URL are missing from this
# file. An empty dict keeps the script syntactically valid; fill in both
# before running (requests rejects an empty URL).
headers = {}
url = ""
response = requests.get(url, headers=headers)
text = response.text
# Mind the difference between response.content (raw bytes) and
# response.text (decoded text): the file is opened in binary mode so the
# bytes are stored unchanged.
# NOTE(review): the body of this `with` block is missing from the file;
# writing response.content is the presumed intent.
with open("responses.html", "wb") as fp:
    fp.write(response.content)

# 2. Extract the data from the downloaded page with XPath rules.
# lxml's HTML parser entry point is etree.HTML (upper case).
html = etree.HTML(text)
ul = html.xpath("//ul[@class='lists']")[0]
lis = ul.xpath("./li")
movies = []
for li in lis:
    # Keyword arguments to xpath() are XPath variables, so the former
    # encoding="utf-8" argument had no effect and is dropped.
    title = li.xpath("@data-title")[0]
    score = li.xpath("@data-score")[0]
    duration = li.xpath("@data-duration")[0]
    region = li.xpath("@data-region")[0]
    thumbnail = li.xpath(".//img/@src")[0]
    # NOTE(review): the original dict literal is missing from this file;
    # the keys below simply mirror the variable names.
    movie = {
        "title": title,
        "score": score,
        "duration": duration,
        "region": region,
        "thumbnail": thumbnail,
    }
    movies.append(movie)

print(movies)
# coding:utf-8
import requests
from lxml import etree
import chardet
# Site root that is prepended to the relative links found on list pages.
# NOTE(review): the value is empty in this file; set it to the site's
# scheme + host before running.
based_url = ""
# Extra HTTP request headers passed to every requests.get() call.
# NOTE(review): the original header values are missing from this file; an
# empty dict keeps the module importable. Add entries (for example a
# User-Agent) if the site requires them.
headers = {}
def get_detailed_urls(url):
    """Collect the detail-page URLs linked from one list page.

    Args:
        url: URL of the list page to download.

    Returns:
        A list of strings, each being the module-level ``based_url``
        joined with one ``href`` found inside a ``table.tbspan`` element.
    """
    # 1. Download the list page.
    response = requests.get(url, headers=headers)
    # response.text is used on purpose: the original author notes that
    # response.content easily causes problems here.
    text = response.text
    # 2. Locate the links to the detail pages.
    # lxml's HTML parser entry point is etree.HTML (upper case).
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a//@href")
    # A list (not a lazy map object) so the result can be iterated more
    # than once; the loop variable no longer shadows the `url` parameter.
    return [based_url + href for href in hrefs]
def parse_detailed_page(url):
    """Scrape a single movie detail page into a dict.

    Args:
        url: URL of the detail page to download.

    Returns:
        A dict with the keys ``"title"``, ``"other_name"``, ``"year"``,
        ``"country"`` and ``"typing"``.

    Raises:
        IndexError: if the title node is not found, or the info block
            holds fewer than six text nodes.
    """
    movie = {}
    # Example detail path: "/html/gndy/dyzz/20180603/56925.html"
    # 1. Download the page; the bytes are decoded explicitly as GBK.
    response = requests.get(url, headers=headers)
    text = response.content.decode("gbk")
    # 2. Locate the wanted content.
    # lxml's HTML parser entry point is etree.HTML (upper case).
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie["title"] = title
    infor = html.xpath("//div[@id='zoom']//span//text()")
    # (result key, position in `infor`, label prefix to strip).
    # Unicode literals replace the Python-2-only "...".decode("utf-8"),
    # which fails on Python 3 where str has no decode().
    fields = (
        ("other_name", 2, u"◎片 名"),
        ("year", 3, u"◎年 代"),
        ("country", 4, u"◎產 地"),
        ("typing", 5, u"◎類 別"),
    )
    for key, index, label in fields:
        movie[key] = infor[index].replace(label, "").strip()
    return movie
def spider():
    """Crawl list pages 1-7, print every scraped field and collect the records.

    Returns:
        A list with one dict per detail page, as produced by
        ``parse_detailed_page``.
    """
    movies = []
    list_path = "/html/gndy/dyzz/list_23_{}.html"
    for i in range(1, 8):
        # The list path is site-relative, and requests needs an absolute
        # URL, so the module-level site root is prepended (the same way
        # get_detailed_urls builds its links).
        url = based_url + list_path.format(i)
        for detailed_url in get_detailed_urls(url):
            movie = parse_detailed_page(detailed_url)
            movies.append(movie)
            for x in movie:
                # Single-argument print() behaves the same on Python 2 and 3.
                print(movie[x])
    return movies
if __name__ == '__main__':
    # Start the crawl only when the file is executed as a script.
    # NOTE(review): the body of this guard is missing from the file;
    # calling spider() is the presumed intent.
    spider()