1. Fetching the book information from the Douban Books page; the code is as follows:
# coding:utf-8
import requests
from lxml import etree

# 1. Fetch the Douban Books page
headers = {
    # any common desktop browser User-Agent string works here
    "User-Agent": "Mozilla/5.0",
}
url = ""  # URL of the Douban Books page to crawl (left blank in the original)
response = requests.get(url, headers=headers)
text = response.text
# with open("book.html", "w") as fp:
#     fp.write(response.content)

# 2. Extract the content from the HTML with XPath rules
html = etree.HTML(text)
ul = html.xpath("//ul[@class='list-col list-col5 list-express slide-item']")[0]
# print etree.tostring(ul, encoding="utf-8").decode("utf-8")
# save it as an html file:
# with open("ul.html", "w") as fp:
#     fp.write(etree.tostring(ul, encoding="utf-8"))
lis = ul.xpath(".//li")
# print etree.tostring(lis[0], encoding="utf-8").decode("utf-8")

# loop over the li elements and pull out the fields we want
books = []
for li in lis:
    meta = li.xpath(".//div[@class='more-meta']")[0]
    # /text() grabs the text node, strip() removes the surrounding whitespace
    title = meta.xpath(".//h4[@class='title']/text()")[0].strip()
    author = meta.xpath(".//span[@class='author']/text()")[0].strip()
    year = meta.xpath(".//span[@class='year']/text()")[0].strip()
    publisher = meta.xpath(".//span[@class='publisher']/text()")[0].strip()
    abstract = meta.xpath(".//p[@class='abstract']/text()")[0].strip()
    book = {
        "title": title,
        "author": author,
        "year": year,
        "publisher": publisher,
        "abstract": abstract,
    }
    books.append(book)

# 3. Do something with the scraped books (here we simply print them)
print books
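The step 3 comment above only prints the result; if you actually want to persist it, a minimal sketch could look like the following (the books.json file name is my own choice, and it assumes the books list built by the script above):

import json

# write the scraped list to disk; the default ensure_ascii=True keeps the
# output plain ASCII, which avoids Python 2 byte/unicode pitfalls
with open("books.json", "w") as fp:
    json.dump(books, fp, indent=2)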
2. Fetching the movie information from the Douban Movies page; the code is as follows:
# -*- coding: utf-8 -*-
import requests
from lxml import etree

# 1. Fetch the page from the target site
headers = {
    # any common desktop browser User-Agent string works here
    "User-Agent": "Mozilla/5.0",
}
url = ""  # URL of the Douban Movies page to crawl (left blank in the original)
response = requests.get(url, headers=headers)
text = response.text
with open("responses.html", "w") as fp:
    fp.write(response.content)
# note the difference between the data types of response.content and response.text

# 2. Extract the data from the fetched page with XPath rules
html = etree.HTML(text)
ul = html.xpath("//ul[@class='lists']")[0]
lis = ul.xpath("./li")
movies = []
for li in lis:
    # the movie details are stored as data-* attributes on each <li>
    title = li.xpath("@data-title")[0]
    score = li.xpath("@data-score")[0]
    duration = li.xpath("@data-duration")[0]
    region = li.xpath("@data-region")[0]
    thumbnail = li.xpath(".//img/@src")[0]
    movie = {
        "title": title,
        "score": score,
        "duration": duration,
        "region": region,
        "thumbnail": thumbnail,
    }
    movies.append(movie)
print movies
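The comment in step 1 hints at the difference between response.content and response.text; a quick way to see it, reusing the response object from the script above, is something like this:

# response.content -> the raw bytes exactly as received from the server
# response.text    -> those bytes decoded to unicode with the charset requests guessed
print type(response.content)   # <type 'str'> under Python 2, i.e. bytes
print type(response.text)      # <type 'unicode'>
print response.encoding        # the charset requests inferred from the HTTP headers
# decoding by hand gives full control when that guess is wrong:
text = response.content.decode(response.encoding or "utf-8")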
3. Crawling the 電影天堂 (Dianying Tiantang) movie site; the code is as follows:
# coding:utf-8
import requests
from lxml import etree
import chardet

based_url = ""  # site root of 電影天堂 (left blank in the original)
headers = {
    # any common desktop browser User-Agent string works here
    "User-Agent": "Mozilla/5.0",
}

def get_detailed_urls(url):
    # 1. fetch the list page
    response = requests.get(url, headers=headers)
    # note: using response.content directly here easily causes encoding problems
    text = response.text
    # 2. collect the links to the detail pages
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a//@href")
    detailed_urls = map(lambda url: based_url + url, hrefs)
    return detailed_urls

def parse_detailed_page(url):
    movie = {}
    # url = "/html/gndy/dyzz/20180603/56925.html"
    # 1. fetch the detail page (the site serves GBK-encoded pages)
    response = requests.get(url, headers=headers)
    text = response.content.decode("gbk")
    # 2. pull out the fields we need
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie["title"] = title
    infor = html.xpath("//div[@id='zoom']//span//text()")
    other_name = infor[2].replace("◎片 名".decode("utf-8"), "").strip()
    movie["other_name"] = other_name
    year = infor[3].replace("◎年 代".decode("utf-8"), "").strip()
    movie["year"] = year
    country = infor[4].replace("◎產 地".decode("utf-8"), "").strip()
    movie["country"] = country
    typing = infor[5].replace("◎類 別".decode("utf-8"), "").strip()
    movie["typing"] = typing
    return movie

def spider():
    movies = []
    base_url = "/html/gndy/dyzz/list_23_{}.html"
    for i in range(1, 8):
        # prepend the site root so the list-page URL is absolute
        url = based_url + base_url.format(i)
        detailed_urls = get_detailed_urls(url)
        for detailed_url in detailed_urls:
            movie = parse_detailed_page(detailed_url)
            movies.append(movie)
            for x in movie:
                print movie[x]
            break
        break
    return movies

if __name__ == '__main__':
    spider()
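chardet is imported at the top of this script but never used; one place it could fit (a sketch of my own, not part of the original code) is guessing the page encoding instead of hard-coding gbk in parse_detailed_page:

import chardet

def decode_response(response):
    # chardet.detect() inspects the raw bytes and returns something like
    # {"encoding": "GB2312", "confidence": 0.99, "language": "Chinese"}
    guess = chardet.detect(response.content)
    encoding = guess["encoding"] or "gbk"
    # "replace" keeps a few undecodable bytes from killing the whole page
    return response.content.decode(encoding, "replace")

# usage inside parse_detailed_page:
#     text = decode_response(response)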