# import modules
import requests
from lxml import etree
import json


def getonepage(url):
    """Fetch one page of information."""
    # the request headers were elided in the original; a User-Agent is normally supplied here
    header = {}
    html = requests.get(url, headers=header)
    return html.text


def parseonpage(text):
    html = etree.HTML(text)
    # movie name
    name = html.xpath("//p[@class='name']/a/text()")
    # starring cast
    star = html.xpath("//p[@class='star']/text()")
    # release time
    releasetime = html.xpath("//p[@class='releasetime']/text()")
    for item in range(len(name)):
        # the yielded value was elided in the original; one record per movie is assumed
        yield {
            'name': name[item],
            'star': star[item],
            'releasetime': releasetime[item],
        }


def write2file(content):
    # the path needs to be redefined for your own machine
    with open(r"c:\users\john\desktop\爬蟲.txt", 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(content, ensure_ascii=False) + '\n')


def main():
    for offset in range(10):
        # the URL template was elided in the original; fill in the board URL before running
        url = "".format(offset * 10)
        text = getonepage(url)
        for item in parseonpage(text):
            write2file(item)
            print(item)


main()
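The request headers above were elided in the original. As a hedged sketch (the User-Agent string, the timeout, and the status check are assumptions, not taken from the original post), getonepage is often written with a browser-style header so the site does not reject the request outright:

import requests


def getonepage(url):
    """Fetch one page, pretending to be a regular browser (assumed header)."""
    header = {
        # assumption: any common desktop-browser User-Agent string works here
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    }
    html = requests.get(url, headers=header, timeout=10)
    # only hand the page text back when the request actually succeeded
    if html.status_code == 200:
        return html.text
    return None

With this variant, main() would skip a page when getonepage returns None instead of passing None into parseonpage.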
# scratch test (html here is the response from an earlier requests.get call)
test = html.text
html = etree.HTML(test)
addr = html.xpath("//div[@class='cinema-info']/a/text()")  # not unique
print(addr)

# XPath path tests
# //*[@id="content"]/div/div[1]/ol/li[4]/div/div[2]/div[1]/a/span[1]
addr = html.xpath("//div[@class='hd']/a/span[1]/text()")  # Douban; not unique
html.xpath("//div[@class='bd']/p/text()")
# //*[@id="content"]/div/div[1]/ol/li[4]/div/div[2]/div[2]/p[1]
# //*[@id="content"]/div/div[1]/ol/li[4]/div/div[2]/div[2]/div/span[2]
html.xpath("//li/div/div[2]/div[2]/div/span[2]/text()")
# //*[@id="content"]/div/div[1]/ol/li[2]/div/div[1]/em
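A quick way to sanity-check expressions like the ones above without hitting the live site is to run them against a small inline fragment. The snippet below is a made-up piece of HTML that mimics the hd / bd layout referenced in the notes; it only verifies the selector syntax, not the real page structure.

from lxml import etree

# made-up fragment mimicking the hd / bd layout referenced in the notes above
sample = """
<ol>
  <li><div><div class="hd"><a href="#"><span>Movie A</span></a></div>
      <div class="bd"><p>director / cast / year</p></div></div></li>
  <li><div><div class="hd"><a href="#"><span>Movie B</span></a></div>
      <div class="bd"><p>director / cast / year</p></div></div></li>
</ol>
"""

html = etree.HTML(sample)
print(html.xpath("//div[@class='hd']/a/span[1]/text()"))  # ['Movie A', 'Movie B']
print(html.xpath("//div[@class='bd']/p/text()"))          # the two description strings

If a selector returns an empty list here, the path itself is wrong; if it works here but not on the real page, the fetched markup differs from what the notes assume.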
# import modules
import requests
from lxml import etree
import json


def getonepage(url):
    """Fetch one page of information."""
    # the request headers were elided in the original; a User-Agent is normally supplied here
    header = {}
    html = requests.get(url, headers=header)
    return html.text


def parseonpage(text):
    html = etree.HTML(text)
    # movie name
    name = html.xpath("//div[@class='exotic item-name']/a/text()")  # not unique
    for item in range(len(name)):
        # the yielded value was elided in the original; one record per name is assumed
        yield {'name': name[item]}


def write2file(content):
    # the path needs to be redefined for your own machine
    with open(r"d:\爬蟲.txt", 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(content, ensure_ascii=False) + '\n')


def main():
    for offset in range(10):
        # the URL template was elided in the original; fill in the target URL before running
        url = "".format(offset)
        text = getonepage(url)
        for item in parseonpage(text):
            write2file(item)
            print(item)


main()
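The "not unique" comments above usually mean the element carries several class names (e.g. class="exotic item-name"), so an exact @class match only works when the full attribute value is given. The following is a hedged sketch, using a made-up fragment, of the difference between an exact match and contains(@class, ...):

from lxml import etree

# made-up fragment with a multi-class div like the one targeted above
sample = '<div class="exotic item-name"><a href="#">Sample item</a></div>'
html = etree.HTML(sample)

# exact attribute match: the whole class string must be given, in the same order
print(html.xpath("//div[@class='exotic item-name']/a/text()"))      # ['Sample item']
print(html.xpath("//div[@class='item-name']/a/text()"))             # [] -- no match

# contains() matches a substring of the class attribute, so a partial name works
print(html.xpath("//div[contains(@class, 'item-name')]/a/text()"))  # ['Sample item']

Note that contains() is plain substring matching, so a class such as other-item-name would also match; for a quick scrape the partial match is usually enough, but a stricter check can compare the full attribute value.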