Crawling the Maoyan ranking with XPath

2021-10-01 16:47:49

# import modules
import requests
from lxml import etree
import json

def getonepage(url):
    """Fetch one page of the listing."""
    # the headers dict is missing in the source; supply your own (e.g. a User-Agent)
    header = {}
    html = requests.get(url, headers=header)
    return html.text

def parseonpage(text):
    html = etree.HTML(text)
    # movie title
    name = html.xpath("//p[@class='name']/a/text()")
    # starring cast
    star = html.xpath("//p[@class='star']/text()")
    # release date
    releasetime = html.xpath("//p[@class='releasetime']/text()")
    for item in range(len(name)):
        # the yield body is missing in the source; the implied record is the three fields at this index
        yield {
            'name': name[item],
            'star': star[item].strip(),
            'releasetime': releasetime[item],
        }

def write2file(content):
    # redefine the path for your own machine
    with open(r"c:\users\john\desktop\爬蟲.txt", 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(content, ensure_ascii=False) + '\n')

def main():
    for offset in range(10):
        # the list URL is missing in the source; fill it in before running
        url = "".format(offset * 10)
        text = getonepage(url)
        for item in parseonpage(text):
            write2file(item)
            print(item)

main()
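A side note on the parse step: the three XPath queries above return three parallel lists and rely on them staying aligned by index. Below is a minimal sketch of a per-item variant, assuming each movie on the list page is wrapped in its own dd element (an assumption about the page markup, not something the code above states), which scopes the same class-based paths to one node at a time:

from lxml import etree

def parse_by_item(text):
    html = etree.HTML(text)
    # assumption: one dd element per movie on the list page
    for dd in html.xpath("//dd"):
        name = dd.xpath(".//p[@class='name']/a/text()")
        star = dd.xpath(".//p[@class='star']/text()")
        releasetime = dd.xpath(".//p[@class='releasetime']/text()")
        # skip nodes that do not carry all three fields
        if name and star and releasetime:
            yield {
                'name': name[0],
                'star': star[0].strip(),
                'releasetime': releasetime[0],
            }

This keeps each record tied to a single node, so a missing field in one entry cannot shift every later record by one position.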

test = html.text
html = etree.HTML(test)
addr = html.xpath("//div[@class='cinema-info']/a/text()")  # not unique
print(addr)

XPath path tests:

//*[@id="content"]/div/div[1]/ol/li[4]/div/div[2]/div[1]/a/span[1]
Douban: addr = html.xpath("//div[@class='hd']/a/span[1]/text()")  # not unique
html.xpath("//div[@class='bd']/p/text()")
//*[@id="content"]/div/div[1]/ol/li[4]/div/div[2]/div[2]/p[1]
//*[@id="content"]/div/div[1]/ol/li[4]/div/div[2]/div[2]/div/span[2]
html.xpath("//li/div/div[2]/div[2]/div/span[2]/text()")
//*[@id="content"]/div/div[1]/ol/li[2]/div/div[1]/em
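Tying these scratch tests together, here is a minimal sketch that runs the class-based paths from the notes above in one pass. The list-page URL and the User-Agent header are assumptions for illustration only; the positional span[2] path is kept exactly as noted, without claiming what it selects.

import requests
from lxml import etree

# assumed list-page URL (Douban Top 250); swap in the page the paths were copied from
url = "https://movie.douban.com/top250"
headers = {"User-Agent": "Mozilla/5.0"}  # placeholder header
html = etree.HTML(requests.get(url, headers=headers).text)

titles = html.xpath("//div[@class='hd']/a/span[1]/text()")        # class-based path from the notes
values = html.xpath("//li/div/div[2]/div[2]/div/span[2]/text()")  # positional path from the notes

for title, value in zip(titles, values):
    print(title, value)

The //div[@class='bd']/p/text() path from the notes can be run the same way to pull the text nodes of each entry's description paragraph.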

# import modules
import requests
from lxml import etree
import json

def getonepage(url):
    """Fetch one page of the listing."""
    # the headers dict is missing in the source; supply your own
    header = {}
    html = requests.get(url, headers=header)
    return html.text

def parseonpage(text):
    html = etree.HTML(text)
    # item name (the class is not unique on the page)
    name = html.xpath("//div[@class='exotic item-name']/a/text()")
    for item in range(len(name)):
        # the yield body is missing in the source; only the name list is extracted here
        yield name[item]

def write2file(content):
    # redefine the path for your own machine
    with open(r"d:\爬蟲.txt", 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(content, ensure_ascii=False) + '\n')

def main():
    for offset in range(10):
        # the list URL is missing in the source; fill it in before running
        url = "".format(offset)
        text = getonepage(url)
        for item in parseonpage(text):
            write2file(item)
            print(item)

main()
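In both versions above, the headers dict and the list URL were stripped out of the source, so neither script runs as shown until those are restored. Below is a minimal sketch of the fetch helper with a placeholder header, a loud failure on non-2xx responses, and a short pause between pages; the User-Agent string and the one-second delay are illustrative choices, not part of the original code.

import time
import requests

def getonepage(url):
    # placeholder header; replace with whatever the target site actually requires
    header = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=header, timeout=10)
    response.raise_for_status()  # fail loudly on a non-2xx response
    time.sleep(1)  # brief pause so consecutive page requests do not hammer the server
    return response.text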

