junjun
2023年4月20日
參考:
# Scrape car-model names per vehicle class from the Autohome site and export
# them as CSV files.
#
# NOTE(review): this script was recovered from a mangled copy in which all
# identifiers were lowercased, statements were fused together, and the page
# download call plus both loop bodies were missing. Package/function names
# have been restored to their real (case-sensitive) spellings; the pieces
# marked "assumed" below were reconstructed and must be confirmed.

library(RCurl)
## loading required package: bitops
# install.packages("XML")
library(XML)
library(reshape) # loaded by the original script (it used reshape::rename())

# Disguised request headers, so the site does not reject the scraper.
myheader <- c(
  "user-agent" = "mozilla/5.0(windows;u;windows nt 5.1;zh-cn;rv:1.9.1.6",
  # NOTE(review): the original Accept value was lost; this is an assumed,
  # generic browser value -- confirm or adjust.
  "accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "accept-language" = "en-us",
  "connection" = "keep-alive",
  "accept-charset" = "gb2312,utf-8;q=0.7,*;q=0.7"
)

# NOTE(review): the site root was not present in the recovered source; it is
# assumed from the output folder name (Autohome) -- confirm before running.
base_url <- "https://www.autohome.com.cn/"

#' Download one vehicle-class page and extract its car-model names.
#'
#' @param url Full URL of the vehicle-class page.
#' @param tip Label stored in the `tips` column (the class path, e.g. "a00/").
#' @param header Named character vector of HTTP request headers.
#' @return A data.frame with character columns `model` and `tips`; zero rows
#'   when the page contains no matching nodes.
scrape_series <- function(url, tip, header = myheader) {
  page <- getURL(url, httpheader = header)

  # Transcode the page, then parse it as UTF-8.
  page_utf8 <- iconv(page, "gb2312", "utf-8")
  if (length(page_utf8) != 1L || is.na(page_utf8)) {
    stop("Could not convert page from gb2312 to utf-8: ", url, call. = FALSE)
  }
  doc <- htmlParse(page_utf8, asText = TRUE, encoding = "utf-8")

  # Car models including those on sale. This XPath can return the same car
  # several times (the original author saw 4 repeats), so duplicates are
  # expected and are left to be removed after export.
  # Alternative for not-yet-listed models only (grey links):
  #   '//a[contains(@class,"greylink")]/text()'
  # Other XPaths the author experimented with:
  #   getNodeSet(doc, '//div[@class="uibox"]')
  #   getNodeSet(doc, '//div[@class="h3-tit"]/text()')   # car company
  nodes <- getNodeSet(doc, "//li/h4/a/text()")

  # Convert the XMLNodeSet to a character vector; vapply() keeps the result
  # a character vector even when no nodes matched.
  data.frame(
    model = vapply(nodes, xmlValue, character(1)),
    tips = rep(tip, length(nodes)),
    stringsAsFactors = FALSE
  )
}

# 1) Micro-car scraping test ------------------------------------------------
a00 <- scrape_series(paste0(base_url, "a00/"), "a00/")
data1 <- a00
write.csv(a00, file = "e:\\新技術\\爬蟲\\汽車之家/微型車.csv")

# 2) URL path of every vehicle class ----------------------------------------
# Classes listed by the original author, in order: micro car (scraped above),
# small car, compact car, mid-size car, mid-to-large car, luxury car, MPV,
# sports car, pickup, mini van, light bus, small SUV, compact SUV,
# mid-size SUV, mid-to-large SUV, full-size SUV.
# NOTE(review): the one-to-one mapping of these names onto the paths below is
# presumed from their order -- confirm.
series <- c(
  "a0/", "a/", "b/", "c/", "d/", "mpv/", "s/", "p/", "mb/", "qk/",
  "suva0/", "suva/", "suvb/", "suvc/", "suvd/"
)

# Build the URL list in one vectorised step instead of growing it in a loop.
urllist <- paste0(base_url, series)

# Scrape every class and append the results to the micro-car rows.
# unname() stops rbind() from using the URLs as row-name prefixes.
series_tables <- unname(Map(scrape_series, urllist, series))
data1 <- do.call(rbind, c(list(data1), series_tables))
rownames(data1) <- NULL

# Duplicates are intentionally left in; de-duplicate after export.
write.csv(data1, file = "e:\\新技術\\爬蟲\\汽車之家/auto全車型.csv")
RCurl抓取團購資訊
抓取 資訊 myheader c accept language en us connection keep alive accept charset gb2312,utf 8 q 0.7,q 0.7 偽裝 header 防止不能爬取 temp geturl httpheader myheader,...
爬取汽車之家
爬汽車之家新聞 爬取汽車之家新聞 import requests 向汽車之家傳送get請求,獲取到頁面 ret requests.get print ret.text 用bs4解析 from bs4 import beautifulsoup 例項化得到物件,傳入要解析的文字,解析器 html.par...
scrapy獲取汽車之家資料
1 建立scrapy專案 2 找到對應介面 3 建立爬蟲檔案 cd scrapy carhome scrapy carhome spiders scrapy carhome scrapy carhome spiders scrapy genspider car 4 注釋robots協議 注意如果你的...