Pre-define the items
import scrapy

class SuperspiderItem(scrapy.Item):
    title = scrapy.Field()
    date = scrapy.Field()
    content = scrapy.Field()
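For reference, a scrapy.Item behaves like a dict, which is why the spider below can assign fields with subscript syntax. A minimal sketch (the field value is made up for illustration, and the import assumes the default project layout):

from superspider.items import SuperspiderItem  # assumes scrapy startproject superspider

item = SuperspiderItem()
item['title'] = 'example title'
print(item['title'])  # example title
print(dict(item))     # {'title': 'example title'}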
Crawl scope and start_urls
# -*- coding: utf-8 -*-
import scrapy
from superspider.items import SuperspiderItem

page_num = 3  # how many pages to crawl

class Spider1Spider(scrapy.Spider):
    name = 'spider1'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/html/top/report.shtml']

    def parse(self, response):
        tr_list = response.xpath("//div[@class='newshead clearfix']/table[2]//tr")
        for tr in tr_list:
            items = SuperspiderItem()
            # Extract the title with XPath
            items['title'] = tr.xpath("./td[3]/a[1]/@title").extract_first()
            # Extract the date the same way
            items['date'] = tr.xpath("./td[6]//text()").extract_first()
            # Extract the link to the detail page
            content_href = tr.xpath("./td[3]/a[1]/@href").extract_first()
            # About yield: pass the detail-page URL to scrapy.Request, and let
            # callback name the function that will handle the response.
            # meta carries data along with the request as a dict-like object.
            yield scrapy.Request(content_href,
                                 callback=self.get_content,
                                 meta={'title': items['title'], 'date': items['date']})
        new_url = response.xpath("//div[contains(@align,'center')]//@href").extract()
        print(new_url[-2])
        # Limit how many pages get crawled: stop following the next-page link
        # once its URL carries the offset of page_num pages (30 records per page)
        if "page=" + str(page_num * 30) not in new_url[-2]:
            yield scrapy.Request(new_url[-2], callback=self.parse)
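The stop condition hinges on the site paginating in steps of 30 records, so the offset in the next-page URL grows by 30 per page. A quick standalone check of the string the spider looks for (the step of 30 comes from the code above; the exact URL shape of the site is an assumption):

page_num = 3
marker = "page=" + str(page_num * 30)
print(marker)  # page=90 -- once the next-page link contains this, the spider stops following it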
The second function - gathers all the fields and passes them to the pipelines
    # (defined inside Spider1Spider, right after parse)
    def get_content(self, response):
        items = SuperspiderItem()
        items['date'] = response.meta['date']    # pulled back out of the request's meta
        items['title'] = response.meta['title']
        items['content'] = response.xpath("//td[@class='txt16_3']/text()").extract_first()
        yield items
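The meta round trip works because whatever dict is attached to a Request is exposed again on the Response handed to the callback. A minimal sketch of just the Request side (the URL and values are placeholders; no crawl is actually started):

import scrapy

req = scrapy.Request('http://example.com', meta={'title': 't1', 'date': '2018-01-01'})
print(req.meta['title'])  # 't1' -- the same dict that get_content later reads via response.meta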
The pipeline does not do much here, because no real processing is applied to the data; it simply prints it.
class SuperspiderPipeline(object):
    def process_item(self, item, spider):
        items = item
        print('*' * 100)  # separator line between items
        print(items['date'])
        print(items['title'])
        print(items['content'])
        return item  # a pipeline should return the item so later pipelines can still see it
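One thing the post does not show: the pipeline only runs if it is enabled in settings.py. A sketch, assuming the project was generated as scrapy startproject superspider so the module path matches:

# settings.py
ITEM_PIPELINES = {
    'superspider.pipelines.SuperspiderPipeline': 300,  # the number is the run order; lower runs first
}

With that in place, the whole crawl is started from the project directory with scrapy crawl spider1.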