注:爬取後的資訊將以json格式儲存,並將檔案命名為「recruit.json」,可用notepad++開啟。
# -*- coding: utf-8 -*-
import scrapy
class txhritem(scrapy.Item):
    """Item holding one Tencent job posting scraped from the HR listing page.

    The source used ``scrapy.item`` / ``scrapy.field`` (lowercase), which do
    not exist — the Scrapy API names are ``scrapy.Item`` and ``scrapy.Field``.
    """

    # Job title (職位名稱)
    positionname = scrapy.Field()
    # Job category (職位類別); may be empty on the page — see the spider
    positiontype = scrapy.Field()
    # Number of openings (需求人數)
    neednum = scrapy.Field()
    # Work location (工作地點)
    workingspace = scrapy.Field()
    # Publish date (發布時間)
    publishtime = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy
from txhr.items import txhritem
class txhrspiderspider(scrapy.Spider):
    """Spider crawling Tencent's social-recruitment listings page by page.

    Pages are addressed by an integer offset appended to ``initialurl``;
    each page lists jobs in alternating ``<tr class="even">`` /
    ``<tr class="odd">`` rows, 10 records per page.
    """

    name = 'txhr'
    allowed_domains = ['tencent.com']
    # Base listing URL the page offset is appended to.
    # NOTE(review): empty in the source article — fill in the real listing
    # endpoint (the hr.tencent.com position list URL) before running.
    initialurl = ''
    # Current page offset; advanced by 10 after each parsed page.
    bias = 0
    url = initialurl + str(bias)
    start_urls = [url]

    def parse(self, response):
        """Yield one txhritem per job row, then request the next page."""
        # Job rows alternate between class="even" and class="odd".
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = txhritem()
            item['positionname'] = each.xpath("td[1]/a/text()").extract()[0]
            # Some rows have an empty "job category" cell: extract() then
            # returns an empty list, and indexing [0] would raise IndexError.
            test_null = each.xpath("td[2]/text()").extract()
            if not test_null:
                item['positiontype'] = "null"
            else:
                item['positiontype'] = test_null[0]
            item['neednum'] = each.xpath("td[3]/text()").extract()[0]
            item['workingspace'] = each.xpath("td[4]/text()").extract()[0]
            item['publishtime'] = each.xpath("td[5]/text()").extract()[0]
            yield item
        self.bias += 10
        # Crawl only the first 1000 postings.
        if self.bias < 1000:
            url = self.initialurl + str(self.bias)
            yield scrapy.Request(url, callback=self.parse)
# -*- coding: utf-8 -*-
import json
class txhrpipeline(object):
    """Item pipeline that appends each scraped item to ``recruit.json``,
    one JSON object per line (JSON Lines format)."""

    def __init__(self):
        # Opened once per crawl; closed in close_spider(). UTF-8 is required
        # because ensure_ascii=False writes raw Chinese characters and the
        # platform default encoding may not accept them.
        self.output = open("recruit.json", 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize the item to a JSON line, write it, and pass it through.

        The source had ``ensure_ascii=false`` — a NameError; the Python
        constant is ``False``.
        """
        jsontext = json.dumps(obj=dict(item), ensure_ascii=False) + '\n'
        self.output.write(jsontext)
        return item

    def close_spider(self, spider):
        # Scrapy invokes close_spider(spider); the source omitted the
        # `spider` parameter, which would raise TypeError at shutdown.
        self.output.close()
# Scrapy project settings (settings.py). Scrapy only recognises settings
# written in UPPER_CASE; the lowercase names in the source were a
# scrape-mangling artifact.
BOT_NAME = 'txhr'

SPIDER_MODULES = ['txhr.spiders']
NEWSPIDER_MODULE = 'txhr.spiders'

# Obey robots.txt rules while crawling.
ROBOTSTXT_OBEY = True

# NOTE(review): the values of the next two settings were truncated out of the
# source article; conventional values are restored below — verify them
# against the original project before relying on them.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Route every scraped item through the JSON-writing pipeline.
ITEM_PIPELINES = {
    'txhr.pipelines.txhrpipeline': 300,
}
《scrapy爬蟲》爬取騰訊社招資訊
dos視窗輸入 scrapy startproject tencent cd tencent coding utf 8 define here the models for your scraped items see documentation in import scrapy class ten...
python3 scrapy 爬取騰訊招聘
安裝scrapy不再贅述,在控制台中輸入scrapy startproject tencent 建立爬蟲專案名字為 tencent 接著cd tencent 用pycharm開啟tencent專案 構建item檔案 coding utf 8 define here the models for yo...
scrapy 爬取流程
什麼時候到pipeline,什麼 時候到spider這個就不說了,這個是框架跳轉到的流程 關鍵是訪問之前要登入怎麼辦,資料還要注入呢 這是個列表,裡面就是爬取的鏈結了 我們前面爬取就只是寫了乙個,但是其實可以寫多個 鏈結又是怎麼訪問的呢 這東西你就可以手動提取鏈結返回了 這東西你就得好好注意了 從入...