# -*- coding: utf-8 -*-
import scrapy
class txhritem(scrapy.Item):
    """Container for one scraped job posting.

    Each attribute is declared as a ``scrapy.Field`` so that it can be set
    with dictionary-style access (``item['positionname'] = ...``).
    """

    # Text of the link in the first column of a listing row.
    positionname = scrapy.Field()
    # Text of the second column; the spider stores "null" when it is empty.
    positiontype = scrapy.Field()
    # Text of the third column.
    neednum = scrapy.Field()
    # Text of the fourth column.
    workingspace = scrapy.Field()
    # Text of the fifth column.
    publishtime = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy
from txhr.items import txhritem
class txhrspiderspider(scrapy.Spider):
    """Spider that walks a paginated job listing and yields one item per row.

    Pages are addressed by appending a numeric offset (``bias``) to
    ``initialurl``; the offset advances by 10 after each page until it
    reaches 1000.
    """

    name = 'txhr'
    allowed_domains = ['tencent.com']
    # NOTE(review): the base URL is empty here, so the first request URL is
    # just "0". Presumably the real listing URL was stripped -- confirm and
    # restore it before running the spider.
    initialurl = ''
    bias = 0
    url = initialurl + str(bias)
    start_urls = [url]

    def parse(self, response):
        """Extract every listing row from ``response`` and queue the next page.

        Yields a ``txhritem`` for each ``<tr>`` whose class is ``even`` or
        ``odd``, then (while the offset is below 1000) a request for the
        following page handled by this same callback.
        """
        # Listing rows alternate between the "even" and "odd" CSS classes.
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = txhritem()
            item['positionname'] = each.xpath("td[1]/a/text()").extract()[0]
            # The second column may be empty; fall back to the literal
            # string "null" instead of indexing into an empty list.
            test_null = each.xpath("td[2]/text()").extract()
            item['positiontype'] = test_null[0] if test_null else "null"
            item['neednum'] = each.xpath("td[3]/text()").extract()[0]
            item['workingspace'] = each.xpath("td[4]/text()").extract()[0]
            item['publishtime'] = each.xpath("td[5]/text()").extract()[0]
            yield item

        # Advance to the next page once all rows of this page are emitted.
        self.bias += 10
        if self.bias < 1000:
            url = self.initialurl + str(self.bias)
            yield scrapy.Request(url, callback=self.parse)
# -*- coding: utf-8 -*-
import json
class txhrpipeline(object):
    """Item pipeline that appends every item to ``recruit.json``.

    The output holds one JSON object per line. The file is opened when the
    pipeline is constructed and closed in ``close_spider``.
    """

    def __init__(self):
        # Explicit UTF-8: items are dumped with ensure_ascii=False, so the
        # text may contain non-ASCII characters that the platform default
        # encoding cannot represent.
        self.output = open("recruit.json", 'w', encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize ``item`` as one JSON line, write it, and return the item.

        ``spider`` is accepted for the pipeline interface and is not used.
        """
        # Convert the scraped item to a dict first, then to a JSON object.
        jsontext = json.dumps(obj=dict(item), ensure_ascii=False) + '\n'
        self.output.write(jsontext)
        return item

    def close_spider(self, spider=None):
        """Close the output file.

        ``spider`` defaults to ``None`` so that existing zero-argument calls
        keep working while Scrapy's ``close_spider(spider)`` call also fits.
        """
        self.output.close()
# Scrapy project settings. Scrapy only loads upper-case names from a
# settings module, so the names are written in upper case.
BOT_NAME = 'txhr'

SPIDER_MODULES = ['txhr.spiders']
NEWSPIDER_MODULE = 'txhr.spiders'

# Respect robots.txt rules of the crawled site.
ROBOTSTXT_OBEY = True

# NOTE(review): the original value was missing. These are Scrapy's documented
# default request headers -- replace with the project's intended headers.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# NOTE(review): the original value was missing. The dotted path assumes the
# pipeline class lives in txhr/pipelines.py -- confirm the module path and
# the priority value.
ITEM_PIPELINES = {
    'txhr.pipelines.txhrpipeline': 300,
}
