We crawl the job information for every company on the page: position, office location, work experience, and so on. That is the 11 fields shown in the figure above plus the jobid used by Boss直聘, for 12 fields in total.
Open the Scrapy shell (the listing-page URL goes after the command; it was omitted in the original post):
scrapy shell
view(response)
# a 403 page comes back
# try again, passing browser-style headers along with the request
from scrapy import Request
req = Request(url, headers=headers)  # url: the listing page; headers: a dict of browser headers (example below)
fetch(req)
view(response)
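The headers dict itself is not shown in the post; any realistic set of browser headers should do. A minimal sketch, where the concrete values are assumptions rather than taken from the original:
headers = {
    # a browser User-Agent string; the exact value is an assumption
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'Referer': 'https://www.zhipin.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
If view(response) now shows the job listing instead of the 403 page, these headers are enough.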
Write items.py and define the Item
import scrapy

class WwwZhipinComItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    job_id = scrapy.Field()
    positionname = scrapy.Field()
    salary = scrapy.Field()
    city = scrapy.Field()
    workyear = scrapy.Field()
    education = scrapy.Field()
    companyname = scrapy.Field()
    industryfield = scrapy.Field()
    financestage = scrapy.Field()
    companysize = scrapy.Field()
    name = scrapy.Field()
    recruiter = scrapy.Field()
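An Item behaves like a dict, except that assigning to a key that was not declared as a Field raises a KeyError, which catches typos early in the spider. A quick check (the example value is made up):
item = WwwZhipinComItem()
item['positionname'] = 'Python developer'  # fine: the field is declared above
print(dict(item))                          # {'positionname': 'Python developer'}
item['position'] = 'x'                     # KeyError: field not declared in the Item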
Write the spider
import scrapy
from www_zhipin_com.items import WwwZhipinComItem
import time

class ZhipinSpider(scrapy.Spider):
    name = 'zhipin'
    allowed_domains = ['www.zhipin.com']
    start_urls = ['']   # the listing URL was omitted in the original post
    positionurl = ''    # likewise: the search/listing endpoint used by next_request
    curpage = 1

    # send our own headers so the request looks like it comes from a browser
    def start_requests(self):
        return [self.next_request()]

    def parse(self, response):
        print("request -> " + response.url)
        job_list = response.css('div.job-list > ul > li')
        for job in job_list:
            item = WwwZhipinComItem()
            job_primary = job.css('div.job-primary')
            item['job_id'] = job.xpath('.//a/@data-jobid').extract_first().strip()
            item['city'] = job_primary.xpath('./div/p/text()').extract_first().strip()
            item['workyear'] = job_primary.xpath('./div/p/text()[2]').extract_first().strip()
            item['education'] = job_primary.xpath('./div/p/text()[3]').extract_first().strip()
            item['positionname'] = job_primary.xpath('./div[1]/h3/a/div[1]/text()').extract_first()
            item['salary'] = job_primary.xpath('./div[1]/h3/a/span/text()').extract_first()
            item['companyname'] = job_primary.css('div.info-company > div.company-text > h3 > a::text').extract_first().strip()
            company_infos = job_primary.xpath('./div[2]/div/p/text()').extract()
            # devtools selector for reference: main > div > div.job-list > ul > li:nth-child(1)
            if len(company_infos) == 3:  # one listing has only two entries here, so guard the indexing
                item['industryfield'] = company_infos[0].strip()
                item['financestage'] = company_infos[1].strip()
                item['companysize'] = company_infos[2].strip()
            item['name'] = job_primary.xpath('./div[3]/h3/text()[1]').extract()
            item['recruiter'] = job_primary.xpath('./div[3]/h3/text()[2]').extract()
            yield item
        self.curpage += 1
        time.sleep(5)
        yield self.next_request()

    # send the request (next_request is referenced above; see the sketch below)
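Both start_requests and parse call self.next_request(), but its body did not survive in the post. A minimal sketch of what it could look like, assuming a headers dict on the class (as the comment above start_requests suggests) and assuming the page number is passed as a page query parameter; the parameter name and the headers attribute are assumptions, not taken from the original code:
    # assumed attribute: self.headers, a dict of browser-style headers (see the shell example earlier)
    def next_request(self):
        # build the URL for the current page; the 'page' parameter name is an assumption
        url = '{}?page={}'.format(self.positionurl, self.curpage)
        return scrapy.Request(url, headers=self.headers, callback=self.parse)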
Configure settings.py
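The post does not show which settings it changed. A minimal sketch of settings.py entries consistent with the approach above (ignore robots.txt, slow the crawl, send browser headers by default); every concrete value here is an assumption:
# settings.py (sketch; values are assumptions, not taken from the original post)
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 5   # matches the 5-second pause used in the spider
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
With these in place the spider can be run with, for example, scrapy crawl zhipin -o jobs.json.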
References:
使用scrapy框架爬boss直聘
爬取boss直聘招聘資訊
BOSS直聘招聘資訊爬取