利用pyspider爬取boss職位資訊
pyspider初步使用
#!/usr/bin/env python
#-*- encoding: utf-8 -*-
#created on 2019-04-23 16:01:48
#project: boss
import random
import sys
import time

from pyspider.libs.base_handler import *
from pyspider.libs.webrequest import *
from pyspider.database.mysql.mysqldb import sql
# Python 2-only hack: site.py removes sys.setdefaultencoding at startup;
# reload(sys) restores it so non-ASCII job text survives implicit
# str/unicode conversions. Guarded so the module still imports on Python 3,
# where UTF-8 is already the default and reload() is not a builtin.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
class handler(BaseHandler):
    """Crawl Boss Zhipin (zhipin.com) job listings and store them in MySQL.

    Flow: on_start seeds one search URL per (city, position, experience
    bucket) -> index_page fans out to result pages 1-10 -> detail_page
    parses each page into job dicts -> on_result inserts them into the
    ``boss_original`` table via the project's ``sql`` helper.

    NOTE(review): recovered from a lossy scrape — identifiers were
    lower-cased and several right-hand sides were dropped. Reconstructed
    values are marked with TODOs; confirm against the original project.
    """

    # Position keywords to search for ("big data").
    position_list = ['大資料']
    # City codes: Beijing, Shanghai, Guangzhou, Shenzhen, Chengdu
    # ['c101010100','c101020100','c101280100','c101280600','c101270100']
    city_list = ['c101270100']
    # Proxy pool, fetched once at class-definition time.
    proxy = get_home_proxy()
    # TODO(review): original RHS lost in extraction; pyspider expects a
    # dict here (headers, itag, ...). Empty dict keeps defaults.
    crawl_config = {}

    @every(minutes=72 * 60)  # re-run the whole crawl every 3 days
    def on_start(self):
        """Seed one search URL per (city, position, experience bucket)."""
        for city in self.city_list:
            for position in self.position_list:
                # 102..107 are Boss Zhipin's experience-filter codes.
                for work_year in range(102, 108):
                    # TODO(review): URL template lost in extraction —
                    # restore the zhipin.com search URL, e.g.
                    # 'https://www.zhipin.com/{city}-e_{workyear}/?query={position}'
                    crawl_url = ''.format(city=city, workyear=work_year,
                                          position=position)
                    time.sleep(1)  # crude politeness delay
                    self.crawl(url=crawl_url, callback=self.index_page,
                               proxy=random.choice(self.proxy))

    def index_page(self, response):
        """Fan out one search URL to its first 10 result pages."""
        base_url = response.url
        for page_num in range(1, 11):
            page_url = base_url + "&page=" + str(page_num)
            time.sleep(1)  # crude politeness delay
            self.crawl(url=page_url, callback=self.detail_page,
                       proxy=random.choice(self.proxy))

    @config(priority=2)
    def detail_page(self, response):
        """Parse one result page; return a list of job dicts."""
        page = response.etree
        job_list = []
        # One <li> per job posting.
        content_list = page.xpath("//div[@class='job-list']/ul/li")
        for each in content_list:
            # Position name.
            position_name = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/h3[@class='name']/a/div[@class='job-title']/text()")[0]
            # Salary.
            salary = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/h3[@class='name']/a/span/text()")[0]
            # City / experience / education share one <p> element.
            primary_texts = each.xpath("./div[@class='job-primary']/div[@class='info-primary']/p//text()")
            city = primary_texts[0]
            experience = primary_texts[1]
            education = primary_texts[2]
            # Company name.
            company = each.xpath("./div[@class='job-primary']/div[@class='info-company']/div[@class='company-text']/h3[@class='name']/a/text()")[0]
            # Industry / funding round / scale — funding round is absent
            # for some companies, leaving only 2 text nodes.
            company_texts = each.xpath("./div[@class='job-primary']/div[@class='info-company']/div[@class='company-text']/p//text()")
            if len(company_texts) == 3:
                industry_field, rounds, scale = company_texts
            else:
                industry_field = company_texts[0]
                rounds = ''
                scale = company_texts[1]
            # TODO(review): dict literal lost in extraction — keys must
            # match the columns of the `boss_original` table.
            job = {
                'position_name': position_name,
                'salary': salary,
                'city': city,
                'experience': experience,
                'education': education,
                'company': company,
                'industry_field': industry_field,
                'rounds': rounds,
                'scale': scale,
            }
            job_list.append(job)
        return job_list

    def on_result(self, result):
        """Insert each parsed job dict into the `boss_original` table."""
        if not result:
            return
        # Named `db`, not `sql`, to avoid shadowing the imported class.
        db = sql()
        for res in result:
            db.insert('boss_original', **res)
利用Ajax實現資料爬取 爬取微博主頁
有時候我們在利用requests抓取網頁時,得到的結果可能與在瀏覽器中看到的結果不一樣:在瀏覽器中能看到的東西,在爬取下來的網頁源碼中卻看不到。這是因為requests獲取的都是最原始的html文件,而瀏覽器中的頁面則是經過js處理資料得到的結果,這些資料的來源有很多種,第一種方式是通過Ajax,...
利用正則爬取貓眼電影
爬取貓眼電影 import json import requests from requests.exceptions import RequestException import re def get_one_page(url) 獲取一個頁面的資訊 try proxies get random ip ...
利用python爬取電影資源
4.爬蟲的用途 5.貓眼電影資源爬取 寫程式,然後去網際網路上抓取資料的過程。自動的批量的採集我們需要的資源 網頁請求 資料分析 結果儲存 1 網頁都有自己唯一的url 2 網頁內容都是html結構的 3 使用的都是http https協議 1 給乙個url 2 寫程式,模擬瀏覽器訪問url 3 解...