Enter in the DOS window:
scrapy startproject tencent
cd tencent
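For reference, startproject lays out the standard Scrapy skeleton; the files edited in the rest of this post (items.py, the spider under spiders/, pipelines.py, settings.py) all live inside it. Roughly (exact contents can vary a little between Scrapy versions):
tencent/
    scrapy.cfg
    tencent/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py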
items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # job title
    positionname = scrapy.Field()
    # link
    positionlink = scrapy.Field()
    # category
    positiontype = scrapy.Field()
    # number of openings
    positionnum = scrapy.Field()
    # work location
    positioncation = scrapy.Field()
    # publish time
    positiontime = scrapy.Field()

Next, generate the spider in the DOS window:
scrapy genspider myspider tencent.com
myspider.py:
# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem


class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['tencent.com']
    url = ''
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        for each in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
            # initialize the item object
            item = TencentItem()
            # job title
            item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
            # link
            item['positionlink'] = '' + each.xpath("./td[1]/a/@href").extract()[0]
            # category
            item['positiontype'] = each.xpath("./td[2]/text()").extract()[0]
            # number of openings
            item['positionnum'] = each.xpath("./td[3]/text()").extract()[0]
            # work location
            item['positioncation'] = each.xpath("./td[4]/text()").extract()[0]
            # publish time
            item['positiontime'] = each.xpath("./td[5]/text()").extract()[0]
            yield item

        # page through the listing 10 rows at a time until the last offset
        if self.offset < 2820:
            self.offset += 10
        else:
            raise Exception("crawl finished")
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
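Before running the full crawl, the XPath expressions above can be sanity-checked interactively with scrapy shell; a quick sketch (the listing-page URL is a placeholder here, just as the url attribute is left blank in the spider):
scrapy shell "<listing page url>"
>>> response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
>>> response.xpath('//tr[@class="even"]/td[1]/a/text()').extract_first()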
pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See:
import json


class TencentPipeline(object):
    def __init__(self):
        self.filename = open('tencent.json', 'wb')

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.filename.write(text.encode('utf-8'))
        return item

    def close_spider(self, spider):
        self.filename.close()
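Opening the output file in __init__ works, but item pipelines also get open_spider/close_spider hooks that Scrapy calls when the spider starts and stops, so the file stays open only for the duration of the crawl. A minimal sketch of the same pipeline written that way (same tencent.json filename assumed):
import json


class TencentPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.filename = open('tencent.json', 'wb')

    def process_item(self, item, spider):
        # one JSON object per line, keeping non-ASCII characters readable
        text = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.filename.write(text.encode('utf-8'))
        return item

    def close_spider(self, spider):
        # called once when the spider finishes
        self.filename.close()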
settings.py: disable the robots protocol, set the default request headers, and register the pipeline:
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS =

ITEM_PIPELINES =
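The two dictionary values are left blank above; a minimal sketch of what they might look like for this project is below. The User-Agent string is only an illustrative example, and the pipeline path assumes the TencentPipeline class defined earlier (300 is just a conventional priority):
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}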
Enter in the DOS window:
scrapy crawl myspider
Execution result:
Check the debug output:
2019-02-18 16:02:22 [scrapy.core.scraper] ERROR: Spider error processing (referer: 510)
Traceback (most recent call last):
  File "e:\software\anaconda\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
    yield next(it)
  File "e:\software\anaconda\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output
    for x in result:
  File "e:\software\anaconda\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in return (_set_referer(r) for r in result or ())
  File "e:\software\anaconda\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in return (r for r in result or () if _filter(r))
  File "e:\software\anaconda\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in return (r for r in result or () if _filter(r))
  File "c:\users\123\tencent\tencent\spiders\myspider.py", line 22, in parse
    item['positiontype'] = each.xpath("./td[2]/text()").extract()[0]
Go and check the web page: this particular posting is missing one field - -!!! (so many traps out there!)
So change one line in myspider.py:
item['positiontype'] = each.xpath("./td[2]/text()").extract()[0]
Add a check and change it to:
if len(each.xpath("./td[2]/text()").extract()) > 0:
    item['positiontype'] = each.xpath("./td[2]/text()").extract()[0]
else:
    item['positiontype'] = "none"
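An equivalent, slightly more compact guard is the selector's extract_first() method (aliased as get() in newer Scrapy versions), which takes a default value for the empty case; a sketch:
item['positiontype'] = each.xpath("./td[2]/text()").extract_first(default="none")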
Execution result:
Check the last page on the site:
The crawl succeeded!
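As a side note, the hand-written JSON pipeline could also be replaced by Scrapy's built-in feed export, which writes the scraped items straight from the command line (the output is then a single JSON array rather than comma-separated lines):
scrapy crawl myspider -o tencent.json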