Install Scrapy
pip install scrapy
Create a new project
(python36) e:\www>scrapy startproject filedownload
New Scrapy project 'filedownload', using template directory 'c:\users\brady\.conda\envs\python36\lib\site-packages\scrapy\templates\project', created in:
    e:\www\filedownload

You can start your first spider with:
    cd filedownload
    scrapy genspider example example.com

(python36) e:\www>
Edit the spider to extract the content
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from filedownload.items import FiledownloadItem


class PexelsSpider(CrawlSpider):
    name = 'pexels'
    allowed_domains = ['www.pexels.com']
    start_urls = ['']  # fill in the site's entry URL

    rules = (
        Rule(LinkExtractor(allow=r'/photo/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response.url)
        # collect every image URL on the page that points at a photo
        url = response.xpath("//img[contains(@src,'photos')]/@src").extract()
        item = FiledownloadItem()
        try:
            item['file_urls'] = url
            print("crawled list: " + str(url))
            yield item
        except Exception as e:
            print(str(e))
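Once the item and settings described below are in place, the crawl can be started from the project directory; assuming the spider name 'pexels' defined above:

scrapy crawl pexels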
Configure the item
import scrapy


class FiledownloadItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    file_urls = scrapy.Field()
settings.py
Enable the built-in files pipeline:
'scrapy.pipelines.files.FilesPipeline': 2,  # files pipeline entry inside ITEM_PIPELINES
FILES_STORE = ''  # storage path
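Put together, a minimal settings.py fragment could look like the sketch below; the storage path is only a placeholder, not taken from the original post.

# settings.py -- minimal sketch for enabling the built-in files pipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 2,
}
FILES_STORE = 'e:/www/files'  # placeholder: downloaded files are saved under this directory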
In the item:
file_urls = scrapy.Field()
files = scrapy.Field()
In the spider, pass the collected URLs to the pipeline through the file_urls field.
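For reference, after the files pipeline has downloaded everything it writes the results back into the files field. The sketch below shows the rough shape of a finished item; the URL, path and checksum values are purely illustrative.

# Illustrative only: shape of an item after the files pipeline has run.
# 'file_urls' is what the spider filled in; 'files' is added by the pipeline.
item = {
    'file_urls': ['https://example.com/photos/123.jpg'],  # hypothetical URL
    'files': [{
        'url': 'https://example.com/photos/123.jpg',
        'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',  # relative to FILES_STORE
        'checksum': '2d48b4...',  # truncated for the example
    }],
}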
Override the files pipeline so files keep their original names
In pipelines.py, add your own pipeline classes, inheriting from the built-in files pipeline:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See the Scrapy documentation on item pipelines
from scrapy.pipelines.files import FilesPipeline


class FiledownloadPipeline(object):
    def process_item(self, item, spider):
        # strip the query string from each URL so only the bare image URL is downloaded
        tmp = item['file_urls']
        item['file_urls'] = []
        for i in tmp:
            if "?" in i:
                item['file_urls'].append(i.split("?")[0])
            else:
                item['file_urls'].append(i)
        print(item)
        return item


class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        # use the last segment of the URL as the file name, keeping the original name
        file_path = request.url
        file_path = file_path.split('/')[-1]
        return 'full/%s' % file_path
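Note that the string returned by file_path is interpreted relative to FILES_STORE, so with this override an image fetched from .../photos/abc.jpg is saved as FILES_STORE/full/abc.jpg instead of the SHA1-based name the stock FilesPipeline would generate.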
In settings.py, switch ITEM_PIPELINES to your own pipelines:
ITEM_PIPELINES =
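The value of ITEM_PIPELINES is not shown above; a plausible configuration, with the URL-cleaning pipeline running before the customised download pipeline, would be something like:

# settings.py -- sketch only; the class paths assume the project layout shown above
ITEM_PIPELINES = {
    'filedownload.pipelines.FiledownloadPipeline': 1,  # clean the URLs first
    'filedownload.pipelines.MyFilesPipeline': 2,       # then download, keeping original names
}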
Fetching a complete photo set
# -*- coding: utf-8 -*-
from time import sleep

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class AngelSpider(CrawlSpider):
    name = 'angel'
    allowed_domains = ['angelimg.spbeen.com']
    start_urls = ['']  # fill in the site's entry URL
    base_url = ""      # fill in the site's base URL, used to build absolute "next page" links

    rules = (
        Rule(LinkExtractor(allow=r'^ang/\d+$'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # reuse the item carried over from the previous page, or start a new one
        item = response.meta.get('item', False)
        if item:
            pass
        else:
            item = {}
            item['files'] = []
            item['file_urls'] = []
        print(response.url)
        img_url = response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
        if img_url:
            item['file_urls'].append(img_url)
        next_url = response.xpath('.//div[@class="page"]//a[contains(@class,"next")]/@href').extract_first()
        if next_url:
            # follow the "next" link and carry the accumulating item along in meta
            next_url = self.base_url + next_url
            yield scrapy.Request(next_url, callback=self.parse_item, meta={'item': item})
        else:
            print(item)
            yield item

    def parse_next_response(self, response):
        item = response.meta.get('item')
        print(item, response.url)
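As with the first spider, the crawl is launched from the project directory with:

scrapy crawl angel

Because each page yields a new Request that carries the same item in meta, the image URLs of a whole set accumulate in a single item, which is only yielded on the last page; the files pipeline then downloads the complete set in one go.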
GitHub address: