pip install scrapy_redis
scrapy-redis提供了兩種爬蟲
from scrapy_redis.spiders import RedisSpider


class MySpider(RedisSpider):
    """Spider that reads urls from redis queue (myspider:start_urls).

    An optional ``domain`` keyword argument (a comma-separated string) is
    consumed by the constructor and turned into ``allowed_domains``.
    """

    name = 'myspider_redis'
    redis_key = 'myspider:start_urls'

    def __init__(self, *args, **kwargs):
        """Build ``allowed_domains`` from ``domain``, then defer to the base class.

        Args:
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: Forwarded to the parent constructor after ``domain``
                has been removed. ``domain`` defaults to an empty string.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        # filter(None, ...) drops the empty strings produced by '' or by
        # stray commas; list() materializes it so the attribute can be
        # iterated more than once on Python 3.
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(MySpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        """Placeholder callback: does nothing with ``response`` and returns None."""
        return
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from redis queue (mycrawler:start_urls).

    An optional ``domain`` keyword argument (a comma-separated string) is
    consumed by the constructor and turned into ``allowed_domains``.
    """

    name = 'mycrawler_redis'
    redis_key = 'mycrawler:start_urls'

    rules = (
        # Follow all links.
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Build ``allowed_domains`` from ``domain``, then defer to the base class.

        Args:
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: Forwarded to the parent constructor after ``domain``
                has been removed. ``domain`` defaults to an empty string.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        # filter(None, ...) drops the empty strings produced by '' or by
        # stray commas; list() materializes it so the attribute can be
        # iterated more than once on Python 3.
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(MyCrawler, self).__init__(*args, **kwargs)

    def parse_page(self, response):
        """Placeholder callback named by ``rules``: returns None."""
        return
# Use the item pipeline that ships with the scrapy-redis component.
# NOTE(review): the original value was lost in this copy of the text; the
# entry below is the pipeline path documented by scrapy-redis. Confirm the
# priority number against the rest of the project's pipelines.
ITEM_PIPELINES = {
    "scrapy_redis.pipelines.RedisPipeline": 300,
}

# Use the scrapy-redis duplicate-request filter.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Use the scheduler provided by scrapy-redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Whether pausing/resuming a crawl is allowed.
SCHEDULER_PERSIST = True

# Placeholder: replace this string with the Redis server's IP address.
REDIS_HOST = 'redis服務的ip位址'
REDIS_PORT = 6379
REDIS_ENCODING = 'utf-8'

# NOTE(review): the original value was lost in this copy of the text. An
# empty dict adds no extra connection parameters; put options such as a
# password here if the Redis server needs them.
REDIS_PARAMS = {}
4. 開啟redis-server 和 redis-cli
5. scrapy runspider myspider.py 開啟分布式爬蟲
6. 向排程器中扔入一個起始url:lpush redis_key url
scrapy redis部署總結
1. 安裝:安裝scrapy-redis包,開啟cmd工具,執行命令 pip install scrapy-redis;準備好要部署的爬蟲專案;準備好redis伺服器還有跟程式相關的mysql資料庫;開啟redis,連線上伺服器。 2. 部署:修改爬蟲專案的settings檔案;修改spiders爬蟲檔案 ...
scrapy redis功能簡介
connection:連線redis最基本檔案;defaults:預設值設定檔案(dupefilter key 儲存指紋);dupefilter:替換scrapy預設的url去重器;picklecompat:序列化;pipelines:將item儲存到redis中,實現item分布式儲存;queue:實現3...
scrapy redis的安裝部署
先說下自己的環境:redis是部署在centos上的,爬蟲執行在windows上。 1. 安裝redis:yum install -y redis 2. 修改配置檔案:vi /etc/redis.conf,將 protected-mode no 解注釋,否則的話,在不設定密碼情況下遠端無法連線redis 3. 重啟r...