1. Create the project: scrapy startproject tb
2. cd tb, then generate a spider: scrapy genspider <spider_name> <domain>
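3. Define the item fields in items.py. A minimal sketch, assuming the class name TbItem and taking the field names from what the spider in step 6 fills:

# tb/items.py -- sketch; field names taken from the spider code below
import scrapy

class TbItem(scrapy.Item):
    pinglun = scrapy.Field()   # review text
    xinghao = scrapy.Field()   # product model/variant
    xingming = scrapy.Field()  # reviewer name
    # time = scrapy.Field()    # review date (commented out in the spider)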
4. In pipelines.py, write the storage logic; here I simply write the items out to a local file (sketch below).
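The post does not show the pipeline code itself; here is a minimal sketch, assuming the goal is just appending each item to a local text file. The class name TbPipeline and the output file name are placeholders.

# tb/pipelines.py -- minimal sketch: one JSON object per line
import json

class TbPipeline(object):

    def open_spider(self, spider):
        self.f = open("tianmao_reviews.txt", "a", encoding="utf-8")

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.f.close()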
5. In settings.py, enable the following (the dict values are filled in the sketch below):
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
DOWNLOADER_MIDDLEWARES = { ... }
ITEM_PIPELINES = { ... }
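A sketch of the corresponding settings.py entries. The dotted paths must match the class names in middlewares.py and pipelines.py, and the priorities 543 and 300 are conventional values, not taken from the original post:

# settings.py (sketch; dict values are assumptions)
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
DOWNLOADER_MIDDLEWARES = {
    'tb.middlewares.SeleniumMiddlewares': 543,
}
ITEM_PIPELINES = {
    'tb.pipelines.TbPipeline': 300,
}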
6. In the spider file (tianmao1.py):

import scrapy
from scrapy import Request
import lxml.html
from tb.items import TbItem

class TaobaoSpider(scrapy.Spider):
    name = 'tianmao1'
    # allowed_domains = ['www.tianmao.com', 'detail.tmall.com']

    def start_requests(self):
        base_url = ""  # product detail page URL (left blank in the original post)
        # meta["page"] = 1 tells the Selenium middleware to open the page
        # and click through to the review tab
        yield Request(url=base_url, callback=self.parse, dont_filter=True,
                      meta={"page": 1})

    def parse(self, response):
        tr_list = response.xpath('//div[@class="rate-grid"]/table/tbody/tr').extract()
        for tr in tr_list:
            item = TbItem()  # create a fresh item per review row
            html = lxml.html.fromstring(tr)
            pinglun = html.xpath('//td[@class="tm-col-master"]/div/div[1]/text()')[0]
            xinghao = html.xpath('//td[@class="col-meta"]/div/p/text()')[0]
            xingming = html.xpath('//td[@class="col-author"]/div/text()')[0]
            # time = html.xpath('//td[@class="tm-col-master"]/div[@class="tm-rate-date"]/text()')[0]
            item["pinglun"] = pinglun
            item["xinghao"] = xinghao
            item["xingming"] = xingming
            # item["time"] = time
            yield item
        # the original left this URL blank as well; reusing response.url works
        # because the Selenium middleware, not Scrapy, fetches the next page
        yield Request(url=response.url, callback=self.parse, meta={"page": 2},
                      dont_filter=True)
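Note the handshake between the spider and the middleware: the spider only sets meta["page"], and SeleniumMiddlewares.process_request in step 7 reads it to decide whether to open the product page and click into the review tab (page 1) or to click the next-page button in the already-open browser (page 2).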
7. In middlewares.py:

import time
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class SeleniumMiddlewares(object):

    def __init__(self):
        self.options = Options()
        # self.options.add_argument('-headless')
        self.browser = webdriver.Chrome(
            executable_path=r"f:\第七重新爬蟲\day06\day06全天\ziliao\chromedriver.exe",
            chrome_options=self.options)

    def process_request(self, request, spider):
        if int(request.meta["page"]) == 1:
            self.browser.get(request.url)
            time.sleep(5)
            # scroll down in small steps so the lazy-loaded content renders
            for y in range(10):
                self.browser.execute_script("window.scrollBy(0,220)")
                time.sleep(3)
            # click the review tab
            pages = self.browser.find_element_by_xpath('//li/a[@href="#j_reviews"]')
            pages.click()
            time.sleep(5)
            return HtmlResponse(url=self.browser.current_url,
                                body=self.browser.page_source,
                                request=request, encoding="utf-8")
        if int(request.meta["page"]) == 2:
            for y in range(20):
                self.browser.execute_script("window.scrollBy(0,200)")
                time.sleep(3)
            # the original clicked a `pages` element here without defining it;
            # this XPath for the next-page button is a guess and may need adjusting
            pages = self.browser.find_element_by_xpath(
                '//div[@class="rate-paginator"]//a[contains(text(), "下一页")]')
            self.browser.execute_script("arguments[0].click();", pages)
            # pages.click()
            time.sleep(5)  # wait for the next page of reviews to load
            return HtmlResponse(url=self.browser.current_url,
                                body=self.browser.page_source,
                                request=request, encoding="utf-8")

8. Start the crawl: scrapy crawl tianmao1
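For a quick look at the scraped items, Scrapy's built-in feed export can also dump them to a file (the pipeline still runs alongside it):

scrapy crawl tianmao1 -o reviews.json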