First crawl

First, define the fields to crawl:
```python
import scrapy

class CnblogsItem(scrapy.Item):
    # define the fields for your item here, like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
    listurl = scrapy.Field()
```
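Scrapy items behave like dictionaries, which is what the pipeline further down relies on when it calls `dict(item)`. A quick illustration (the field values here are made up):

```python
item = CnblogsItem()
item['title'] = 'How do I merge two dicts?'  # hypothetical value
item['listurl'] = 'page-1'                   # hypothetical value
print(dict(item))  # {'title': 'How do I merge two dicts?', 'listurl': 'page-1'}
```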
Writing the spider:

```python
from scrapy.selector import Selector
from scrapy.utils.response import get_base_url

# Note: the ? in the URL must be escaped, since allow() takes a regex.
# Rule and sle (a LinkExtractor) come from the spider module; see the sketch below.
rules = [
    Rule(sle(allow=('/t/python\?type=newest&page=\d',)), follow=True, callback='parse_item1'),
]

def parse_item1(self, response):
    sel = Selector(response)
    items = []
    base_url = get_base_url(response)
    # all question entries on the current page
    posttitle = sel.css('div.tab-content').css('section')
    postcon = sel.css('div.postcon div.c_b_p_desc')  # (unused in this version)
    # The title/url/desc extraction is loosely structured and can be improved later
    for index in range(len(posttitle)):
        item = CnblogsItem()
        # question title
        item['title'] = posttitle[index].css('a').xpath('text()').extract()[0]
        # item['link'] = '' + posttitle[index].css('a').xpath('@href').extract()[0]  # link to the asker's profile page
        # link to the question page
        item['link'] = '' + posttitle[index].css('h2.title').css('a').xpath('@href').extract()[0]
        # the listing page currently being crawled
        item['listurl'] = base_url
        item['desc'] = posttitle[index].css('div.answers').xpath('text()').extract()[0]
        # print base_url + "********\n"
        items.append(item)
    return items
```
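The `rules` list and `parse_item1` above belong inside a CrawlSpider subclass, which the post does not show. A minimal sketch of that surrounding class, assuming a recent Scrapy layout, the stock LinkExtractor aliased as `sle`, and a placeholder domain (the original elides the site prefix):

```python
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor as sle

class CnblogsSpider(CrawlSpider):
    name = 'cnblogs'                    # hypothetical spider name
    allowed_domains = ['example.com']   # placeholder domain
    start_urls = ['https://example.com/t/python?type=newest&page=1']  # placeholder

    # a raw string avoids invalid-escape warnings for \? and \d
    rules = [
        Rule(sle(allow=(r'/t/python\?type=newest&page=\d',)),
             follow=True, callback='parse_item1'),
    ]

    # parse_item1 from the listing above goes here
```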
Writing the pipeline (pipelines.py):

```python
#!/usr/bin/python
# coding: utf-8
'''author: fiz
date: 2016-03-31
'''
import pymongo
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log


class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.MongoClient()
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not item[data]:  # drop items that have an empty field
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg("Question added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        return item
```

In the end, 3,456 records were crawled; the full source code is at …
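The pipeline reads `MONGODB_DB` and `MONGODB_COLLECTION` from the project settings, and Scrapy only runs it if it is registered in `ITEM_PIPELINES`. A minimal settings.py sketch, assuming the project module is named `cnblogs`; the database and collection names below are illustrative, not from the original post:

```python
# settings.py -- names are placeholders
MONGODB_DB = 'segmentfault'       # database the pipeline writes into
MONGODB_COLLECTION = 'questions'  # one MongoDB document per scraped item

ITEM_PIPELINES = {
    'cnblogs.pipelines.MongoDBPipeline': 300,  # register the pipeline
}
```

With this in place, the crawl is started with `scrapy crawl <spider-name>` from the project directory.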
![](https://pic.w3help.cc/25c/60a394370bf355b4c687892d136b5.jpeg)