from scrapy.exceptions import DropItem
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
from traceback import format_exc
from .items import City58Xiaoqu, City58ItemChuzuInfo
class City58Pipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGODB_URI'),
            mongo_db=crawler.settings.get('MONGODB_DATABASE', 'items')
        )

    def open_spider(self, spider):
        _ = spider
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        self.db['city58_info'].create_index('id', unique=True)  # build a unique index on 'id' in the city58_info collection
        self.db['city58_chuzu_info'].create_index('url', unique=True)  # build a unique index on 'url' in the city58_chuzu_info collection
    def close_spider(self, spider):
        _ = spider
        self.client.close()

    def process_item(self, item, spider):
        try:
            if isinstance(item, City58Xiaoqu):  # is this a neighbourhood (xiaoqu) item?
                # match on 'id': update the document if it exists, insert it otherwise
                self.db['city58_info'].update_one({'id': item['id']}, {'$set': dict(item)}, upsert=True)
            elif isinstance(item, City58ItemChuzuInfo):  # is this a rental-listing item?
                try:
                    # fetch this neighbourhood's average price per square metre from the
                    # price_per_square_meter_dict built by the HandleFangjiaPipline pipeline
                    fangjia = HandleFangjiaPipline.price_per_square_meter_dict[item['id']]
                    # del HandleFangjiaPipline.price_per_square_meter_dict[item['id']]
                    item['price_pre'] = fangjia
                    # match on 'url': update the document if it exists, insert it otherwise
                    self.db['city58_chuzu_info'].update_one({'url': item['url']}, {'$set': dict(item)}, upsert=True)
                except Exception as e:
                    print(e)
        except DuplicateKeyError:
            spider.logger.debug('duplicate key error collection')  # unique-index conflict
        except Exception as e:
            _ = e
            spider.logger.error(format_exc())
        return item
class HandleZufangPipline(object):
    def process_item(self, item, spider):
        _ = spider, self
        # self.db[self.collection_name].insert_one(dict(item))
        # only handle rental-listing items that carry an area ('mianji') field
        if isinstance(item, City58ItemChuzuInfo) and 'mianji' in item:
            item['chuzu_price_pre'] = int(item['zu_price']) / int(item['mianji'])  # rent divided by area gives the rent per square metre
        return item
class HandleFangjiaPipline(object):
    price_per_square_meter_dict = dict()

    def process_item(self, item, spider):
        _ = spider
        # only handle plain dict items that carry a 'price_list' field
        if isinstance(item, dict) and 'price_list' in item:
            item['price_list'] = [int(i) for i in item['price_list']]
            if item['price_list']:
                # average price per square metre for this neighbourhood
                self.price_per_square_meter_dict[item['id']] = sum(item['price_list']) / len(item['price_list'])
            else:
                self.price_per_square_meter_dict[item['id']] = 0
            raise DropItem()  # the price dict itself is not stored, so drop it here
        return item
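
The two item classes imported at the top live in items.py, which the post does not show. The sketch below is an inference from the fields the pipelines actually read and write (id, url, zu_price, mianji, price_pre, chuzu_price_pre); the real definitions may carry more fields.

# items.py -- inferred from the pipeline code; the real definitions may differ
import scrapy

class City58Xiaoqu(scrapy.Item):
    # only 'id' is visible in the pipeline code; the real item carries more fields
    id = scrapy.Field()

class City58ItemChuzuInfo(scrapy.Item):
    id = scrapy.Field()               # id of the xiaoqu this listing belongs to
    url = scrapy.Field()              # listing url, unique key in MongoDB
    zu_price = scrapy.Field()         # monthly rent
    mianji = scrapy.Field()           # floor area in square metres
    price_pre = scrapy.Field()        # neighbourhood average price per square metre
    chuzu_price_pre = scrapy.Field()  # rent per square metre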
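
For these pipelines to run, the MongoDB settings and the pipeline order have to be declared in settings.py. The snippet below is a minimal sketch, not part of the original post: the setting names MONGODB_URI and MONGODB_DATABASE match the keys read in from_crawler, while the module path city58.pipelines and the database name are placeholders for the actual project. HandleFangjiaPipline gets the lowest number so it fills price_per_square_meter_dict before City58Pipeline consumes it.

# settings.py -- a minimal sketch; 'city58' is a placeholder project name
MONGODB_URI = 'mongodb://localhost:27017'
MONGODB_DATABASE = 'city58'

ITEM_PIPELINES = {
    'city58.pipelines.HandleFangjiaPipline': 100,  # computes the per-xiaoqu average price first
    'city58.pipelines.HandleZufangPipline': 200,   # derives the rent per square metre
    'city58.pipelines.City58Pipeline': 300,        # writes to MongoDB last
}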