使用Scrapy框架爬取58同城的出租房資訊

2021-09-13 01:31:25 字數 3016 閱讀 4998

from traceback import format_exc

from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
from scrapy.exceptions import DropItem

# NOTE: `from scrapy.conf import settings` was removed — the scrapy.conf module
# no longer exists in modern Scrapy; settings are read via crawler.settings
# inside from_crawler instead.
from .items import city58xiaoqu, city58itemchuzuinfo

"""Scrapy item pipelines for the 58.com (58同城) rental-listing spider."""

class city58pipeline(object):
    """Persist neighbourhood and rental items into MongoDB via upserts.

    Two collections are used:
      * ``city58_info``       — neighbourhood (小區) items, unique on ``id``
      * ``city58_chuzu_info`` — rental (出租) items, unique on ``url``
    """

    def __init__(self, mongo_uri, mongo_db):
        # Connection is deferred to open_spider so the pipeline can be
        # constructed cheaply in from_crawler.
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build the pipeline from crawler settings."""
        return cls(
            mongo_uri=crawler.settings.get('mongodb_uri'),
            # Fixed: read from crawler.settings instead of the removed
            # scrapy.conf.settings global.
            mongo_db=crawler.settings.get('mongodb_database', 'items'),
        )

    def open_spider(self, spider):
        """Connect to MongoDB and ensure the unique indexes exist."""
        _ = spider
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        # Unique indexes let repeated crawls upsert instead of duplicating
        # rows (ensure_index is deprecated; create_index is idempotent).
        self.db['city58_info'].create_index('id', unique=True)
        self.db['city58_chuzu_info'].create_index('url', unique=True)

    def close_spider(self, spider):
        _ = spider
        self.client.close()

    def process_item(self, item, spider):
        """Route each item to its collection; upsert keyed on id / url."""
        try:
            if isinstance(item, city58xiaoqu):  # neighbourhood item
                # Upsert: update the row matching this id, insert if absent.
                self.db['city58_info'].update_one(
                    {'id': item['id']}, {'$set': dict(item)}, upsert=True)
            elif isinstance(item, city58itemchuzuinfo):  # rental item
                try:
                    # Average price per square metre computed earlier in the
                    # pipeline chain by handlefangjiapipline.
                    fangjia = handlefangjiapipline.price_per_square_meter_dict[item['id']]
                    # del handlefangjiapipline.price_per_square_meter_dict[item['id']]
                    item['price_pre'] = fangjia
                    # Upsert: update the row matching this url, insert if absent.
                    self.db['city58_chuzu_info'].update_one(
                        {'url': item['url']}, {'$set': dict(item)}, upsert=True)
                except Exception as e:
                    print(e)
        except DuplicateKeyError:
            spider.logger.debug(' duplicate key error collection')  # unique-key conflict
        except Exception as e:
            _ = e
            spider.logger.error(format_exc())
        return item

class handlezufangpipline(object):
    """Compute per-square-metre rent for rental items that carry an area."""

    def process_item(self, item, spider):
        _ = spider, self
        # self.db[self.collection_name].insert_one(dict(item))
        # Only rental items (city58itemchuzuinfo) that include a 'mianji'
        # (area) field can be priced.
        if isinstance(item, city58itemchuzuinfo) and 'mianji' in item:
            # Monthly rent divided by area -> average price per square metre.
            item['chuzu_price_pre'] = int(item['zu_price']) / int(item['mianji'])
        return item

class handlefangjiapipline(object):
    """Accumulate the average listing price per square metre per neighbourhood.

    Price items arrive as plain dicts with an ``id`` and a ``price_list``;
    their mean is cached in the class-level dict (read later by
    city58pipeline) and the item itself is dropped from the pipeline.
    """

    # Shared cache: neighbourhood id -> mean price per square metre.
    price_per_square_meter_dict = dict()

    def process_item(self, item, spider):
        _ = spider
        # Only plain dicts carrying a 'price_list' are price items.
        if isinstance(item, dict) and 'price_list' in item:
            item['price_list'] = [int(i) for i in item['price_list']]
            if item['price_list']:
                # Mean listing price for this neighbourhood.
                self.price_per_square_meter_dict[item['id']] = (
                    sum(item['price_list']) / len(item['price_list']))
            else:
                # No prices scraped: record 0 rather than dividing by zero.
                self.price_per_square_meter_dict[item['id']] = 0
            # The price item is fully consumed here; keep it out of the
            # downstream pipelines.
            raise DropItem()
        return item

使用Scrapy框架爬取鏈家資料

coding utf 8 import scrapy from pachong6.items import pachong6item class lianjiaspider scrapy.spider name lianjia allowed domains m.lianjia.com start ...

scrapy框架全站資料爬取

每個網站都有很多頁碼,將網站中某板塊下的全部頁碼對應的頁面資料進行爬取 實現方式有兩種 1 將所有頁面的url新增到start urls列表 不推薦 2 自行手動進行請求傳送 推薦 yield scrapy.request url,callback callback專門用做於資料解析 下面我們介紹第二種...

使用scrapy框架爬取資料並存入excel表中

爬取 爬取目標 獲得乙個地區七天之內的天氣狀況,並存入excel 中 爬蟲檔案部分 import scrapy from items import tianqiyubaoitem class tianqispider scrapy.spider name tianqi allowed domains...