# -*- coding: utf-8 -*-
# epub/spiders/epubdownload.py
import scrapy
from epub.items import EpubItem


class EpubdownloadSpider(scrapy.Spider):
    name = 'epubdownload'
    # allowed_domains = ['']
    # The empty string below is a placeholder: the site's base URL was
    # stripped from the source post and has to be filled in by hand.
    start_urls = ['' + str(i) + '.html/' for i in range(1, 51)]

    def parse(self, response):
        # Collect the links to the per-book epub download pages.
        href = response.xpath(
            '//a[contains(@href,"/d") and contains(@href,"epub_down")]/@href'
        ).extract()
        for i in range(len(href)):
            url = '' + href[i]  # '' is again the missing base URL
            yield scrapy.Request(url=url, callback=self.newparse)

    def newparse(self, response):
        item = EpubItem()
        # The actual download link on the detail page.
        link = response.xpath(
            '//a[contains(@href,"down?id=") and contains(@href,"p=6")]/@href'
        ).extract()
        url = '' + link[0]
        # Drop whatever follows the last '=' in the link but keep the '='.
        parts = url.split('=')
        newurl = '='.join(parts[:len(parts) - 1]) + '='
        nam = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
        # Cut the last five characters (a fixed suffix) off the book title.
        item['name'] = nam[:len(nam) - 5]
        item['down_url'] = newurl
        yield item
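The string handling in newparse only drops whatever follows the last '=' in the download link while keeping the '=' itself. A minimal sketch with a made-up URL, since the real site address is not preserved in the post:

# Hypothetical download link; the real host is not shown in the original post.
link = 'http://example.com/down?id=123&p=6'

parts = link.split('=')                # ['http://example.com/down?id', '123&p', '6']
new_url = '='.join(parts[:-1]) + '='   # drop the trailing value, keep the '='
print(new_url)                         # http://example.com/down?id=123&p=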
# -*- coding: utf-8 -*-
# epub/pipelines.py
#
# Define your item pipelines here.
# Don't forget to add your pipeline to the ITEM_PIPELINES setting.
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
import os
import json


class EpubPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings from settings.py.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Use the item class name as the collection name.
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()


class JsonPipeline(object):
    def process_item(self, item, spider):
        base_dir = os.getcwd()
        filename = base_dir + '/news.json'
        # Open the JSON file and append the item with json.dumps.
        # The parameter ensure_ascii=False is required, otherwise non-ASCII
        # data is written as escaped sequences such as "\xe15".
        with open(filename, 'a') as f:
            line = json.dumps(dict(item), ensure_ascii=False) + '\n'
            f.write(line)
        return item
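A quick way to check what the two pipelines actually stored, assuming the MongoDB values from the settings below and that the check runs from the same directory the crawl was started in (so news.json is in the current working directory):

import json
import pymongo

# MongoPipeline names the collection after the item class, so EpubItem
# records end up in the 'EpubItem' collection.
client = pymongo.MongoClient('localhost')
db = client['epubdownload']
for doc in db['EpubItem'].find().limit(5):
    print(doc['name'], doc['down_url'])

# JsonPipeline appends one JSON object per line to news.json.
with open('news.json', encoding='utf-8') as f:
    for line in f:
        print(json.loads(line)['name'])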
# epub/settings.py
BOT_NAME = 'epub'

SPIDER_MODULES = ['epub.spiders']
NEWSPIDER_MODULE = 'epub.spiders'

# Crawl responsibly by identifying yourself (and your website) on the User-Agent
#USER_AGENT = 'epub (+'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Enable the pipelines defined in epub/pipelines.py (the original mapping was
# lost from the post; the priorities below are illustrative).
ITEM_PIPELINES = {
    'epub.pipelines.MongoPipeline': 300,
    'epub.pipelines.JsonPipeline': 400,
}

MONGO_URI = 'localhost'
MONGO_DB = 'epubdownload'
# epub/items.py
import scrapy


class EpubItem(scrapy.Item):
    # Define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    down_url = scrapy.Field()
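With the four files above in place, the crawl is normally started with "scrapy crawl epubdownload" from the project root; a small helper script using Scrapy's cmdline module does the same thing and is handy for debugging:

# run.py - put next to scrapy.cfg and start with: python run.py
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'epubdownload'])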