# -*- coding: utf-8 -*-
# epub/spiders/epubdownload.py
import scrapy
from epub.items import EpubItem


class EpubdownloadSpider(scrapy.Spider):
    name = 'epubdownload'
    # allowed_domains = ['']
    # The empty string below is a placeholder: the site's base URL was
    # stripped from the source post and has to be filled in by hand.
    start_urls = ['' + str(i) + '.html/' for i in range(1, 51)]

    def parse(self, response):
        # Collect the links to the per-book epub download pages.
        href = response.xpath(
            '//a[contains(@href,"/d") and contains(@href,"epub_down")]/@href'
        ).extract()
        for i in range(len(href)):
            url = '' + href[i]  # '' is again the missing base URL
            yield scrapy.Request(url=url, callback=self.newparse)

    def newparse(self, response):
        item = EpubItem()
        # The actual download link on the detail page.
        link = response.xpath(
            '//a[contains(@href,"down?id=") and contains(@href,"p=6")]/@href'
        ).extract()
        url = '' + link[0]
        # Drop whatever follows the last '=' in the link but keep the '='.
        parts = url.split('=')
        newurl = '='.join(parts[:len(parts) - 1]) + '='
        nam = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
        # Cut the last five characters (a fixed suffix) off the book title.
        item['name'] = nam[:len(nam) - 5]
        item['down_url'] = newurl
        yield item
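The string handling in newparse only drops whatever follows the last '=' in the download link while keeping the '=' itself. A minimal sketch with a made-up URL, since the real site address is not preserved in the post:

# Hypothetical download link; the real host is not shown in the original post.
link = 'http://example.com/down?id=123&p=6'

parts = link.split('=')                # ['http://example.com/down?id', '123&p', '6']
new_url = '='.join(parts[:-1]) + '='   # drop the trailing value, keep the '='
print(new_url)                         # http://example.com/down?id=123&p=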
# -*- coding: utf-8 -*-
# epub/pipelines.py
#
# Define your item pipelines here.
# Don't forget to add your pipeline to the ITEM_PIPELINES setting.
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
import os
import json


class EpubPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings from settings.py.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Use the item class name as the collection name.
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()


class JsonPipeline(object):
    def process_item(self, item, spider):
        base_dir = os.getcwd()
        filename = base_dir + '/news.json'
        # Open the JSON file and append the item with json.dumps.
        # The parameter ensure_ascii=False is required, otherwise non-ASCII
        # data is written as escaped sequences such as "\xe15".
        with open(filename, 'a') as f:
            line = json.dumps(dict(item), ensure_ascii=False) + '\n'
            f.write(line)
        return item
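A quick way to check what the two pipelines actually stored, assuming the MongoDB values from the settings below and that the check runs from the same directory the crawl was started in (so news.json is in the current working directory):

import json
import pymongo

# MongoPipeline names the collection after the item class, so EpubItem
# records end up in the 'EpubItem' collection.
client = pymongo.MongoClient('localhost')
db = client['epubdownload']
for doc in db['EpubItem'].find().limit(5):
    print(doc['name'], doc['down_url'])

# JsonPipeline appends one JSON object per line to news.json.
with open('news.json', encoding='utf-8') as f:
    for line in f:
        print(json.loads(line)['name'])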
# epub/settings.py
BOT_NAME = 'epub'

SPIDER_MODULES = ['epub.spiders']
NEWSPIDER_MODULE = 'epub.spiders'

# Crawl responsibly by identifying yourself (and your website) on the User-Agent
#USER_AGENT = 'epub (+'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Enable the pipelines defined in epub/pipelines.py (the original mapping was
# lost from the post; the priorities below are illustrative).
ITEM_PIPELINES = {
    'epub.pipelines.MongoPipeline': 300,
    'epub.pipelines.JsonPipeline': 400,
}

MONGO_URI = 'localhost'
MONGO_DB = 'epubdownload'
# epub/items.py
import scrapy


class EpubItem(scrapy.Item):
    # Define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    down_url = scrapy.Field()
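With the four files above in place, the crawl is normally started with "scrapy crawl epubdownload" from the project root; a small helper script using Scrapy's cmdline module does the same thing and is handy for debugging:

# run.py - put next to scrapy.cfg and start with: python run.py
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'epubdownload'])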