Crawling 1,000 epub-format fantasy novels with the Scrapy framework


The spider walks 50 listing pages, follows each book's epub download page, and extracts the title and download URL. The site's domain was stripped from the original post, so the URL prefixes below are empty strings and are left as-is.

# -*- coding: utf-8 -*-
import scrapy

from epub.items import EpubItem


class EpubDownloadSpider(scrapy.Spider):
    name = 'epubdownload'
    # allowed_domains = ['']
    # Listing pages 1..50; the URL prefix was elided in the original post.
    start_urls = ['' + str(i) + '.html/' for i in range(1, 51)]

    def parse(self, response):
        # Links to each book's epub download page.
        href = response.xpath(
            '//a[contains(@href,"/d") and contains(@href,"epub_down")]/@href'
        ).extract()
        for i in range(len(href)):
            url = '' + href[i]  # domain prefix elided in the original
            yield scrapy.Request(url=url, callback=self.newparse)

    def newparse(self, response):
        item = EpubItem()
        # The actual download link carries "down?id=" and "p=6" in its query.
        link = response.xpath(
            '//a[contains(@href,"down?id=") and contains(@href,"p=6")]/@href'
        ).extract()
        url = '' + link[0]  # domain prefix elided in the original
        # Rebuild the URL up to and including its last '='.
        x = url.split('=')
        parts = x[0:len(x) - 1]  # renamed from `list`, which shadowed the builtin
        newurl = '='.join(parts) + '='
        nam = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
        item['name'] = nam[:len(nam) - 5]  # trim the 5-character suffix the site appends to titles
        item['down_url'] = newurl
        yield item
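The spider only yields each book's name and download URL; fetching the 1,000 epub files themselves is a separate step the original post does not show. A minimal sketch, assuming the items were exported to news.json by the JSON pipeline below and that down_url answers a plain GET (both assumptions):

import json
import requests  # assumed available; not part of the original project

# Read the JSON-lines file written by JsonPipeline and download each epub.
with open('news.json', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        resp = requests.get(item['down_url'], timeout=30)
        resp.raise_for_status()
        with open(item['name'] + '.epub', 'wb') as out:
            out.write(resp.content)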

The pipelines: EpubPipeline is the default pass-through, MongoPipeline writes each item to MongoDB, and JsonPipeline appends items to a JSON-lines file.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See:

import pymongo
import os
import json


class EpubPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection parameters from settings.py.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Use the item class name ("EpubItem") as the collection name.
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))  # insert() in the original is deprecated
        return item

    def close_spider(self, spider):
        self.client.close()


class JsonPipeline(object):
    def process_item(self, item, spider):
        base_dir = os.getcwd()
        filename = base_dir + '/news.json'
        # Append each item serialized with json.dumps. The ensure_ascii=False
        # argument is required; without it non-ASCII text is stored as escape
        # sequences such as "\xe15".
        with open(filename, 'a', encoding='utf-8') as f:
            line = json.dumps(dict(item), ensure_ascii=False) + '\n'
            f.write(line)
        return item
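To check what MongoPipeline stored, the collection (named after the item class, per process_item above) can be queried directly. A minimal sketch, assuming the MONGO_URI and MONGO_DB values from settings.py below:

import pymongo

# Connect with the same settings the pipeline uses.
client = pymongo.MongoClient('localhost')
db = client['epubdownload']

# Documents land in a collection named after the item class, "EpubItem".
print(db['EpubItem'].count_documents({}))
for doc in db['EpubItem'].find().limit(3):
    print(doc['name'], doc['down_url'])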

settings.py registers the pipelines and the MongoDB connection parameters. Scrapy setting names are upper-case; the original transcription lowercased them.

BOT_NAME = 'epub'

SPIDER_MODULES = ['epub.spiders']
NEWSPIDER_MODULE = 'epub.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'epub (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# The pipeline mapping was lost in transcription; presumably it enabled the
# pipelines defined above, e.g.:
ITEM_PIPELINES = {
    'epub.pipelines.MongoPipeline': 300,
    'epub.pipelines.JsonPipeline': 400,
}

MONGO_URI = 'localhost'
MONGO_DB = 'epubdownload'
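With the settings in place, the crawl is launched from the project directory with `scrapy crawl epubdownload`. It can also be started programmatically via Scrapy's standard CrawlerProcess API; a minimal sketch:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Run the spider with the project's settings (equivalent to
# `scrapy crawl epubdownload` from the project directory).
process = CrawlerProcess(get_project_settings())
process.crawl('epubdownload')
process.start()  # blocks until the crawl finishes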

items.py declares the two fields the spider fills in:

import scrapy


class EpubItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    down_url = scrapy.Field()
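Items behave like dicts, which is what lets both pipelines call dict(item). A quick illustration with made-up values:

from epub.items import EpubItem

item = EpubItem()
item['name'] = '某玄幻小說'                         # hypothetical title
item['down_url'] = 'https://example.com/down?id='  # hypothetical URL
print(dict(item))  # what MongoPipeline inserts and JsonPipeline dumps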

