1. Preparation
Install the requests and pyquery libraries.
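For example, with pip:

    pip install requests pyquery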
2. Usage
The script defines a SearchBook class. Initialize it with the URL of the first chapter and the book name, then call the all_content method (a usage sketch follows the class listing below).
# -*- coding: utf8 -*-
import re

import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq


class SearchBook:
    def __init__(self, url, bookname):
        self.url = url
        self.bookname = bookname
        # Rebuild "scheme://host" from the chapter URL, e.g.
        # "https://www.example.com" from "https://www.example.com/0_1/1.html"
        self.baseurl = url.split('/')[0] + '//' + url.split('/')[2]
        # The original headers dict was lost in extraction; a plain
        # User-Agent header is assumed here to identify the client
        self._headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        }

    def __writeintxt__(self, info):
        """Append a piece of text to <bookname>.txt (small write wrapper)."""
        newname = self.bookname + '.txt'
        with open(newname, 'a+', encoding='utf-8') as file:
            file.write('\n'.join([info]))
            file.write('\n' + '=' * 20 + '\n')

    def __getallchapter__(self, url):
        """Fetch one chapter, save it, then recurse into the next one."""
        try:
            one_chapter = requests.get(url, headers=self._headers)
            # Re-declare the encoding before reading .text, otherwise
            # some pages come back garbled
            one_chapter.encoding = self.codemode(one_chapter)
            if one_chapter.status_code == 200:
                content = one_chapter.text
                doc = pq(content)
                title = doc('.bookname h1').text()
                self.__writeintxt__(title)
                maincontent = doc('#content').text()
                self.__writeintxt__(maincontent)
                # Collect the navigation links at the bottom of the page
                next_url = doc('.bottem1 a').items()
                all_url = []
                for a in next_url:
                    if a.text() in ["上一章", "章節目錄", "下一章", "章節列表"]:
                        all_url.append(a.attr.href)
                # Work out the next chapter's URL
                if len(all_url) == 3:
                    if all_url[-1]:
                        if all_url[-1][:1:] == '/':
                            # Absolute path: prepend the site root
                            next_url = self.baseurl + all_url[-1]
                            print(title, '\n', next_url)
                        else:
                            # Relative path: prepend the chapter-list path too
                            next_url = self.baseurl + all_url[-2] + all_url[-1]
                            print(title, '\n', next_url)
                        self.__getallchapter__(next_url)
                    else:
                        pass
                else:
                    print('end')
        except RequestException:
            print('error')

    def getcharset(self, content):
        """Detect the charset from the raw HTML.

        Kept here so the method can be overridden later; this example
        does not use it."""
        # The original pattern was lost in extraction; a regex that pulls
        # the charset out of a <meta> tag is assumed here
        charset = re.compile(r'charset=["\']?([\w-]+)', flags=re.I)
        print(charset.findall(content))
        return charset.findall(content)

    def all_content(self):
        """Entry point: fetch every chapter starting from self.url."""
        self.__getallchapter__(self.url)

    def getcontent(self, info):
        """Reserved to keep __getallchapter__ from getting too long."""
        pass

    def codemode(self, getrequest):
        """Redefine the response encoding to avoid mojibake."""
        if getrequest.encoding in ['windows-1252', 'iso-8859-1']:
            return 'gbk'
        else:
            # The original else branch was lost in extraction; falling back
            # to the encoding requests already detected is assumed here
            return getrequest.encoding
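Putting it together, a minimal usage sketch (the chapter URL below is a made-up placeholder; substitute the first-chapter URL of a real book on the target site):

    # Hypothetical first-chapter URL and book name
    book = SearchBook('https://www.example.com/0_1/123.html', 'mybook')
    book.all_content()   # chapters are appended to mybook.txt

Note that because __getallchapter__ calls itself once per chapter, a book longer than Python's default recursion limit (roughly 1000 frames) will raise RecursionError; rewriting the recursion as a while loop is the usual fix.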