Continuing to study web scraping, I found a novel-scraping script for the 《筆趣看》 site. The code badly needed maintenance and fixes. Frequent scraping still triggers 503 errors, which I have yet to learn how to resolve properly.
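One common mitigation for those 503s is to space out the requests and retry with exponential backoff. A minimal sketch under that assumption (the helper `fetch_with_retry` is hypothetical, not part of the original script):

import time
from urllib import request
from urllib.error import HTTPError

def fetch_with_retry(url, headers, retries=3, delay=1.0):
    """Fetch url, retrying on HTTP 503 with exponential backoff."""
    for attempt in range(retries):
        try:
            req = request.Request(url=url, headers=headers)
            return request.urlopen(req).read()
        except HTTPError as e:
            if e.code == 503 and attempt < retries - 1:
                time.sleep(delay * (2 ** attempt))  # wait 1s, 2s, 4s, ...
            else:
                raise

Even simpler, sleeping a second between chapter downloads in the main loop often avoids the 503s entirely; note that the script below imports `time` but never uses it.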
from urllib import request
from bs4 import BeautifulSoup
import collections
import re
import os
import time    # imported in the original but unused; see the sleep suggestion above
import sys
import types   # imported in the original but unused

"""
Class: downloads a novel from the 《筆趣看》 site.
Parameters:
    target - URL of the novel's table-of-contents page on 《筆趣看》 (string)
Returns:
    None
Modify:
    2017-05-06
"""
class download(object):

    def __init__(self, target):
        self.__target_url = target
        # The request headers were elided in the original post; a minimal
        # browser-like User-Agent is assumed here.
        self.__head = {'User-Agent': 'Mozilla/5.0'}
""" parameters:
無 returns:
novel_name + '.txt' - 儲存的**名(string)
numbers - 章節數(int)
modify:
2017-05-06
"""defget_download_url
(self)
: charter = re.
compile
(u'[第弟](.+)章'
, re.ignorecase)
#url請求
target_req = request.request(url = self.__target_url, headers = self.__head)
#回應 target_response = request.urlopen(target_req)
#html文件
target_html = target_response.read(
).decode(
'gbk'
,'ignore'
)#bs4,目標文件,解析器。
listmain_soup = beautifulsoup(target_html,
'lxml'
)#找到listmian
chapters = listmain_soup.find_all(
'div'
,class_ =
'listmain'
)#在建立乙個bs4
download_soup = beautifulsoup(
str(chapters)
,'lxml'
)# ['《一念永恆', '最新章節列表'] 如分解出來如下。
# print(str(download_soup.dl.dt).split("》")[0][5:],str(download_soup.dl.dt).split("》"))
novel_name =
str(download_soup.dl.dt)
.split(
"》")[0
][5:
] flag_name =
"《"+ novel_name +
"》"+
"正文卷"
download_dict = collections.ordereddict(
) begin_flag =
false
numbers =
0for child in download_soup.dl.children:
if child !=
'\n'
:if child.string == u"%s"
% flag_name:
begin_flag =
true
if begin_flag ==
true
and child.a !=
none
: download_url =
""+ child.a.get(
'href'
) download_name = child.string
download_dict[download_name]
= download_url
numbers +=
1return novel_name +
'.txt'
, numbers, download_dict
"""函式說明:爬取文章內容
parameters:
returns:
soup_text - 章節內容(string)
modify:
2017-05-06
"""def**********
(self, url)
: download_req = request.request(url = url, headers = self.__head)
download_response = request.urlopen(download_req)
(download_response)
download_html = download_response.read(
).decode(
'gbk'
,'ignore'
) soup_texts = beautifulsoup(download_html,
'lxml'
) texts = soup_texts.find_all(id=
'content'
, class_ =
'showtxt'
) soup_text = beautifulsoup(
str(texts)
,'lxml'
).div.text.replace(
'\xa0',''
)return soup_text
"""函式說明:將爬取的文章內容寫入檔案
parameters:
name - 章節名稱(string)
path - 當前路徑下,**儲存名稱(string)
text - 章節內容(string)
returns:
無 modify:
2017-05-06
"""defwriter
(self, name, path, text)
: write_flag =
true
with
open
(path,
'a', encoding=
'utf-8'
)as f:
f.write(name +
'\n\n'
)for each in text:
if each ==
'h':
write_flag =
false
if write_flag ==
true
and each !=
' ':
f.write(each)
if write_flag ==
true
and each ==
'\r'
: f.write(
'\n'
)
f.write(
'\n\n'
if __name__ == "__main__":
    # Novel table-of-contents URL, read from stdin
    target_url = str(input())
    d = download(target=target_url)
    name, numbers, url_dict = d.get_download_url()
    # Start fresh if a file for this novel already exists
    if name in os.listdir():
        os.remove(name)
    index = 1
    # The status strings were censored in the original post; reconstructed here.
    print('Downloading 《%s》 ...' % name[:-4])    # name[:-4] strips '.txt'
    for key, value in url_dict.items():
        d.writer(key, name, d.Downloader(value))
        sys.stdout.write('  progress: %.1f%%' % (index / numbers * 100) + '\r')
        sys.stdout.flush()
        index += 1
    print('《%s》 download finished.' % name[:-4])
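To make the table-of-contents parsing concrete, here is a self-contained sketch against a hypothetical HTML fragment in the layout the script expects (the titles and hrefs are made up). It shows why str(download_soup.dl.dt).split("》")[0][5:] yields the bare novel name, and how the 正文卷 marker separates "latest chapter" previews from the main text:

from bs4 import BeautifulSoup, Tag

# Hypothetical fragment mimicking the 《筆趣看》 chapter list the script assumes
html = '''
<div class="listmain">
  <dl>
    <dt>《一念永恆》最新章節列表</dt>
    <dd><a href="/1_1094/10.html">第十章 某最新章</a></dd>
    <dt>《一念永恆》正文卷</dt>
    <dd><a href="/1_1094/1.html">第一章 某起始章</a></dd>
  </dl>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
dt = str(soup.dl.dt)           # '<dt>《一念永恆》最新章節列表</dt>'
name = dt.split("》")[0][5:]   # [5:] drops '<dt>《' (five characters)
print(name)                    # 一念永恆

# Same begin_flag pattern as get_download_url: skip the preview block,
# collect chapter name and href only after the 正文卷 marker.
begin = False
for child in soup.dl.children:
    if not isinstance(child, Tag):
        continue               # skip whitespace between tags
    if child.string == "《%s》正文卷" % name:
        begin = True
    if begin and child.a is not None:
        print(child.string, child.a.get('href'))

Binding the parse this tightly to the listmain/dl markup is what makes the scraper brittle: any site redesign (or a different mirror of the site) breaks novel_name extraction and the chapter walk before anything else.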