1. Proxy file format: one whitespace-separated host, port, and scheme per line in ip.txt (a sample follows below; the proxy source address given in the original was cut off).
2. Free proxies are unreliable, so a decorator is used to retry failed requests and switch to a different proxy at the same time.
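The original post does not show the ip.txt layout, but the code below reads each pool entry as field [0] host, [1] port, [2] scheme, so a plausible file (hypothetical addresses) would be:

10.0.0.1 8080 http
10.0.0.2 1080 https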
# coding: utf-8
# Biquge (筆趣閣) single-novel scraper.
# Replace the first-chapter URL and the total chapter count below.
# ip.txt holds the proxy pool.
import urllib2
from bs4 import BeautifulSoup
import sys
import traceback
import random

reload(sys)
sys.setdefaultencoding('utf-8')

f = open("out.txt", "a+")
# The headers dict was stripped from the original; a generic User-Agent is assumed.
headers = {'User-Agent': 'Mozilla/5.0'}
# First-chapter URL (the site's domain was already missing from the original).
url = "/17_17281/7647045.html"
# Total number of chapters.
page = 1301
nexthref = url
# Proxy pool, filled from ip.txt by ipPool() below.
ippool = []
def ipPool():
    # Load the proxy pool from ip.txt. The original function name was
    # lowercased into the same spelling as the list above, so the camel-cased
    # name ipPool is assumed here to avoid the clash.
    reader = open('ip.txt')
    line = reader.readline()
    while line:
        if line.strip() != '':
            # The append statement was lost in the original; whitespace-separated
            # host/port/scheme fields are assumed from how getContent indexes
            # each entry.
            ippool.append(line.strip().split())
        line = reader.readline()
    reader.close()

retries = 0  # unused in the surviving code, kept from the original
# Retry counter shared with the decorator; the dict body was stripped.
count = {'num': 0}

def conn_try_again(function):
    # Decorator: retry the wrapped call up to 5 times, then re-raise.
    def wrapped(*args, **kwargs):
        try:
            return function(*args, **kwargs)
        except Exception, err:
            print("--retrying, attempt %s (of 5)--" % (count['num'] + 1))
            if count['num'] < 5:
                count['num'] += 1
                # Re-invoke so the next attempt draws a fresh random proxy;
                # this recursive call was missing from the garbled original.
                return wrapped(*args, **kwargs)
            else:
                raise Exception(err)
    return wrapped
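# Usage sketch for the decorator (hypothetical helper, not part of the
# scraper): any function that may raise can be wrapped the same way.
#
#   @conn_try_again
#   def fetch():
#       return urllib2.urlopen("http://example.com", timeout=3).read()
#
# The first five failures print a retry notice and try again; the sixth
# re-raises the underlying error.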
bsObj = None

@conn_try_again
def getContent(url):
    global nexthref, page, bsObj
    # Proxy switch: True routes traffic through the pool, False goes direct.
    proxySwitch = True
    try:
        poolLen = len(ippool)
        if poolLen > 0:
            i = random.randint(0, poolLen - 1)
            print(ippool[i])
            proxy_host = ippool[i][2] + "://" + ippool[i][0] + ":" + ippool[i][1]
            # The dict literal was stripped; {scheme: "scheme://host:port"}
            # is the mapping urllib2.ProxyHandler expects.
            proxy_temp = {ippool[i][2]: proxy_host}
            proxy_support = urllib2.ProxyHandler(proxy_temp)
        else:
            print('--proxy pool is empty, falling back to the local address--')
            proxy_support = urllib2.ProxyHandler({})
        nullproxy_handler = urllib2.ProxyHandler({})
        if proxySwitch:
            opener = urllib2.build_opener(proxy_support)
        else:
            opener = urllib2.build_opener(nullproxy_handler)
        urllib2.install_opener(opener)
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req, timeout=3)
        # print(response.read())
        bsObj = BeautifulSoup(response, 'lxml')
    except Exception, err:
        raise Exception(err)
    contentDiv = bsObj.find('div', id='content')
    content = contentDiv.get_text()
    # Both attrs dicts were stripped; the class names below are assumptions
    # based on typical Biquge chapter-page markup.
    preAndNextBar = bsObj.find('div', attrs={'class': 'bottem1'})
    title = bsObj.find('div', attrs={'class': 'bookname'}).h1.get_text()
    if "下一章" in preAndNextBar.get_text():  # "下一章" = "next chapter"
        next = None
        aList = preAndNextBar.findAll('a')
        for a in aList:
            if "下一章" in a.get_text():
                next = a
        if next is None:
            print("next-chapter link is empty")
            return True
        # The site's base URL was already missing from the original source.
        nexthref = "" + next.get('href')
        print(title)
        # print(content)
        print(nexthref)
        f.write("#####" + '\n')
        f.write(title + '\n')
        f.write(content + '\n')
        count['num'] = 0
    else:
        return True
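# For reference, the selectors above assume a chapter page shaped roughly
# like this (a simplified sketch of typical Biquge markup, not taken from
# the original post):
#
#   <div class="bookname"><h1>Chapter title</h1></div>
#   <div id="content">Chapter text ...</div>
#   <div class="bottem1">
#     <a href="...">上一章</a> <a href="...">下一章</a>
#   </div>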
def main():
    ipPool()
    global page
    try:
        for num in range(1, page):
            if getContent(nexthref):
                break
        print("--- end ---")
    except Exception:
        traceback.print_exc()
    finally:
        f.close()

main()
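Since note 2 above is about free proxies dying without warning, it can help to prune ip.txt before a long crawl. Below is a minimal sketch in the same Python 2 / urllib2 style; the file name and host/port/scheme field order match the assumptions made earlier, and the test URL is a placeholder.

import urllib2

def check_proxy(host, port, scheme, test_url="http://example.com", timeout=5):
    # True if the proxy answers within the timeout.
    handler = urllib2.ProxyHandler({scheme: scheme + "://" + host + ":" + port})
    opener = urllib2.build_opener(handler)
    try:
        opener.open(test_url, timeout=timeout)
        return True
    except Exception:
        return False

# Rewrite ip.txt keeping only the proxies that still respond.
live = []
for fields in (l.strip().split() for l in open('ip.txt') if l.strip()):
    if check_proxy(fields[0], fields[1], fields[2]):
        live.append(fields)
out = open('ip.txt', 'w')
for host, port, scheme in live:
    out.write(host + ' ' + port + ' ' + scheme + '\n')
out.close()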