# 筆趣閣 (Biquge novel-site scraper)
# 1.模擬搜尋
# 2.圖書查詢-章節
# 3.獲取章節-內容
# 4.本地儲存:txt、mysql、
def searchbook():
    """Prompt for a book title, search the Biquge site, and cache result URLs.

    Parses the search-result page and stores name -> detail-page URL in the
    module-level ``book_dict`` for later use by ``getbookchapter``.
    Recursively retries on a non-200 response.
    """
    bookname = input("請輸入圖書的名稱: ")
    # The site expects GBK-encoded query bytes; leaving it as str would
    # garble CJK characters in the URL.
    bookname = bookname.encode("gbk")
    # NOTE(review): the params dict literal was lost when this source was
    # scraped ("params = ,"); {"searchkey": ...} is the common Biquge query
    # key — confirm against the original project.
    resp = requests.get(url=url_one, params={"searchkey": bookname},
                        headers=head, timeout=10)
    if resp.status_code == 200:
        # Response body is GBK; override requests' guessed encoding.
        resp.encoding = "gbk"
        print(resp.text)
        # Parse with the stdlib html.parser backend.
        soup = BeautifulSoup(resp.text, "html.parser")
        # Attribute-style access returns the FIRST matching tag in the page.
        title = soup.title
        print(title)
        img = soup.img
        print(img)
        a = soup.a
        print(a)
        # .string extracts a tag's text content.
        print(title.string, img, a.string)
        # attrs is the tag's attribute dict; .get avoids a KeyError.
        print(img.attrs.get("src"))
        # NOTE(review): the attrs filter dict was lost in the scraped source
        # ("attrs="); restore the original class filter, e.g.
        # {"class": "bookbox"}. An empty dict applies no filtering.
        div_list = soup.find_all(name="div", attrs={})
        for div in div_list:
            bookname = div.h4.a.string
            bookurl = div.h4.a.attrs.get("href")
            bookauthor = div.small.string
            bookdir = div.p.string
            # Skip result rows where any field is missing.
            if (bookname is not None and bookurl is not None
                    and bookauthor is not None and bookdir is not None):
                # str.replace returns a NEW string; the original discarded
                # the result, so the cleanup was a no-op — assign it back.
                bookname = bookname.replace(" ", "")
                bookurl = bookurl.replace(" ", "")
                bookauthor = bookauthor.replace(" ", "")
                bookdir = bookdir.replace(" ", "")
                print(bookname + "\n", bookurl + "\n",
                      bookauthor + "\n", bookdir + "\n")
                # Cache name -> detail-page URL for getbookchapter().
                book_dict[bookname] = bookurl
    else:
        print("錯誤!重新開始")
        searchbook()
def getbookchapter():
    """Prompt for a previously-found book name and walk its chapter list.

    Looks the name up in the module-level ``book_dict`` (filled by
    ``searchbook``), fetches the book's detail page, and hands every chapter
    link to ``getbookchaptercontent``. Recursively retries on an unknown name.
    """
    bookname = input("請輸入已找到的圖書的名稱: ")
    # Membership test on the dict itself iterates keys; no .keys() needed.
    if bookname in book_dict:
        resp = requests.get(url=book_dict[bookname], headers=head, timeout=time)
        if resp.status_code == 200:
            # Response body is GBK; override requests' guessed encoding.
            resp.encoding = "gbk"
            soup = BeautifulSoup(resp.text, "html.parser")
            title = soup.title.string  # already a plain str
            # Original printed title.string here, which raises
            # AttributeError because str has no .string attribute.
            print(title)
            # NOTE(review): the attrs filter dict was lost in the scraped
            # source ("attrs="); restore the original filter if any.
            dd_list = soup.find_all(name="dd", attrs={})
            for dd in dd_list:
                try:
                    chapter = dd.a.attrs.get("title")
                    chapterurl = dd.a.attrs.get("href")
                    print(chapter, chapterurl)
                    bookurl = book_dict[bookname]
                    getbookchaptercontent(chapter, chapterurl, bookurl, bookname)
                except Exception:
                    # Best-effort: skip malformed <dd> entries (e.g. no <a>).
                    continue
    else:
        print("錯誤!重新開始")
        getbookchapter()
def getbookchaptercontent(chapter, chapterurl, bookurl, bookname):
    """Fetch one chapter page and print its cleaned body text.

    NOTE(review): the tail of this function (the text cleanup and the
    txt/mysql storage mentioned in the file header) was truncated in the
    scraped source; the cleanup below is a conservative reconstruction —
    confirm against the original project.
    """
    # TODO confirm: chapterurl may be site-relative and need joining with
    # bookurl before the request (the original comment hints at this).
    resp = requests.get(url=chapterurl)
    # Original had a 'ststus_code' typo, which would raise AttributeError.
    if resp.status_code == 200:
        # Response body is GBK; override requests' guessed encoding.
        resp.encoding = "gbk"
        soup4 = BeautifulSoup(resp.text, "html.parser")
        # NOTE(review): the attrs filter dict was lost in the scraped source
        # ("attrs="); restore the original filter, e.g. {"id": "content"}.
        # find() returns a single tag object (or None), not a list.
        div = soup4.find(name="div", attrs={})
        if div is not None:
            text = div.text
            # Guard against an empty chapter body.
            if text is not None and text != "":
                # Collapse the page's non-breaking-space padding and trim.
                text = text.replace("\xa0", "").strip()
                print(chapter + "\n" + text)
爬蟲筆趣閣例子
from lxml import etree; from selenium.webdriver.support.wait import WebDriverWait; from selenium.webdriver.support import expected_conditions as EC; from...
c 筆趣閣小說爬蟲
流年似水,回想上一次博文發表,好像已經是一年多以前,差點就忘了自己是個文件攻城獅的本質,罪過啊。最近在研究爬蟲,python用的不太習慣,還是回歸老本行c 比較好一點,個人又比較喜歡看 所以就選取筆大大做個白老鼠 默哀 寫個爬蟲玩完,迷茫啊。這個專案有幾個比較重要的點 一 正規表示式,參考 二 抓取...
初級爬蟲爬取筆趣閣小說
import requests from pyquery import pyquery as pq def get content a response requests.get a response.encoding gbk doc pq response.text text doc conten...