import requests
import os
from bs4 import BeautifulSoup
# The site's base URL and the request headers were elided in the original
# post; fill them in before running.
shici_url = ''  # base URL of the target site (elided in the original)
url = shici_url + '/book/'  # the book index page
headers = {}  # the original headers dict was elided; a User-Agent is typically needed

# Request the book index page
response = requests.get(url=url, headers=headers)
page_text = response.text
soup = BeautifulSoup(page_text, 'lxml')

# Collect the <a> tags (book name plus link) for every book on the index page
a_list = soup.select('.bookmark-list>ul>li>h2>a')

# Fetch a chapter page and return its text content
def get_book_detail(page_url):
    book_detail_content = requests.get(url=page_url, headers=headers).text
    soup = BeautifulSoup(book_detail_content, 'lxml')
    book_content = soup.select('.chapter_content>p')
    # Some chapter pages have no <p> tags, so fall back to the container itself
    if not book_content:
        book_content = soup.select('.chapter_content')
    content = ''
    for book_c in book_content:
        content = content + book_c.text
    # Return the chapter's full text
    return content
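
# Usage note (my own illustration, not in the original): with shici_url and
# headers filled in, a single chapter can be fetched on its own, e.g.:
#   text = get_book_detail(shici_url + '/book/<book-id>/<chapter>.html')
#   print(text[:100])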

# Walk a book's chapter list ("目錄") and write each chapter to the open file
def get_book_list(book_url, f):
    book_list_content = requests.get(url=book_url, headers=headers).text
    soup = BeautifulSoup(book_list_content, 'lxml')
    book_mulu = soup.select('.book-mulu>ul>li>a')
    for book in book_mulu:
        page_title = book.text
        print(page_title)  # progress message; the original text after the title was elided
        page_url = shici_url + book['href']
        # Fetch the chapter's detail page
        content = get_book_detail(page_url)
        f.write(page_title + "\n\n" + content + "\n\n\n")
        print(page_title)  # completion message; the original text was elided
    f.close()

# Create the output directory if it does not exist yet
file_path = './史書/'  # "history books"
if not os.path.exists(file_path):
    os.mkdir(file_path)

n = 0
for a in a_list:
    n = n + 1
    # Book title
    book_name = a.text
    print("%s" % book_name)  # the original format string was elided
    # Create a .txt file named after the current book
    file_name = file_path + str(n) + '.' + book_name + '.txt'
    f = open(file_name, 'a+', encoding='utf-8')
    # Book URL
    book_url = shici_url + a['href']
    # Follow the URL into the book's chapter list page
    get_book_list(book_url, f)
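
The loop above issues one request per chapter with no pause, which real sites often throttle or block. A minimal sketch of a politer alternative follows; the fetch helper, its retry count, timeout, and delay are my own assumptions, not part of the original post:

import time  # requests is already imported at the top of the script

def fetch(url, headers, retries=3, delay=1.0):
    """Hypothetical helper: GET a page with a timeout, retries, and a polite pause."""
    for attempt in range(retries):
        try:
            response = requests.get(url=url, headers=headers, timeout=10)
            response.raise_for_status()  # turn HTTP 4xx/5xx into exceptions
            time.sleep(delay)            # pause between successful requests
            return response.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last retry
            time.sleep(delay * (attempt + 1))  # back off a little more each retry

Each bare requests.get(...).text call in get_book_detail and get_book_list could then be replaced by fetch(page_url, headers).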