python爬蟲學習01 電子書爬取

import requests  # HTTP client used to download the page

# Step 1: fetch the raw HTML of a page and print it.

if __name__ == '__main__':  # script entry point
    # URL of the page to scrape. It is blank in this listing and must be
    # filled in before the script is run.
    target = ''
    req = requests.get(url=target)  # issue the GET request
    req.encoding = 'utf-8'  # decode the response body as UTF-8
    print(req.text)  # print the decoded HTML

import requests  # HTTP client used to download the page
from bs4 import BeautifulSoup  # HTML parser (the class name is case-sensitive)

# Step 2: parse the downloaded page with BeautifulSoup and print the
# element that holds the e-book text.

if __name__ == '__main__':  # script entry point
    # URL of the page to scrape. It is blank in this listing and must be
    # filled in before the script is run.
    target = ''
    req = requests.get(url=target)  # request the page
    req.encoding = 'utf-8'  # decode the response body as UTF-8
    html = req.text  # keep the decoded HTML
    bs = BeautifulSoup(html, 'lxml')  # parse with the lxml backend
    texts = bs.find('div', id='content')  # the <div id="content"> element
    print(texts)  # print the element, markup included

import requests  # HTTP client used to download the page
from bs4 import BeautifulSoup  # HTML parser (the class name is case-sensitive)

# Step 3: extract only the text of the e-book page.
# texts.text collects all text inside the content element, strip() trims
# the surrounding whitespace, and split() cuts the text on runs of four
# \xa0 characters, because each paragraph starts with four of them.

if __name__ == '__main__':  # script entry point
    # URL of the page to scrape. It is blank in this listing and must be
    # filled in before the script is run.
    target = ''
    req = requests.get(url=target)  # request the page
    req.encoding = 'utf-8'  # decode the response body as UTF-8
    html = req.text  # keep the decoded HTML
    bs = BeautifulSoup(html, 'lxml')  # parse with the lxml backend
    texts = bs.find('div', id='content')  # the <div id="content"> element
    print(texts.text.strip().split('\xa0' * 4))  # list of paragraphs

import requests  # HTTP client used to download the page
from bs4 import BeautifulSoup  # HTML parser (the class name is case-sensitive)

# Step 4: inspect the chapter list.
# Parse the index page with BeautifulSoup and print every chapter link.

if __name__ == '__main__':  # script entry point
    # URL of the chapter index page. The listing used `target` without
    # defining it; it is declared here as a blank placeholder that must be
    # filled in before the script is run.
    target = ''
    req = requests.get(url=target)  # request the page
    req.encoding = 'utf-8'  # decode the response body as UTF-8
    html = req.text  # keep the decoded HTML
    bs = BeautifulSoup(html, 'lxml')  # parse with the lxml backend
    chapters = bs.find('div', id='list')  # the <div id="list"> element
    chapters = chapters.find_all('a')  # every <a> tag inside that element
    for chapter in chapters:
        print(chapter)  # print the chapter's <a> tag

import requests  # HTTP client used to download the page
from bs4 import BeautifulSoup  # HTML parser (the class name is case-sensitive)

# Step 5: inspect the chapter list.
# Parse the index page with BeautifulSoup, read each chapter link's href
# and print the chapter title.

if __name__ == '__main__':  # script entry point
    server = ''  # blank in this listing; not used by this step
    # URL of the chapter index page. The listing used `target` without
    # defining it; it is declared here as a blank placeholder that must be
    # filled in before the script is run.
    target = ''
    req = requests.get(url=target)  # request the page
    req.encoding = 'utf-8'  # decode the response body as UTF-8
    html = req.text  # keep the decoded HTML
    bs = BeautifulSoup(html, 'lxml')  # parse with the lxml backend
    chapters = bs.find('div', id='list')  # the <div id="list"> element
    chapters = chapters.find_all('a')  # every <a> tag inside that element
    for chapter in chapters:
        url = chapter.get('href')  # href of the chapter link (not used yet)
        print("《"+chapter.string+"》")  # print the chapter title

import requests  # HTTP client used to download the pages
from bs4 import BeautifulSoup  # HTML parser (the class name is case-sensitive)
import time
from tqdm import tqdm

# Step 6: walk the chapter list.
# Parse the index page with BeautifulSoup and build the URL of every
# chapter; get_content() extracts the text of a single chapter page.


def get_content(target):
    """Download one chapter page and return its text split into paragraphs.

    Args:
        target: URL of the chapter page.

    Returns:
        A list of strings: the text of the ``<div id="content">`` element,
        stripped and split on runs of four ``\\xa0`` characters.
    """
    req = requests.get(url=target)  # request the page
    req.encoding = 'utf-8'  # decode the response body as UTF-8
    html = req.text  # keep the decoded HTML
    bf = BeautifulSoup(html, 'lxml')  # parse with the lxml backend
    texts = bf.find('div', id='content')  # the <div id="content"> element
    content = texts.text.strip().split('\xa0' * 4)
    return content


if __name__ == '__main__':  # script entry point
    server = ''  # prefix prepended to each chapter href; blank in this listing
    book_name = '《元尊》.txt'
    # URL of the chapter index page. The listing used `target` without
    # defining it; it is declared here as a blank placeholder that must be
    # filled in before the script is run.
    target = ''
    req = requests.get(url=target)  # request the page
    req.encoding = 'utf-8'  # decode the response body as UTF-8
    html = req.text  # keep the decoded HTML
    chapter_bs = BeautifulSoup(html, 'lxml')  # parse with the lxml backend
    chapters = chapter_bs.find('div', id='list')  # the <div id="list"> element
    chapters = chapters.find_all('a')  # every <a> tag inside that element
    for chapter in tqdm(chapters):  # tqdm wraps the iteration
        chapter_name = chapter.string  # chapter title
        url = server + chapter.get('href')  # server prefix + chapter href
        # NOTE(review): the listing ends here. get_content(), book_name and
        # time are never used, so the step that downloads and saves each
        # chapter appears to be missing from this excerpt.
				linux學習電子書
Red Hat Fedora Core 6 Unleashed、Red Hat Enterprise Linux 5 Administration Unleashed、Fedora Core 5 初學者指南、O'Reilly Building Embedded Linux Systems、O'Reilly...
				verilog 學習電子書
Verilog 學習電子書 計算機人學習 Verilog 作為一個計算機專業的計算機人，一個純軟體的開發者也開始了數字邏輯之旅。Verilog 學習電子書 學習一門新的語言，不能僅僅從網上學習東西，畢竟網上的東西都是零散的，學習還是要系統一些，有時間的話多多讀一讀書，從書上的例子中學習東西，還要多...
				電子書收藏
以下是我蒐集的電子書備份。1. 網路硬體（完整版）［日］三輪賢一 著，盛榮 譯，人民郵電出版社，2015年8月第1版；2. Python學習手冊（第4版）Mark Lutz 著，李軍、劉紅偉 等譯，機械工業出版社，2011年4月第1版；3. Android軟體安全與逆向分析，豐生強 著，人民郵電出版社，201...

python爬蟲學習01 電子書爬取

linux學習電子書

verilog 學習電子書

電子書收藏

相關推薦