from bs4 import BeautifulSoup  # a very handy toolkit for extracting data from web pages
import requests                # used to send HTTP requests
import re                      # used to write regular expressions
Use requests to send a GET request and fetch the page:
def get_page(self, page_num):
    url = self.base_url + '?see_lz=%s&pn=%s' % (self.see_lz, page_num)
    try:
        page = requests.get(url).text
        return page
    except requests.RequestException as e:
        print('Failed to connect: %s' % e)
        return None
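As a standalone check of what get_page does, here is a minimal sketch of the same request; the post id in the URL is a made-up placeholder, and the params dict is just an idiomatic alternative to building the query string by hand:

resp = requests.get('http://tieba.baidu.com/p/3138733512',
                    params={'see_lz': 1, 'pn': 1})
print(resp.status_code)  # 200 on success
page = resp.text         # HTML of page 1, thread-starter posts only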
By inspecting the page structure we can locate the blocks we want, then use BeautifulSoup's find and find_all methods to match a block's tag name, class, id, and so on. See the BeautifulSoup documentation for details.
def get_info(self):
    page = self.get_page(1)
    soup = BeautifulSoup(page, 'lxml')
    # the second <span> inside li.l_reply_num holds the total page count
    self.page_num = soup.find('li', class_='l_reply_num').find_all('span')[1].string
    self.page_num = int(self.page_num)
    self.title = soup.find('h3', class_=re.compile('core_title_txt.*?')).string
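To see find/find_all outside the spider, here is a toy example on a hand-written snippet of HTML (the markup is invented, but shaped like the Tieba element above):

html = '<li class="l_reply_num"><span>12345</span><span>5</span></li>'
soup = BeautifulSoup(html, 'lxml')
# find() returns the first matching tag; find_all() returns every match as a list
spans = soup.find('li', class_='l_reply_num').find_all('span')
print(int(spans[1].string))  # -> 5, the analogue of the page count above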
Use regular expressions to locate the information, then clean up the matched data: re.compile() builds a regular expression, and re.sub() replaces whatever it matches. For more on regular expressions, see the regular expression syntax reference.
class Filter:
    # remove <img> tags
    remove_img = re.compile('<img.*?>')
    # remove hyperlinks, keeping the link text
    remove_link = re.compile('<a.*?>|</a>')
    # turn table cells <td> into \n
    replace_td = re.compile('<td>')
    # turn line breaks <br> into \n
    replace_br = re.compile('<br><br>|<br>')
    # strip any remaining tag
    remove_tag = re.compile('<.*?>')

    def filter(self, x):
        x = re.sub(self.remove_img, '', x)
        x = re.sub(self.remove_link, '', x)
        x = re.sub(self.replace_td, '\n', x)
        x = re.sub(self.replace_br, '\n', x)
        x = re.sub(self.remove_tag, '', x)
        return x.strip()
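To see the filter in action, here is a quick check on a made-up fragment of post HTML:

f = Filter()
raw = '<img src="smile.gif"><a href="#">link</a> text<br>more<td>cell'
print(f.filter(raw))
# -> 'link text\nmore\ncell': images vanish, link text survives,
#    <br>/<td> become newlines, and all other tags are stripped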
The complete code:

# coding=utf-8
from bs4 import BeautifulSoup
import requests
import re
class Filter:
    # remove <img> tags
    remove_img = re.compile('<img.*?>')
    # remove hyperlinks, keeping the link text
    remove_link = re.compile('<a.*?>|</a>')
    # turn table cells <td> into \n
    replace_td = re.compile('<td>')
    # turn line breaks <br> into \n
    replace_br = re.compile('<br><br>|<br>')
    # strip any remaining tag
    remove_tag = re.compile('<.*?>')

    def filter(self, x):
        x = re.sub(self.remove_img, '', x)
        x = re.sub(self.remove_link, '', x)
        x = re.sub(self.replace_td, '\n', x)
        x = re.sub(self.replace_br, '\n', x)
        x = re.sub(self.remove_tag, '', x)
        return x.strip()
class Spider:
    def __init__(self, base_url, see_lz, txt_path):
        self.base_url = base_url
        self.see_lz = see_lz
        self.page_num = 0
        self.title = ''
        self.m_filter = Filter()
        self.txt_path = txt_path
        self.floor = 0

    def get_page(self, page_num):
        url = self.base_url + '?see_lz=%s&pn=%s' % (self.see_lz, page_num)
        try:
            page = requests.get(url).text
            return page
        except requests.RequestException as e:
            print('Failed to connect: %s' % e)
            return None
    def get_info(self):
        page = self.get_page(1)
        soup = BeautifulSoup(page, 'lxml')
        self.page_num = soup.find('li', class_='l_reply_num').find_all('span')[1].string
        self.page_num = int(self.page_num)
        self.title = soup.find('h3', class_=re.compile('core_title_txt.*?')).string
    def get_content(self):
        self.get_info()
        print('Start writing the file (%s pages in total):' % self.page_num)
        file = open(self.txt_path, 'w', encoding='utf-8')
        file.write('----------------<< %s >>-------------\n\n\n' % self.title)
        for i in range(self.page_num):
            print('Writing page %s...' % (i + 1))
            page = self.get_page(i + 1)
            soup = BeautifulSoup(page, 'lxml')
            contents = soup.find('div', class_='p_postlist').contents
            for content in contents:
                # skip bare strings between tags; only Tag objects support find() with keyword filters
                if isinstance(content, str):
                    continue
                stance = content.find('div', id=re.compile('post_content_.*?'))
                if not stance:
                    continue
                self.floor += 1
                seg = '------------------------------------------ Floor %s ----------------------------------\n' % self.floor
                file.write(seg)
                file.write(self.m_filter.filter(str(stance)) + '\n\n')
        file.close()
        print('Done writing!')
if __name__ == '__main__':
    base_url = ''  # fill in the URL of the Tieba post here
    see_lz = 1
    txt_path = 'c:/users/wgy/desktop/bdtb.txt'
    m_spider = Spider(base_url, see_lz, txt_path)
    m_spider.get_content()
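For reference, here is how the entry point might look once filled in; the post URL and output path are placeholders, not values from the original article:

if __name__ == '__main__':
    base_url = 'http://tieba.baidu.com/p/3138733512'  # hypothetical post URL
    see_lz = 1             # 1 = thread starter's posts only, 0 = all posts
    txt_path = 'bdtb.txt'  # write to the current directory
    Spider(base_url, see_lz, txt_path).get_content()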