貼吧:實現:
# !/usr/bin/env python
# -*- encoding:utf-8 -*-
import re
import time
import urllib
import urllib.error
import urllib.request
#文字處理類
class tools:
    """Clean the HTML fragment of a single post into plain text.

    NOTE(review): the three pattern literals in the original listing were
    emptied by HTML stripping ('' and '|' match only the empty string, and
    the third literal was left unterminated).  The patterns below are
    reconstructed from the attribute names -- confirm against the markup
    actually served by the site.
    """

    # <img ...> tags are dropped entirely.
    removeimg = re.compile(r'<img\b[^>]*>')
    # Opening and closing anchor tags are dropped; the link text is kept.
    removea = re.compile(r'<a\b[^>]*>|</a>')
    # <br>, <br/> and <br /> become newlines.
    replacebr = re.compile(r'<br\s*/?>')

    def replace(self, x):
        """Return *x* with images and anchors removed and <br> turned into newlines.

        Args:
            x: HTML fragment (str) of one post body.

        Returns:
            The cleaned text with leading/trailing whitespace stripped.
        """
        x = self.removeimg.sub("", x)
        x = self.removea.sub("", x)
        x = self.replacebr.sub("\n", x)
        return x.strip()
#實現爬取類
class bdtb:
    """Download a Baidu Tieba thread page by page and save its posts to a text file.

    NOTE(review): the regular expressions, the request headers and the thread
    URL in the original listing were lost when the page was extracted (HTML-like
    text was stripped from the string literals).  The values below are
    reconstructions -- confirm them against the live page markup before use.
    """

    # Thread title: inner text of the heading carrying the "core_title_txt" class.
    _P_TITLE = re.compile(
        r'<h[1-3][^>]*class="core_title_txt[^>]*>(.*?)</h[1-3]>', re.S)
    # Page count: second <span> inside the "l_reply_num" list item.
    _P_PAGENUM = re.compile(
        r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    # Post body: contents of each <div id="post_content_...">.
    _P_CONTENT = re.compile(r'<div id="post_content_[^>]*>(.*?)</div>', re.S)
    # Characters that are not safe inside a file name on common platforms.
    _P_BAD_FILENAME = re.compile(r'[\\/:*?"<>|\r\n]+')

    def __init__(self, base_url=''):
        """Create a crawler.

        Args:
            base_url: URL prefix to which the 1-based page number is appended.
                Defaults to '' because the original literal was empty; a
                real run needs something shaped like
                'https://tieba.baidu.com/p/<thread id>?pn=' (presumably --
                verify the query parameters against the site).
        """
        self.base_url = base_url
        # NOTE(review): original header dict was lost; a minimal User-Agent is assumed.
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.filename = "bdtb.txt"
        self.contents = []
        self.tools = tools()

    def get_html(self, url):
        """Fetch *url* and return the body decoded as UTF-8, or None on failure.

        URL errors (including HTTP errors) and malformed URLs are reported on
        stdout instead of propagating.
        """
        try:
            myreq = urllib.request.Request(url, headers=self.headers)
            with urllib.request.urlopen(myreq) as myresponse:
                return myresponse.read().decode('utf-8')
        except (urllib.error.URLError, ValueError) as e:
            # e.reason may itself be an exception object, so stringify it
            # before concatenating; ValueError has no .reason at all.
            print("未連線,原因:" + str(getattr(e, 'reason', e)))
            return None

    # Extract the thread title.
    def get_title(self, html):
        """Return the thread title found in *html*, or None if absent."""
        found = self._P_TITLE.search(html)
        return found.group(1).strip() if found else None

    # Extract the number of pages in the thread.
    def get_pagenum(self, html):
        """Return the page count found in *html* as a string, or None if absent."""
        found = self._P_PAGENUM.search(html)
        return found.group(1).strip() if found else None

    # Collect the post bodies.
    def get_content(self, html):
        """Append every cleaned post body in *html* to self.contents.

        Returns:
            The list of post texts extracted from this page (each wrapped in
            a leading and trailing newline).
        """
        page_contents = ["\n" + self.tools.replace(item) + "\n"
                         for item in self._P_CONTENT.findall(html)]
        self.contents.extend(page_contents)
        return page_contents

    # Write the result file.
    def writedata(self, pagenum, title, contents):
        """Write *contents* to '<title>.txt' (or self.filename when title is empty).

        Args:
            pagenum: total number of pages, written in the header line.
            title: thread title used to derive the file name; may be None/''.
            contents: iterable of post texts, numbered from floor 1.
        """
        if title:
            # Titles may contain path separators or other reserved characters.
            self.filename = self._P_BAD_FILENAME.sub('_', title) + '.txt'
        # Explicit UTF-8: the platform default may be unable to encode the text.
        with open(self.filename, 'w', encoding='utf-8') as f:
            f.write("本貼共" + str(pagenum) + "頁:\n")
            for floor, content in enumerate(contents, 1):
                f.write("\n第" + str(floor) + "樓:" + 100 * "-" + "\n")
                f.write(content)

    def start(self):
        """Crawl every page of the thread and write the collected posts once."""
        self.contents = []
        html = self.get_html(self.base_url + str(1))
        if html is None:
            return
        title = self.get_title(html)
        pagenum_text = self.get_pagenum(html)
        # Fall back to a single page when the counter is missing or not numeric.
        pagenum = int(pagenum_text) if pagenum_text and pagenum_text.isdigit() else 1
        for pageindex in range(1, pagenum + 1):
            if pageindex > 1:
                html = self.get_html(self.base_url + str(pageindex))
                if html is None:
                    # Keep what was collected so far instead of crashing.
                    break
            self.get_content(html)
        self.writedata(pagenum, title, self.contents)
# Run one full crawl and report the elapsed wall-clock time.
spider = bdtb()
starttime = time.time()
spider.start()
endtime = time.time()
elapsed = endtime - starttime
print("用時:%.2f seconds" % (elapsed,))
Python3爬蟲爬取百度貼吧
1.需求分析 為了爬取貼吧中樓主所發表的帖子,並把內容提取出來儲存到txt檔案中。2.全部程式碼 這份程式碼寫的比較早,所以裡面提取內容基本上用的全是正規表示式,並沒有呼叫一些非常高階的包。程式碼如下 coding utf 8 import urllib.request import urllib.parse im...
Python3 爬蟲 抓取百度貼吧
前言 天象獨行 import os,urllib.request,urllib.parse 測試要求 1 輸入吧名,首頁,結束頁進行爬蟲。2 建立一個以吧名為名字的資料夾,裡面是每一頁的html的內容,檔名格式 吧名 page.html url ba name input home page int...
爬取百度貼吧
import urllib.request import urllib.parse import os,time 輸入貼吧名字 baname input 請輸入貼吧的名字 start page int input 請輸入起始頁 end page int input 請輸入結束頁 不完整的url ur...