Python 學習系列 1：爬取糗事百科段子

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# @time : 2018\8\3 0003 20:38

# @author : aries

# @site :

# @file : getstoriesfromqsbk.py

# @software: pycharm

import urllib
import urllib2

from bs4 import BeautifulSoup

class qsbk:
    """Interactive console reader for Qiushibaike ("糗事百科") stories.

    Pages are fetched one at a time with urllib2 and parsed with
    BeautifulSoup.  Each press of Enter shows the next story; entering
    ``q`` stops the reader.

    This is Python 2 code (``urllib2``, ``raw_input``).  ``print`` is always
    called with a single parenthesised argument so the syntax is also valid
    on Python 3.
    """

    # NOTE(review): the page URL was stripped from the published source.
    # This template is an assumption based on the site name and the CSS
    # selectors used below -- confirm before relying on it.
    page_url = 'https://www.qiushibaike.com/hot/page/%d'

    def __init__(self):
        """Initialise the crawler state."""
        # Number of the next page to download.
        self.pageindex = 1
        self.user_agent = 'mozilla/5.0 (windows nt 6.1; win64; x64)'
        # Request headers; the site is queried with a browser-like agent.
        self.headers = {'User-Agent': self.user_agent}
        # Stories of the current page that have not been shown yet; each
        # entry is [author, text, votes, comments].
        self.contents = []
        # Whether the interactive loop should keep running.
        self.enable = False
        # Total number of pages; 0 until the first page has been parsed.
        self.pageall = 0

    def getpagecode(self, pageindex):
        """Download page ``pageindex`` and return its HTML as unicode.

        Returns None (after printing the cause) when the request fails.
        """
        try:
            request = urllib2.Request(self.page_url % pageindex,
                                      headers=self.headers)
            response = urllib2.urlopen(request)
            # NOTE(review): assumes the site serves UTF-8 -- confirm.
            return response.read().decode('utf-8')
        except urllib2.URLError as e:
            # HTTPError carries ``code``; a plain URLError carries ``reason``.
            if hasattr(e, "code"):
                print(u"連線糗事百科失敗，錯誤原因： %s" % (e.code,))
                return None
            if hasattr(e, "reason"):
                print(u"連線糗事百科失敗，錯誤原因： %s" % (e.reason,))
                return None
            return None

    def getpagecontent(self, pageindex):
        """Parse page ``pageindex`` into a list of stories.

        Returns a list of ``[author, text, votes, comments]`` string lists,
        or None when the page could not be downloaded or does not have the
        expected layout.  On the first successful call the total page count
        is stored in ``self.pageall``.
        """
        pagecode = self.getpagecode(pageindex)
        if not pagecode:
            print("頁面載入失敗...")
            return None

        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(pagecode, "html.parser")
        nodes = soup.select('#content-left')
        if not nodes:
            # Unexpected layout: report it instead of raising IndexError.
            print("頁面載入失敗...")
            return None
        node = nodes[0]

        authors = node.select('h2')
        bodies = node.select('div.content')
        # Number of "funny" votes for each story.
        votes = node.select('.stats-vote .number')
        comments = node.select('.stats-comments .number')

        # The last pager entry holds the total number of pages.
        if self.pageall == 0:
            page_numbers = node.select('span.page-numbers')
            if not page_numbers:
                print("頁面載入失敗...")
                return None
            self.pageall = int(page_numbers[-1].get_text().strip())

        # zip() stops at the shortest list, so a story that lacks one of
        # the fields cannot trigger an IndexError.
        return [
            [a.get_text().strip(), b.get_text().strip(),
             v.get_text().strip(), c.get_text().strip()]
            for a, b, v, c in zip(authors, bodies, votes, comments)
        ]

    def getonestory(self, content):
        """Print one story, then wait for the user to press Enter.

        ``content`` is one ``[author, text, votes, comments]`` entry.
        Entering ``q`` clears ``self.enable``; any other input triggers
        ``loadpage`` so the next page is fetched once the current one is
        exhausted.
        """
        if self.pageindex <= self.pageall + 1:
            author, text, votes, comments = content
            # NOTE(review): the original print statement was lost from the
            # published source; this output format is a reconstruction.
            # ``pageindex`` already points at the next page, hence the -1.
            print(u"第%d頁\t發布人：%s\t好笑：%s\t評論：%s\n%s"
                  % (self.pageindex - 1, author, votes, comments, text))
            key = raw_input()
            if key == "q":
                self.enable = False
                return
            self.loadpage()
        else:
            print("\n已載入全部頁面內容，請按q退出！")
            key = raw_input()
            if key == "q":
                self.enable = False
                return

    def loadpage(self):
        """Fetch the next page when running and no stories are queued."""
        if not self.enable or self.contents:
            return
        pagecontent = self.getpagecontent(self.pageindex)
        if pagecontent:
            self.contents = pagecontent
            self.pageindex += 1
        else:
            print("頁面載入失敗...")

    def start(self):
        """Run the interactive loop until the user quits or stories run out."""
        print("正在讀取糗事百科，按回車檢視新段子，退出請按q")
        self.enable = True
        # Load the first page (this also determines the page count).
        self.loadpage()
        while self.enable:
            if not self.contents:
                # getonestory() refills the queue after every story, so an
                # empty queue here means loading failed or nothing is left.
                # Stop instead of spinning forever.
                self.enable = False
                break
            # Take the next story off the queue and show it.
            self.getonestory(self.contents.pop(0))

# Script entry: create the crawler and run its start() method.
spider=qsbk()

spider.start()

python 爬取糗事百科

Step 1：構建一個提取糗事百科笑話的函式 import urllib2 import urllib import re import thread import time import sys reload sys sys.setdefaultencoding utf 8 defgetpage p...

Python爬取糗事百科

一引入模組因為urlopen功能比較簡單，所以設定 ip需引入proxyhandler和build opener模組，ip的獲取可以上西祠查詢 import re from urllib.request import request,build opener,proxyhandler base...

Python 爬取糗事百科

coding utf 8 import urllib2 import urllib import re class qiushi def init self self.page 1 從網頁獲取糗事 def getqiushis self,page url page 偽裝瀏覽器 user agent ...

python學習系列1 爬取糗事百科段子

python 爬取糗事百科

Python爬取糗事百科

Python 爬取糗事百科

相關推薦