Python 三種方法爬取糗事百科的耗時對比

2021-08-14 19:45:21 字數 2423 閱讀 3011

# -*- coding: utf-8 -*-

"""created on fri jan 19 22:59:33 2018

@author: administrator

"""import requests

import time

# Request headers shared by every scraper in this file (passed to requests.get).
# NOTE(review): the original right-hand side was lost from the source; a
# browser-like User-Agent is assumed here -- confirm the intended value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}

import re

def re_scraper(url):
    """Scrape one listing page using regular expressions.

    Fetches ``url`` with the module-level ``headers`` and extracts, from the
    raw HTML, each post's author nickname, text, "好笑" (funny) count and
    comment count via ``re.findall``.

    NOTE(review): in the copy of this function under review the HTML tags had
    been stripped out of the regex literals, the line building the comment
    list was missing, and ``info=`` had no right-hand side.  The tag names
    used below are reconstructed from the selectors in ``bs_scraper`` and
    ``xp_scraper`` (``h2``, ``span``, ``i`` with class ``number``) -- confirm
    them against the real page markup.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list with one dict per post, keys ``id``, ``text``, ``haoxiao`` and
        ``pinglun``; every value is a string.  Empty if nothing matched.
    """
    res = requests.get(url, headers=headers)
    html = res.text

    # \s* absorbs the newlines around the nickname, so re.S is not needed.
    authors = re.findall(r'<h2>\s*(.*?)\s*</h2>', html)
    # The trailing [:-1] is kept from the source: the last <span> match is
    # dropped (presumably it is not a post body -- TODO confirm).
    texts = re.findall(r'<span>\s+(.*?)\s+</span>', html)[:-1]
    funny_counts = re.findall(r'<i class="number">(\d+)</i> 好笑', html)
    # Accept both the simplified and traditional spelling of "comments".
    # NOTE(review): a post without a comment counter yields no match here, so
    # later counts shift up by one; zip() only hides the length mismatch.
    comment_counts = re.findall(r'<i class="number">(\d+)</i>\s*(?:评论|評論)', html)

    info = [
        {'id': author, 'text': text, 'haoxiao': funny, 'pinglun': comments}
        for author, text, funny, comments
        in zip(authors, texts, funny_counts, comment_counts)
    ]
    return info

from bs4 import beautifulsoup

def bs_scraper(url):
    """Scrape one listing page using BeautifulSoup.

    Fetches ``url`` with the module-level ``headers``, parses it with the
    ``html.parser`` backend and collects, per post, the author nickname
    (``h2``), the text (``div.content``), the funny count (``span.stats-vote``)
    and the comment count.

    NOTE(review): the line defining ``lun_soup``, the body of the
    ``if pinglun != none`` branch and the value of ``info=`` were missing from
    the copy under review.  ``span.stats-comments`` is an assumption based on
    the ``div[2]/span[2]/a/i`` path used by ``xp_scraper`` -- confirm.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list with one dict per post, keys ``id``, ``text``, ``haoxiao`` and
        ``pinglun``; the two counts are ints.  Empty if nothing matched.
    """
    # The class is exported as ``BeautifulSoup`` (case-sensitive); importing
    # it here keeps this function usable regardless of the file-level import.
    from bs4 import BeautifulSoup

    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')

    # Post text
    text_list = [tag.text.strip() for tag in soup.find_all('div', class_='content')]
    # Nickname
    id_list = [tag.text.strip() for tag in soup.find_all('h2')]
    # Funny count
    xiao_list = [int(tag.i.text) for tag in soup.find_all('span', class_='stats-vote')]

    # Comment count.  Some holders have no <i class="number"> child (find()
    # returns None).  Those are recorded as 0 instead of being skipped, so the
    # list stays aligned with the other three when zipped.
    lun_soup = soup.find_all('span', class_='stats-comments')
    ping_list = []
    for holder in lun_soup:
        number = holder.find('i', class_='number')
        ping_list.append(int(number.text) if number is not None else 0)

    info = [
        {'id': author, 'text': text, 'haoxiao': funny, 'pinglun': comments}
        for author, text, funny, comments
        in zip(id_list, text_list, xiao_list, ping_list)
    ]
    return info

from lxml import etree

def xp_scraper(url):
    """Scrape one listing page using lxml and XPath.

    Fetches ``url`` with the module-level ``headers``, parses it with
    ``etree.HTML`` and walks every post container whose class ends in
    ``typs_long``, ``typs_hot`` or ``typs_old``, in that order.

    NOTE(review): the values of ``zong_list=`` and ``info=`` and the operand of
    ``if id != :`` were missing from the copy under review; the dict layout
    returned here is a reconstruction -- confirm.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list with one dict per post, keys ``id``, ``text``, ``haoxiao`` and
        ``pinglun``; every value is a stripped string.  A post without a
        nickname node gets ``'匿名'``; a missing text node gives ``''`` and a
        missing counter gives ``'0'`` instead of raising ``IndexError``.
    """
    res = requests.get(url, headers=headers)
    tree = etree.HTML(res.text)

    def first_text(node, path, default):
        # Return the first stripped XPath hit relative to node, else default.
        hits = node.xpath(path)
        return hits[0].strip() if hits else default

    info = []
    # Post containers come in three class variants; each XPath selects the
    # whole container of a single post.
    for kind in ('long', 'hot', 'old'):
        container_path = '//div[@class="article block untagged mb15 typs_{}"]'.format(kind)
        for post in tree.xpath(container_path):
            info.append({
                'id': first_text(post, 'div[1]/a[2]/h2/text()', '匿名'),
                'text': first_text(post, 'a[1]/div/span/text()', ''),
                'haoxiao': first_text(post, 'div[2]/span[1]/i/text()', '0'),
                'pinglun': first_text(post, 'div[2]/span[2]/a/i/text()', '0'),
            })
    return info

if __name__ == '__main__':
    # Benchmark: run each scraper over pages 1..30 and print the elapsed
    # wall-clock seconds per scraper.
    # NOTE(review): the URL template was stripped to '' in the copy under
    # review (so every request would fail); the address below is an
    # assumption -- confirm the intended listing URL.
    page_url = 'https://www.qiushibaike.com/text/page/{}/'

    for name, scraper in [('re', re_scraper), ('bs', bs_scraper), ('xp', xp_scraper)]:
        # perf_counter is monotonic, so wall-clock adjustments cannot skew
        # the measured interval.
        start = time.perf_counter()
        for i in range(1, 31):
            scraper(page_url.format(i))
        end = time.perf_counter()
        print(name, end - start)

結果如下(各方法的總耗時,單位:秒):

re 11.39765214920044

bs 21.445226430892944

xp 11.944683313369751

python 爬取糗事百科

step 1 構建一個提取糗事百科笑話的函式 import urllib2 import urllib import re import thread import time import sys reload sys sys.setdefaultencoding utf 8 defgetpage p...

Python爬取糗事百科

一 引入模組 因為urlopen功能比較簡單,所以設定 ip需引入proxyhandler和build opener模組,ip的獲取可以上西祠 查詢 import re from urllib.request import request,build opener,proxyhandler base...

Python 爬取糗事百科

coding utf 8 import urllib2 import urllib import re class qiushi def init self self.page 1 從網頁獲取糗事 def getqiushis self,page url page 偽裝瀏覽器 user agent ...