# -*- coding: utf-8 -*-
"""Created on Fri Jan 19 22:59:33 2018.

@author: administrator

Benchmark three scraping approaches (regex / BeautifulSoup / lxml XPath)
against the qiushibaike.com listing pages.
"""
import re
import time

import requests

# NOTE(review): the original headers dict was lost in extraction; the site
# blocks requests without a browser-like User-Agent, so one is reconstructed
# here — confirm against the original tutorial if available.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/63.0.3239.132 Safari/537.36'),
}
def re_scraper(url):
    """Scrape one listing page using plain regular expressions.

    Parameters
    ----------
    url : str
        A qiushibaike listing-page URL.

    Returns
    -------
    list[dict]
        One dict per joke with keys ``id`` (nickname), ``text`` (joke body),
        ``haoxiao`` (funny count) and ``pinglun`` (comment count).
    """
    res = requests.get(url, headers=headers)
    # NOTE(review): the original regex patterns were stripped by extraction;
    # these are reconstructed from the page structure implied by the bs4 and
    # XPath versions below — verify against the live HTML. The '好笑' pattern
    # is the only one that survived in the source.
    ids = re.findall(r'<h2>\s*(.*?)\s*</h2>', res.text)
    texts = re.findall(r'<div class="content">\s*<span>\s*(.*?)\s*</span>',
                       res.text, re.S)
    haoxiao = re.findall(r'(\d+) 好笑', res.text)
    pinglun = re.findall(r'<i class="number">(\d+)</i> 评论', res.text)
    results = []
    for nick, text, xiao, ping in zip(ids, texts, haoxiao, pinglun):
        results.append({'id': nick, 'text': text,
                        'haoxiao': xiao, 'pinglun': ping})
    return results
from bs4 import BeautifulSoup  # fixed: class name is CamelCase


def bs_scraper(url):
    """Scrape one listing page using BeautifulSoup.

    Parameters
    ----------
    url : str
        A qiushibaike listing-page URL.

    Returns
    -------
    list[dict]
        One dict per joke with keys ``id``, ``text``, ``haoxiao``, ``pinglun``.
    """
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Joke body text.
    text_list = [tag.text.strip() for tag in soup.find_all('div', class_='content')]
    # Poster nicknames.
    id_list = [tag.text.strip() for tag in soup.find_all('h2')]
    # "Funny" vote counts.
    xiao_list = [int(tag.i.text) for tag in soup.find_all('span', class_='stats-vote')]
    # Comment counts; some entries carry no <i class="number"> child (the
    # original code special-cased the resulting None), so substitute 0.
    lun_soup = soup.find_all('span', class_='stats-comments')
    ping_list = []
    for tag in lun_soup:
        num = tag.find('i', 'number')
        ping_list.append(int(num.text) if num is not None else 0)
    results = []
    for nick, text, xiao, ping in zip(id_list, text_list, xiao_list, ping_list):
        results.append({'id': nick, 'text': text,
                        'haoxiao': xiao, 'pinglun': ping})
    return results
from lxml import etree


def xp_scraper(url):
    """Scrape one listing page using lxml XPath.

    Parameters
    ----------
    url : str
        A qiushibaike listing-page URL.

    Returns
    -------
    list[dict]
        One dict per joke with keys ``id``, ``text``, ``haoxiao``, ``pinglun``.
    """
    res = requests.get(url, headers=headers)
    tree = etree.HTML(res.text)  # fixed: the parser function is etree.HTML
    # Items come in three flavours; build one container XPath per flavour.
    # (The original built this list in a loop but never appended to it.)
    paths = [
        '//div[@class="article block untagged mb15 typs_{}"]'.format(tt)
        for tt in ('long', 'hot', 'old')
    ]
    results = []
    for path in paths:
        for each in tree.xpath(path):
            nick = each.xpath('div[1]/a[2]/h2/text()')
            # Anonymous posters have no nickname node.
            nick = nick[0].strip() if nick else '匿名'
            text = each.xpath('a[1]/div/span/text()')[0].strip()
            haoxiao = each.xpath('div[2]/span[1]/i/text()')[0].strip()
            pinglun = each.xpath('div[2]/span[2]/a/i/text()')[0].strip()
            results.append({'id': nick, 'text': text,
                            'haoxiao': haoxiao, 'pinglun': pinglun})
    return results
if __name__ == '__main__':
    # Time each scraper over the first 30 listing pages and print the totals.
    for name, scraper in [('re', re_scraper), ('bs', bs_scraper), ('xp', xp_scraper)]:
        start = time.time()
        for page in range(1, 31):
            # NOTE(review): the original URL literal was stripped during
            # extraction; this is the standard qiushibaike listing-page
            # pattern the tutorial targets — confirm before running.
            url = 'https://www.qiushibaike.com/8hr/page/{}/'.format(page)
            scraper(url)
        end = time.time()
        print(name, end - start)
抓取 30 頁的總耗時如下(單位:秒):
re 11.39765214920044
bs 21.445226430892944
xp 11.944683313369751
python 爬取糗事百科
step 1 構建乙個提取糗事百科笑話的函式import urllib2 import urllib import re import thread import time import sys reload sys sys.setdefaultencoding utf 8 defgetpage p...
Python爬取糗事百科
一 引入模組 因為urlopen功能比較簡單,所以設定 ip需引入proxyhandler和build opener模組,ip的獲取可以上西祠 查詢 import re from urllib.request import request,build opener,proxyhandler base...
Python 爬取糗事百科
coding utf 8 import urllib2 import urllib import re class qiushi def init self self.page 1 從網頁獲取糗事 def getqiushis self,page url page 偽裝瀏覽器 user agent ...