#!/usr/bin/env python
# coding=utf-8
# @author: holley
# @file: baike1.py
# @datetime: 4/12/2018 14:32
'''description:
'''
import csv
import re

import requests
from bs4 import BeautifulSoup
from lxml import etree
class crawler(object):
    """Fetch joke listing pages and append the parsed entries to a CSV file.

    Intended flow: ``get_html`` downloads one page, ``get_contents`` turns it
    into rows, and ``write_csv`` appends those rows to ``self.csvfilename``.

    NOTE(review): this code reached review with several literals stripped
    (the request headers, the gender regex and the ``class_`` filters of the
    ``find`` calls) and with library identifiers lower-cased. The three class
    attributes below are reconstructions; confirm them against the real page
    markup before relying on the output.
    """

    # Captures the text between 'Gender ' and 'Icon' in an entry's markup.
    # Rebuilt from the original split('gender ')[1].split('icon')[0]
    # post-processing, read in its capitalised form - TODO confirm.
    gender_pattern = re.compile(r'Gender (.*?)Icon', re.S)
    # Presumably the <div> carrying the gender class also holds the age as
    # its text - TODO confirm.
    age_div_class = re.compile('Gender')
    # Presumably the CSS class of the <div> wrapping the joke text - TODO confirm.
    joke_div_class = 'content'

    def __init__(self):
        """Set up the crawler with empty URLs and default request settings."""
        self.base_url = ''
        self.start_url = ''
        self.csvfilename = 'qiushibaike.csv'
        # NOTE(review): the original header dict is missing from the reviewed
        # source; an empty dict leaves the requests defaults in place.
        self.headers = {}
        # get_html reads self.proxies, so it must exist; None means "no
        # explicit proxy configuration".
        self.proxies = None

    # Disabled helper kept from the original (resolves the "next page" link):
    # def get_urls(self, url):
    #     html = requests.get(url, proxies=self.proxies).text
    #     soup = BeautifulSoup(html, 'lxml')
    #     button = soup.find('ul', ).find_all('li')[-1]
    #     url = button.find('a')['href']
    #     return self.base_url + url

    def get_html(self, url):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.

        Returns:
            The decoded response body.
        """
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(
            url, headers=self.headers, proxies=self.proxies, timeout=10
        )
        return response.text

    def get_contents(self, html):
        """Parse one listing page into CSV-ready rows.

        Args:
            html: Page markup as returned by ``get_html``.

        Returns:
            A list of ``[user_id, gender, age, joke]`` lists, one for each
            ``//*[@id="content-left"]/div`` element in which a joke was found.
            Empty markup yields an empty list.
        """
        if not html:
            return []
        selector = etree.HTML(html)
        data_list = []
        # Treat every child <div> as one entry and parse it in isolation so
        # the fields of different entries stay paired with each other.
        for part in selector.xpath('//*[@id="content-left"]/div'):
            string_soup = str(etree.tostring(part), encoding="utf-8")
            soup = BeautifulSoup(string_soup, 'lxml')
            try:
                # User name
                user_id = soup.find('h2').string.strip()
                # User gender
                gender = self.gender_pattern.search(string_soup).group(1)
                # User age
                age = soup.find('div', class_=self.age_div_class).string
            except AttributeError:
                # One of the lookups returned None (presumably an anonymous
                # post); fall back to the placeholder values.
                user_id, gender, age = '匿名使用者', ' ', ' '
            # Joke text; entries without one are skipped instead of crashing.
            joke_div = soup.find('div', class_=self.joke_div_class)
            joke_span = joke_div.find('span') if joke_div is not None else None
            if joke_span is None:
                continue
            joke = joke_span.get_text().strip()
            data_list.append([user_id, gender, age, joke])
        return data_list

    def write_csv(self, data):
        """Append ``data`` to the CSV file named by ``self.csvfilename``.

        Args:
            data: Iterable of rows, each row an iterable of cell values.
        """
        # Append mode: the script calls this once per page, so earlier pages
        # must not be overwritten. newline='' is what the csv module expects
        # of the file object it writes to.
        with open(self.csvfilename, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(data)
if __name__ == '__main__':
    # Crawl listing pages 1 through 13 and write each page's rows to the CSV.
    # NOTE(review): base_url is an empty string here, so the requested URLs
    # are just '1/', '2/', ... - presumably the site prefix belongs here;
    # confirm before running.
    base_url = ''
    c = crawler()
    for page_number in range(1, 14):
        page_url = '{}{}/'.format(base_url, page_number)
        rows = c.get_contents(c.get_html(page_url))
        c.write_csv(rows)
Scrapy 爬取糗事百科段子
1. Python 爬蟲實戰一之爬取糗事百科段子 2. 在工作目錄建立 myproject:scrapy startproject myproject 3. 編寫 myproject/myproject/items.py:# -*- coding: utf-8 -*- # Define here the models for your ...
爬取糗事百科,朗讀段子
一閒下來就不務正業了,寫個爬蟲,聽段子。額,Mac 自帶的語音朗讀,Windows 我就不知道啦,有興趣的可以去研究一下哈。環境:Python 2.7、macOS 10.12。使用朗讀的程式碼:from subprocess import call; call(["say", "hello pengge"])。當然了,聽起來...
爬取糗事百科段子內容
import requests, sqlite3, re;class ProcessDataTool(object):資料處理的工具類。工具類中一般不寫 __init__ 初始化屬性,只封裝工具方法對資料進行操作。工具類中的方法一般是以類方法居多。@classmethod def process_data(cls, ...