Introduction
Just a small Python exercise.
File structure
html_downloader.py - downloads the raw HTML of a given URL
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2


class HtmlDownloader(object):
    def download(self, url):
        # Fetch the page; return None for a missing URL or a non-200 response.
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
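To try the downloader on its own, a minimal sketch looks like this; the URL is only a hypothetical example target (not one taken from this post), and Douban may reject requests that do not carry browser-like headers.

# Standalone use of the downloader; the URL is a hypothetical example target.
import html_downloader

downloader = html_downloader.HtmlDownloader()
page = downloader.download('https://movie.douban.com/chart')
if page is not None:  # download() returns None on failure
    print len(page)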
html_outputer.py - writes the results to a file
#!/usr/bin/python
# -*- coding: utf-8 -*-


class HtmlOutputer(object):
    def collect_data(self, movie_data):
        # Print each record and append it to output.html as a CSV-style line.
        if movie_data is None:
            return
        fout = open('output.html', 'a+')
        for data in movie_data:
            print data['name'] + '|', data['rate'] + '|', data['actor'], '\n'
            fout.write('%s,' % data['name'].encode('utf-8'))
            fout.write('%s,' % data['rate'])
            fout.write('%s\n' % data['actor'].encode('utf-8'))
        fout.close()
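To see what collect_data actually appends to output.html, here is a small sketch that feeds it one hand-made record; the field values are made up purely for illustration.

# Push one made-up record through the outputer to see the CSV-style line it writes.
import html_outputer

outputer = html_outputer.HtmlOutputer()
outputer.collect_data([{'name': u'Example Movie', 'rate': u'8.5', 'actor': u'Director / Lead Actor'}])
# output.html now ends with the line: Example Movie,8.5,Director / Lead Actor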
html_parser.py - parser: walks the HTML DOM tree
#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup


class HtmlParser(object):
    def __init__(self):
        pass

    def parser_html(self, cnt):
        # Parse the downloaded page and extract one record per movie.
        if cnt is None:
            return
        soup = BeautifulSoup(cnt, 'html.parser', from_encoding='utf-8')
        # movie_name, movie_desc, movie_rate =
        return self.get_movie_names(soup)

    def get_movie_names(self, soup):
        movie_data = []
        # Each movie on the chart page sits in its own <table> under div.article.
        movie_all = soup.find('div', class_='article').find_next('table').find_next_sibling('div').find_next_sibling('div').find_all('table')
        count = 1
        for movie_one in movie_all:
            # if count > 2:
            #     break
            movie_data.append(self.get_movie_name(movie_one))
            count += 1
        return movie_data

    def get_movie_name(self, cnt):
        # Pull the title, cast line and rating out of a single movie's <table>.
        info = {}
        soup = BeautifulSoup(str(cnt), 'html.parser', from_encoding='utf-8')
        movie_one = soup.find('tr', class_='item').find_next('td').find_next_sibling('td').find('div', class_='pl2')
        info['name'] = movie_one.find('a').get_text().replace("\n", "").replace(" ", "")
        info['actor'] = movie_one.find('p', class_='pl').get_text().replace("\n", "").replace(" ", "")
        info['rate'] = movie_one.find('div', class_='star clearfix').find('span', class_='rating_nums').get_text()
        return info
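The chained find() calls above encode an assumption about the chart page's markup: each movie sits in its own table, and the second td holds a div.pl2 containing the title link, a p.pl cast line and a div with class "star clearfix" holding the rating. The fragment below is a made-up, minimal illustration of that shape (not real Douban markup), just to exercise get_movie_name in isolation; it assumes html_parser.py is importable from the current directory.

# A made-up HTML fragment shaped like what get_movie_name expects; not real Douban markup.
import html_parser

sample = '''
<table>
  <tr class="item">
    <td><img src="poster.jpg"/></td>
    <td>
      <div class="pl2">
        <a href="#">Example Movie</a>
        <p class="pl">Director / Lead Actor / 2020</p>
        <div class="star clearfix">
          <span class="rating_nums">8.5</span>
        </div>
      </div>
    </td>
  </tr>
</table>
'''

parser = html_parser.HtmlParser()
info = parser.get_movie_name(sample)
print info['name'], info['rate'], info['actor']  # ExampleMovie 8.5 Director/LeadActor/2020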
spider_main.py - main entry point
#!/usr/bin/python
# -*- coding: utf-8 -*-
import html_parser, html_outputer, html_downloader


class SpiderMain(object):
    def __init__(self):
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
        self.downloader = html_downloader.HtmlDownloader()

    def craw(self, url):
        # Download the page, parse out the movie records, then write them out.
        html_cnt = self.downloader.download(url)
        movie_data = self.parser.parser_html(html_cnt)
        self.outputer.collect_data(movie_data)


if __name__ == '__main__':
    url = ''  # fill in the chart page you want to crawl
    spider = SpiderMain()
    spider.craw(url)
To sum up, this really just uses the urllib2 and BeautifulSoup libraries, so there is not much more to say. You can swap in a different url and edit html_parser.py to fit your own crawling needs, and of course you can also edit html_outputer.py to define how the results are stored; right now the format is CSV.
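As one example of redefining the storage format, a JSON-writing variant of the outputer could look roughly like the sketch below; it assumes movie_data is still a list of dicts with unicode 'name', 'rate' and 'actor' values, as produced by html_parser.py above, and writes one JSON object per line to a hypothetical output.json.

# -*- coding: utf-8 -*-
# Sketch of a JSON variant of html_outputer.py; assumes movie_data is a list of
# dicts with unicode 'name', 'rate' and 'actor' values.
import json


class HtmlOutputer(object):
    def collect_data(self, movie_data):
        if movie_data is None:
            return
        with open('output.json', 'a+') as fout:
            for data in movie_data:
                fout.write(json.dumps(data, ensure_ascii=False).encode('utf-8') + '\n')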