Introduction
Just a small Python exercise.
File structure
html_downloader.py - downloads the raw HTML of a given URL
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2


class HtmlDownloader(object):
    def download(self, url):
        # Fetch the page; return None for a missing URL or a non-200 response.
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
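To try the downloader on its own, a minimal sketch looks like this; the URL is only a hypothetical example target (not one taken from this post), and Douban may reject requests that do not carry browser-like headers.

# Standalone use of the downloader; the URL is a hypothetical example target.
import html_downloader

downloader = html_downloader.HtmlDownloader()
page = downloader.download('https://movie.douban.com/chart')
if page is not None:  # download() returns None on failure
    print len(page)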
html_outputer.py - writes the results to a file
#!/usr/bin/python
# -*- coding: utf-8 -*-


class HtmlOutputer(object):
    def collect_data(self, movie_data):
        # Print each record and append it to output.html as a CSV-style line.
        if movie_data is None:
            return
        fout = open('output.html', 'a+')
        for data in movie_data:
            print data['name'] + '|', data['rate'] + '|', data['actor'], '\n'
            fout.write('%s,' % data['name'].encode('utf-8'))
            fout.write('%s,' % data['rate'])
            fout.write('%s\n' % data['actor'].encode('utf-8'))
        fout.close()
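To see what collect_data actually appends to output.html, here is a small sketch that feeds it one hand-made record; the field values are made up purely for illustration.

# Push one made-up record through the outputer to see the CSV-style line it writes.
import html_outputer

outputer = html_outputer.HtmlOutputer()
outputer.collect_data([{'name': u'Example Movie', 'rate': u'8.5', 'actor': u'Director / Lead Actor'}])
# output.html now ends with the line: Example Movie,8.5,Director / Lead Actor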
html_parser.py - parser: walks the HTML DOM tree
#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup


class HtmlParser(object):
    def __init__(self):
        pass

    def parser_html(self, cnt):
        # Parse the downloaded page and extract one record per movie.
        if cnt is None:
            return
        soup = BeautifulSoup(cnt, 'html.parser', from_encoding='utf-8')
        # movie_name, movie_desc, movie_rate =
        return self.get_movie_names(soup)

    def get_movie_names(self, soup):
        movie_data = []
        # Each movie on the chart page sits in its own <table> under div.article.
        movie_all = soup.find('div', class_='article').find_next('table').find_next_sibling('div').find_next_sibling('div').find_all('table')
        count = 1
        for movie_one in movie_all:
            # if count > 2:
            #     break
            movie_data.append(self.get_movie_name(movie_one))
            count += 1
        return movie_data

    def get_movie_name(self, cnt):
        # Pull the title, cast line and rating out of a single movie's <table>.
        info = {}
        soup = BeautifulSoup(str(cnt), 'html.parser', from_encoding='utf-8')
        movie_one = soup.find('tr', class_='item').find_next('td').find_next_sibling('td').find('div', class_='pl2')
        info['name'] = movie_one.find('a').get_text().replace("\n", "").replace(" ", "")
        info['actor'] = movie_one.find('p', class_='pl').get_text().replace("\n", "").replace(" ", "")
        info['rate'] = movie_one.find('div', class_='star clearfix').find('span', class_='rating_nums').get_text()
        return info
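The chained find() calls above encode an assumption about the chart page's markup: each movie sits in its own table, and the second td holds a div.pl2 containing the title link, a p.pl cast line and a div with class "star clearfix" holding the rating. The fragment below is a made-up, minimal illustration of that shape (not real Douban markup), just to exercise get_movie_name in isolation; it assumes html_parser.py is importable from the current directory.

# A made-up HTML fragment shaped like what get_movie_name expects; not real Douban markup.
import html_parser

sample = '''
<table>
  <tr class="item">
    <td><img src="poster.jpg"/></td>
    <td>
      <div class="pl2">
        <a href="#">Example Movie</a>
        <p class="pl">Director / Lead Actor / 2020</p>
        <div class="star clearfix">
          <span class="rating_nums">8.5</span>
        </div>
      </div>
    </td>
  </tr>
</table>
'''

parser = html_parser.HtmlParser()
info = parser.get_movie_name(sample)
print info['name'], info['rate'], info['actor']  # ExampleMovie 8.5 Director/LeadActor/2020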
spider_main.py - main entry point
#!/usr/bin/python
# -*- coding: utf-8 -*-
import html_parser, html_outputer, html_downloader


class SpiderMain(object):
    def __init__(self):
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
        self.downloader = html_downloader.HtmlDownloader()

    def craw(self, url):
        # Download the page, parse out the movie records, then write them out.
        html_cnt = self.downloader.download(url)
        movie_data = self.parser.parser_html(html_cnt)
        self.outputer.collect_data(movie_data)


if __name__ == '__main__':
    url = ''  # fill in the chart page you want to crawl
    spider = SpiderMain()
    spider.craw(url)
To sum up, this really just uses the urllib2 and BeautifulSoup libraries, so there is not much more to say. You can swap in a different url and edit html_parser.py to fit your own crawling needs, and of course you can also edit html_outputer.py to define how the results are stored; right now the format is CSV.
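As one example of redefining the storage format, a JSON-writing variant of the outputer could look roughly like the sketch below; it assumes movie_data is still a list of dicts with unicode 'name', 'rate' and 'actor' values, as produced by html_parser.py above, and writes one JSON object per line to a hypothetical output.json.

# -*- coding: utf-8 -*-
# Sketch of a JSON variant of html_outputer.py; assumes movie_data is a list of
# dicts with unicode 'name', 'rate' and 'actor' values.
import json


class HtmlOutputer(object):
    def collect_data(self, movie_data):
        if movie_data is None:
            return
        with open('output.json', 'a+') as fout:
            for data in movie_data:
                fout.write(json.dumps(data, ensure_ascii=False).encode('utf-8') + '\n')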