The detail URL of every movie in a category can be found in the JSON returned by Douban's AJAX endpoint.
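Before writing the spider it helps to look at that JSON directly. The sketch below does so with requests; the endpoint template is an assumption (the commonly used search_subjects URL), since the original post never prints it:

inspect_ajax.py (sketch):

import json
from urllib.parse import quote

import requests

# Assumed endpoint: the category tag and page offset are the two format slots.
AJAX_URL = ("https://movie.douban.com/j/search_subjects"
            "?type=movie&tag={}&sort=recommend&page_limit=20&page_start={}")

resp = requests.get(AJAX_URL.format(quote("热门"), 0),
                    headers={"User-Agent": "Mozilla/5.0"})
for subject in json.loads(resp.text)["subjects"]:
    # Each subject carries the detail-page link that the spider follows below.
    print(subject["title"], subject["url"])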
spider.py:
# -*- coding: utf-8 -*-
import json

import scrapy
from urllib.parse import quote

from sec_douban.items import SecDoubanItem


class SpiderMovieSpider(scrapy.Spider):
    name = 'spidermovie'
    allowed_domains = ['movie.douban.com']
    page_start = 0
    category_index = 0
    # NOTE: the URL template and the tag list were stripped from the original
    # post; the values below are assumptions based on Douban's public
    # search_subjects AJAX endpoint and some of its common movie tags.
    url = ("https://movie.douban.com/j/search_subjects"
           "?type=movie&tag={}&sort=recommend&page_limit=20&page_start={}")
    category = ['热门', '最新', '经典', '豆瓣高分', '冷门佳片']
    start_urls = [url.format(quote(category[category_index]), str(page_start))]
    def parse(self, response):
        if self.category_index < len(self.category):
            movie_list = json.loads(response.text)['subjects']
            # If the returned list is non-empty, follow the (up to) 20 detail
            # URLs, then request the next page of the same category.
            if len(movie_list) >= 1:
                print("233:" + response.url)  # debug marker
                for each in movie_list:
                    # print(each['url'])
                    yield scrapy.Request(each['url'], callback=self.myparse)
                self.page_start += 20
                yield scrapy.Request(self.url.format(quote(self.category[self.category_index]),
                                                     str(self.page_start)),
                                     callback=self.parse)
            # An empty list means this category is exhausted; switch to the next one.
            else:
                self.page_start = 0
                self.category_index += 1
                if self.category_index < len(self.category):
                    yield scrapy.Request(self.url.format(quote(self.category[self.category_index]),
                                                         str(self.page_start)),
                                         callback=self.parse)
                    print(self.category[self.category_index] + ':' + str(self.page_start) + '\n')
    def myparse(self, response):
        # Parse one movie detail page; "暫無" ("not available") is the fallback
        # value for fields that are missing from the page.
        for each in response.xpath("//div[@id='content']"):
            item = SecDoubanItem()
            item_director = each.xpath(".//span/a[contains(@rel,'v:directedBy')]/text()").extract()
            if len(item_director) > 0:
                item['director'] = item_director[0]
            else:
                item['director'] = "暫無"
            item['moviename'] = each.xpath("//span[@property='v:itemreviewed']/text()").extract()[0]
            item['type'] = each.xpath(".//span[@property='v:genre']/text()").extract()
            item_year = each.xpath(".//span[@class='year']/text()").extract()
            if len(item_year) > 0:
                # Strip the surrounding parentheses, e.g. "(2019)" -> "2019".
                item['year'] = item_year[0][1:-1]
            else:
                item['year'] = "暫無"
            item['grade'] = each.xpath("//div[@id='content']//strong[@class='ll rating_num']/text()").extract()[0]
            item['url'] = response.url
            item_summary = each.xpath(".//span[@property='v:summary']/text()").extract()
            if len(item_summary) > 0:
                item['summary'] = item_summary[0].strip()
            else:
                item['summary'] = "暫無"
            yield item
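The spider imports SecDoubanItem from sec_douban.items, which the post never shows. A minimal sketch of that items.py, inferred from the fields the spider assigns above, could look like this:

items.py (sketch):

import scrapy


class SecDoubanItem(scrapy.Item):
    # One Field per key assigned in myparse above.
    director = scrapy.Field()
    moviename = scrapy.Field()
    type = scrapy.Field()
    year = scrapy.Field()
    grade = scrapy.Field()
    url = scrapy.Field()
    summary = scrapy.Field()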
pipelines.py:
import codecs
import json


class SecDoubanPipeline(object):
    def __init__(self):
        # Each scraped item is appended to movie.json as one JSON object per line.
        self.output = codecs.open('movie.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        jsontext = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.output.write(jsontext)
        return item

    def close_spider(self, spider):
        self.output.close()
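For the pipeline to receive items it has to be registered in settings.py, which the post does not show. A sketch, assuming the default layout created by scrapy startproject sec_douban:

settings.py (sketch):

ITEM_PIPELINES = {
    'sec_douban.pipelines.SecDoubanPipeline': 300,
}
# Douban tends to reject Scrapy's default user agent, so a browser-like UA is
# usually set as well; the exact string here is only an example.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'

The crawl is then started with scrapy crawl spidermovie.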
Finally, the scraped results are exported to movie.json.