Create the crawler project douban
scrapy startproject douban
Configure the items.py file to define the fields that will be stored.
# -*- coding: utf-8 -*-
import scrapy


class DoubanItem(scrapy.Item):
    title = scrapy.Field()        # title
    content = scrapy.Field()      # description
    rating_num = scrapy.Field()   # rating
    quote = scrapy.Field()        # short quote
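Since scrapy.Item subclasses behave like dictionaries, the fields declared above can be assigned and read with subscript syntax; this is also why the pipeline later converts each item with dict(item). A minimal sketch (run from the project root; the values are placeholders):

# Sanity check, not part of the project files: items work like dicts.
from douban.items import DoubanItem

item = DoubanItem()
item['title'] = ['some title']        # .extract() returns lists, hence the list values
item['rating_num'] = ['9.0']
print(dict(item))                     # {'title': ['some title'], 'rating_num': ['9.0']}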
Set up the spider file doubanmovies.py
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem


class DoubanmoviesSpider(scrapy.Spider):
    name = 'doubanmovies'
    allowed_domains = ['movie.douban.com']
    offset = 0
    url = ''
    start_urls = [url + str(offset)]

    def parse(self, response):
        # print('*' * 60)
        # print(response.url)
        # print('*' * 60)
        item = DoubanItem()
        info = response.xpath("//div[@class='info']")
        for each in info:
            item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract()
            item['content'] = each.xpath(".//div[@class='bd']/p[1]/text()").extract()
            item['rating_num'] = each.xpath(".//span[@class='rating_num']/text()").extract()
            item['quote'] = each.xpath(".//span[@class='inq']/text()").extract()
            yield item
            # print(item)

        self.offset += 25
        if self.offset <= 250:
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
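Note that .extract() returns a list of strings, so every stored field is a one-element list. If single strings are preferred, Scrapy's .extract_first() (or .get() in newer versions) can be used instead. A hedged drop-in variant of the extraction loop inside parse(), using the same XPath expressions:

# Variant of the loop above: extract_first() returns one string (or None)
# instead of a list, so the stored fields become plain strings.
for each in info:
    item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract_first()
    item['content'] = each.xpath(".//div[@class='bd']/p[1]/text()").extract_first()
    item['rating_num'] = each.xpath(".//span[@class='rating_num']/text()").extract_first()
    item['quote'] = each.xpath(".//span[@class='inq']/text()").extract_first()
    yield item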
Configure the pipeline file, using a MongoDB database to store the scraped data. This is the key part.
# -*- coding: utf-8 -*-
from scrapy.conf import settings
import pymongo


class DoubanPipeline(object):
    def __init__(self):
        self.host = settings['MONGODB_HOST']
        self.port = settings['MONGODB_PORT']

    def process_item(self, item, spider):
        # Create the MongoDB client connection. This example reads the host and
        # port from settings.py, but they could also be written here directly.
        self.client = pymongo.MongoClient(self.host, self.port)
        # Create (or reuse) the database douban
        self.mydb = self.client['douban']
        # Create (or reuse) the collection doubanmovies inside the douban database
        self.mysheetname = self.mydb['doubanmovies']
        # Convert the dict-like item into a plain Python dict
        content = dict(item)
        # Insert the record into the collection
        self.mysheetname.insert(content)
        return item
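The pipeline above creates a new MongoClient for every item that passes through process_item. A hedged alternative sketch (not from the original article) that connects once per crawl using Scrapy's open_spider/close_spider hooks, reads the same MONGODB_HOST/MONGODB_PORT settings via spider.settings, and keeps the doubanmovies collection name:

# -*- coding: utf-8 -*-
import pymongo


class DoubanPipeline(object):
    def open_spider(self, spider):
        # Connect once when the spider starts; settings come from settings.py
        self.client = pymongo.MongoClient(spider.settings.get('MONGODB_HOST'),
                                          spider.settings.get('MONGODB_PORT'))
        self.collection = self.client['douban']['doubanmovies']

    def close_spider(self, spider):
        # Release the connection when the spider finishes
        self.client.close()

    def process_item(self, item, spider):
        # insert_one is the current pymongo API for single-document inserts
        self.collection.insert_one(dict(item))
        return item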
Configure the settings.py file
# -*- coding: utf-8 -*-
BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

USER_AGENT = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;'

# Configure a delay for requests to the same website (default: 0)
# See also the AutoThrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Configure item pipelines
# Enable the pipeline defined above (300 is a typical priority value)
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}

# MongoDB connection settings
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
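As a quick check that the custom MongoDB variables are actually picked up by Scrapy, the project settings can also be loaded outside a crawl; a minimal sketch, assuming it is run from the project directory so Scrapy can find scrapy.cfg:

# Quick check that the custom settings are visible to Scrapy.
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get('MONGODB_HOST'))   # expected: 127.0.0.1
print(settings.get('MONGODB_PORT'))   # expected: 27017
print(settings.get('ITEM_PIPELINES'))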
Test from the terminal
scrapy crawl doubanmovies
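After the crawl finishes, the inserted documents can be checked directly with pymongo; a minimal sketch, assuming MongoDB is running on the host and port configured above:

# Verify that the crawl actually wrote documents into douban.doubanmovies.
import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['douban']['doubanmovies']
print(collection.count_documents({}))      # number of stored movies
for doc in collection.find().limit(3):     # peek at a few records
    print(doc)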