This post uses Scrapy to do a simple crawl of a paginated movie list, saving the data to CSV and JSON files and finally storing all of it in MongoDB. The knowledge points involved are pipelines, yield, middleware, XPath, and the use of Items.
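Since XPath comes up throughout the code below, one point worth settling up front is the difference between extract() and extract_first() on a Scrapy selector. A minimal sketch, assuming nothing beyond Scrapy itself (the HTML snippet is made up for illustration):

from scrapy import Selector

# a tiny stand-in for a real response body
sel = Selector(text='<ol class="grid_view"><li><em>1</em></li><li><em>2</em></li></ol>')

sel.xpath('//em/text()').extract()        # ['1', '2'] -- every match, as a list
sel.xpath('//em/text()').extract_first()  # '1'        -- first match, or None if nothing matched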
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem


class DoubanSpiderSpider(scrapy.Spider):
    # name of the spider, used with `scrapy crawl`
    name = 'douban_spider'
    # domains the spider may crawl
    allowed_domains = ['movie.douban.com']
    # entry URL (blank in the original post)
    start_urls = ['']

    def parse(self, response):
        movie_list = response.xpath('//div[@class="article"]//ol[@class="grid_view"]/li')
        for it in movie_list:
            douban_item = DoubanItem()
            douban_item['serial_number'] = it.xpath(".//div[@class='item']//em/text()").extract_first()
            douban_item['movie_name'] = it.xpath('.//div[@class="hd"]//a/span[1]/text()').extract_first()
            content = it.xpath('.//div[@class="bd"]//p[1]/text()').extract()
            # squeeze the whitespace out of each line of the description;
            # each assignment overwrites the previous one, so the item keeps
            # only the last line
            for c_introduce in content:
                douban_item['introduce'] = "".join(c_introduce.split())
            douban_item['star'] = it.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()').extract_first()
            douban_item['evaluate'] = it.xpath('.//div[@class="star"]/span[4]/text()').extract_first()
            douban_item['describe'] = it.xpath('.//p[@class="quote"]/span/text()').extract_first()
            print(douban_item)
            yield douban_item
        # follow the "next page" link, if there is one
        next_link = response.xpath('//span[@class="next"]/a/@href').extract()
        if next_link:
            next_link = next_link[0]
            # the URL prefix is blank in the original post
            yield scrapy.Request("" + next_link, callback=self.parse)
import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # ranking number on the list
    serial_number = scrapy.Field()
    # movie title
    movie_name = scrapy.Field()
    # short introduction of the movie
    introduce = scrapy.Field()
    # rating of the movie
    star = scrapy.Field()
    # number of ratings
    evaluate = scrapy.Field()
    # one-line description (quote)
    describe = scrapy.Field()
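An Item behaves like a dict restricted to its declared fields, which is what lets the MongoDB pipeline below call dict(item). A small sketch with made-up values:

from douban.items import DoubanItem

item = DoubanItem()
item['movie_name'] = 'The Shawshank Redemption'  # hypothetical value
item['star'] = '9.7'                             # hypothetical value
print(dict(item))  # {'movie_name': 'The Shawshank Redemption', 'star': '9.7'}

# assigning to a field that was never declared raises KeyError:
# item['director'] = 'Frank Darabont'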
In middlewares.py, a downloader middleware picks a random user agent for every outgoing request:

import random


class my_user_agent(object):
    def process_request(self, request, spider):
        # pool of user agent strings
        user_agent_list = [
            'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
            'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
            'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
            'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
            'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
            'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
            'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
            'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)',
        ]
        # pick a user agent at random for this request; the header key must
        # be 'User-Agent' for the server to see it
        user_agent = random.choice(user_agent_list)
        request.headers['User-Agent'] = user_agent
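A downloader middleware only runs once it is registered in settings.py. A minimal sketch, assuming the class above lives in douban/middlewares.py (the module path and the priority value 543 are the usual Scrapy conventions; neither appears in the original post):

DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.my_user_agent': 543,
}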
Finally, the pipeline writes each item into MongoDB:

import pymongo
from douban.settings import MONGO_HOST, MONGO_PORT, MONGO_DB_NAME, MONGO_DB_COLLECTTION


class DoubanPipeline(object):
    def __init__(self):
        host = MONGO_HOST
        port = MONGO_PORT
        dbname = MONGO_DB_NAME
        cname = MONGO_DB_COLLECTTION
        # connect to MongoDB and keep a handle to the target collection
        client = pymongo.MongoClient(host=host, port=port)
        mydb = client[dbname]
        self.post = mydb[cname]

    def process_item(self, item, spider):
        # convert the scrapy Item to a plain dict and write it to MongoDB;
        # insert_one() replaces the insert() method removed in PyMongo 4
        data = dict(item)
        self.post.insert_one(data)
        return item
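The four connection constants come from settings.py, which the original post does not show. A sketch with assumed values (host, port, database, and collection names are placeholders; the constant names, including the COLLECTTION spelling, match the import above), plus the registration that actually turns the pipeline on:

MONGO_HOST = '127.0.0.1'               # assumed: local MongoDB instance
MONGO_PORT = 27017                     # MongoDB's default port
MONGO_DB_NAME = 'douban'               # assumed database name
MONGO_DB_COLLECTTION = 'douban_movie'  # assumed collection name

ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}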