1. Configure the MySQL connection information in settings.py. Note that Scrapy only picks up uppercase variable names from the settings module, so the keys must be uppercase:

MYSQL_HOST = '101.201.70.139'
MYSQL_DBNAME = 'anttest'
MYSQL_PASSWORD = 'myjr678!@#'
MYSQL_USER = 'root'
2. Add an asynchronous MySQL connection class to pipelines.py:
from twisted.enterprise import adbapi
import MySQLdb
from MySQLdb import cursors

class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # connection parameters are read from settings.py (step 1)
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # adbapi wraps the blocking MySQLdb driver in a Twisted thread pool
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)
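Scrapy calls this classmethod itself when it builds the pipeline: it looks for from_crawler first and falls back to from_settings. If you prefer the from_crawler hook instead, an equivalent minimal sketch:

@classmethod
def from_crawler(cls, crawler):
    # crawler.settings is the same settings object that from_settings receives
    return cls.from_settings(crawler.settings)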
3. Override the process_item method:

def process_item(self, item, spider):
    # runInteraction runs do_insert in a pool thread and returns a
    # Deferred, so the database write does not block the crawl
    query = self.dbpool.runInteraction(self.do_insert, item)
    query.addErrback(self.handle_error)
    # return the item so any later pipelines still receive it
    return item
4. Define the error handler:

def handle_error(self, failure):
    print failure
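Twisted's addErrback also forwards any extra positional arguments to the errback after the Failure itself, so a handler can report which item and spider failed. A small variation on the code above (not part of the original tutorial):

def process_item(self, item, spider):
    query = self.dbpool.runInteraction(self.do_insert, item)
    # item and spider are passed through to handle_error after the failure
    query.addErrback(self.handle_error, item, spider)
    return item

def handle_error(self, failure, item, spider):
    print failure, item, spider.name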
5. Add the method that writes the item to the database. There is no need to call conn.commit() here: runInteraction runs do_insert inside a transaction and commits it automatically when the function returns without an error.

def do_insert(self, cursor, item):
    # build and execute the SQL insert statement here (see the full file below)
    pass
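As a safer alternative to interpolating the values into the SQL string by hand, MySQLdb can escape them itself when they are passed as a second argument to cursor.execute. A sketch using the same land_deals table and item fields as the full file below:

def do_insert(self, cursor, item):
    sql_in = (
        'insert into land_deals(title, publish_date, province_id, province_name, '
        'articles_link, content, create_time, transact_status) '
        'values (%s, %s, %s, %s, %s, %s, now(), %s)'
    )
    # the driver quotes and escapes each parameter, avoiding SQL injection
    cursor.execute(sql_in, (item['title'], item['publish_time'], 0, '',
                            item['url'], item['con'], 1))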
6. Register the pipeline's path under ITEM_PIPELINES in settings.py.
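The tutorial does not show the exact entry; a minimal sketch, assuming the Scrapy project package is named zhongguosoudi (inferred from the ZhongguosoudiPipeline class below):

ITEM_PIPELINES = {
    'zhongguosoudi.pipelines.MysqlTwistedPipline': 300,
}

The complete pipelines.py file: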
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.enterprise import adbapi
import MySQLdb
from MySQLdb import cursors

class ZhongguosoudiPipeline(object):
    def process_item(self, item, spider):
        return item

class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)
        return item

    def handle_error(self, failure):
        print failure

    def do_insert(self, cursor, item):
        # string interpolation kept from the original; the parameterised
        # variant sketched in step 5 would be safer
        sql_in = (
            'insert into land_deals(title, publish_date, province_id, province_name, '
            'articles_link, content, create_time, transact_status) '
            'values ("%s", "%s", "%d", "%s", "%s", "%s", now(), "%d")'
            % (item['title'], item['publish_time'], 0, '',
               item['url'], item['con'].replace('"', "'"), 1)
        )
        cursor.execute(sql_in)