# boss/items.py
import scrapy

class BossItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()      # corresponds to an attribute of the entity class
    salary = scrapy.Field()
# -*- coding: utf-8 -*-
# boss/spiders/zhipin.py (assumed filename)
import scrapy
from boss.items import BossItem

class ZhipinSpider(scrapy.Spider):
    name = 'zhipin'                    # spider name
    allowed_domains = ['lianjia.com']  # allowed crawl domain
    start_urls = ['']                  # start URL (fill in the listing page to crawl)

    def parse(self, response):
        items = []
        posts = response.xpath("//div[@class='content__list--item--main']")
        for each in posts:
            item = BossItem()
            item["name"] = each.xpath(".//p[@class='content__list--item--title twoline']/a/text()").extract()[0]
            address = each.xpath("p[@class='content__list--item--des']/a[position()<4]/text()").extract()
            item["salary"] = each.xpath("span[@class='content__list--item-price']/em/text()").extract()[0]
            #item["address"] = address[1]+address[2]+address[3]
            print(item)
            items.append(item)
            # yield item
        return items
        # Test whether the page can be fetched at all:
        # with open("lianjia.html", "w", encoding="utf-8") as file:
        #     file.write(response.text)
# boss/settings.py
ITEM_PIPELINES = {
    'boss.pipelines.BossPipeline': 300,   # priority value assumed; any integer works
}

# A browser User-Agent (USER_AGENT) can also be set here to disguise the crawler
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# boss/pipelines.py
import json

class BossPipeline(object):
    def __init__(self):
        self.file = open("lianjia.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.close()
# Run the spider from a script (saved e.g. as start.py in the project root):
from scrapy import cmdline
cmdline.execute("scrapy crawl zhipin".split())
After the run finishes, a lianjia.json file appears in the working directory.
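Each item is written as one JSON object per line, so a line of lianjia.json should look roughly like the following (the values are made up for illustration):

    {"name": "整租·某小區 2室1廳", "salary": "5200"}

Note that the MySQL import script below also reads an address field; for that part to work, an address = scrapy.Field() declaration would have to be added to BossItem and the commented-out item["address"] line in the spider enabled.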
Create the database table and import the JSON data into MySQL (Python):
import json
import pymysql
import traceback

class MySQLClient(object):
    # Renamed from "pymysql" so the class does not shadow the pymysql module
    create_table = 'create table lianjia(id int not null primary key auto_increment,name varchar(255) not null,salary int,address varchar(255))default charset=utf8'
    select_all = 'select * from lianjia'   # query used by select_data (the original referenced an undefined attribute; assumed to select everything)

    def __init__(self, host, user, pwd, db):
        self.conn = pymysql.connect(host=host, user=user, password=pwd, database=db)
        self.cursor = self.conn.cursor()

    def create_table_func(self):
        self.cursor.execute(MySQLClient.create_table)
        print('Table created')

    def insert_date(self, sql):
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except:
            print(traceback.format_exc())
            self.conn.rollback()

    def select_data(self):
        self.cursor.execute(MySQLClient.select_all)
        all_data = self.cursor.fetchall()
        for i in all_data:
            print('Query result: {}'.format(i))

if __name__ == '__main__':
    my = MySQLClient('localhost', 'root', '123456', 'pytest')
    # my.create_table_func()
    with open('../lianjia.json', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            print(line)
            temp = json.loads(line)
            name = temp['name'].strip()
            salary = temp['salary']
            address = temp['address']
            sql = 'insert into lianjia(name,salary,address) values("%s","%s","%s")' % (name, salary, address)
            my.insert_date(sql)
Running this script writes the data into the database.
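As a quick sanity check after the import, the select_data method defined above can be used to print every row that reached the lianjia table (a minimal sketch, reusing the connection parameters from the script):

    # Verify the import by printing all rows of the lianjia table
    my = MySQLClient('localhost', 'root', '123456', 'pytest')
    my.select_data()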