# boss/items.py
import scrapy

class BossItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()      # corresponds to an attribute of the entity class
    salary = scrapy.Field()
# -*- coding: utf-8 -*-
# boss/spiders/zhipin.py (assumed filename)
import scrapy
from boss.items import BossItem

class ZhipinSpider(scrapy.Spider):
    name = 'zhipin'                    # spider name
    allowed_domains = ['lianjia.com']  # allowed crawl domain
    start_urls = ['']                  # start URL (fill in the listing page to crawl)

    def parse(self, response):
        items = []
        posts = response.xpath("//div[@class='content__list--item--main']")
        for each in posts:
            item = BossItem()
            item["name"] = each.xpath(".//p[@class='content__list--item--title twoline']/a/text()").extract()[0]
            address = each.xpath("p[@class='content__list--item--des']/a[position()<4]/text()").extract()
            item["salary"] = each.xpath("span[@class='content__list--item-price']/em/text()").extract()[0]
            #item["address"] = address[1]+address[2]+address[3]
            print(item)
            items.append(item)
            # yield item
        return items
        # Test whether the page can be fetched at all:
        # with open("lianjia.html", "w", encoding="utf-8") as file:
        #     file.write(response.text)
# boss/settings.py
ITEM_PIPELINES = {
    'boss.pipelines.BossPipeline': 300,   # priority value assumed; any integer works
}

# A browser User-Agent (USER_AGENT) can also be set here to disguise the crawler
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# boss/pipelines.py
import json

class BossPipeline(object):
    def __init__(self):
        self.file = open("lianjia.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.close()
# Run the spider from a script (saved e.g. as start.py in the project root):
from scrapy import cmdline
cmdline.execute("scrapy crawl zhipin".split())
After the run finishes, a lianjia.json file appears in the working directory.
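Each item is written as one JSON object per line, so a line of lianjia.json should look roughly like the following (the values are made up for illustration):

    {"name": "整租·某小區 2室1廳", "salary": "5200"}

Note that the MySQL import script below also reads an address field; for that part to work, an address = scrapy.Field() declaration would have to be added to BossItem and the commented-out item["address"] line in the spider enabled.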
Create the database table and import the JSON data into MySQL (Python):
import json
import pymysql
import traceback

class MySQLClient(object):
    # Renamed from "pymysql" so the class does not shadow the pymysql module
    create_table = 'create table lianjia(id int not null primary key auto_increment,name varchar(255) not null,salary int,address varchar(255))default charset=utf8'
    select_all = 'select * from lianjia'   # query used by select_data (the original referenced an undefined attribute; assumed to select everything)

    def __init__(self, host, user, pwd, db):
        self.conn = pymysql.connect(host=host, user=user, password=pwd, database=db)
        self.cursor = self.conn.cursor()

    def create_table_func(self):
        self.cursor.execute(MySQLClient.create_table)
        print('Table created')

    def insert_date(self, sql):
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except:
            print(traceback.format_exc())
            self.conn.rollback()

    def select_data(self):
        self.cursor.execute(MySQLClient.select_all)
        all_data = self.cursor.fetchall()
        for i in all_data:
            print('Query result: {}'.format(i))

if __name__ == '__main__':
    my = MySQLClient('localhost', 'root', '123456', 'pytest')
    # my.create_table_func()
    with open('../lianjia.json', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            print(line)
            temp = json.loads(line)
            name = temp['name'].strip()
            salary = temp['salary']
            address = temp['address']
            sql = 'insert into lianjia(name,salary,address) values("%s","%s","%s")' % (name, salary, address)
            my.insert_date(sql)
Running this script writes the data into the database.
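As a quick sanity check after the import, the select_data method defined above can be used to print every row that reached the lianjia table (a minimal sketch, reusing the connection parameters from the script):

    # Verify the import by printing all rows of the lianjia table
    my = MySQLClient('localhost', 'root', '123456', 'pytest')
    my.select_data()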