scrapy框架 JDbook爬蟲

京東圖書爬蟲

# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
import json
class jjdspider(scrapy.Spider):
    """Spider for JD.com's book catalogue.

    Crawl flow:
      1. ``parse`` reads the category index and schedules one request per
         sub-category link.
      2. ``parse_book_list`` reads each book on a list page, schedules a
         price lookup per book, and follows pagination.
      3. ``parse_book_price`` attaches the price and yields the finished item.

    Items are plain dicts carried between callbacks through
    ``Request.meta["item"]``.
    """

    name = 'jjd'
    allowed_domains = ['jd.com', 'p.3.cn']
    # NOTE(review): the start URL was an empty string in the original, which
    # Scrapy rejects. Restored to JD's book category index, which matches the
    # XPath used in parse() -- confirm before running.
    start_urls = ['https://book.jd.com/booksort.html']

    # NOTE(review): the price URL template was an empty string in the
    # original. Restored to the p.3.cn price endpoint implied by
    # allowed_domains and by the JSON shape read in parse_book_price --
    # confirm the endpoint is still live.
    _price_url = 'https://p.3.cn/prices/mgets?skuIds=J_{}'

    def parse(self, response):
        """Extract every top-level category and schedule its sub-categories.

        Yields:
            scrapy.Request: one per sub-category link, with the partially
            filled item (``b_cate``, ``s_href``) in ``meta["item"]``.
        """
        # Each <dt> is one top-level category heading.
        for dt in response.xpath("//div[@class='mc']/dl/dt"):
            b_cate = dt.xpath("./a/text()").extract_first()
            # The element immediately following the <dt> groups the
            # sub-category <em> nodes of that category.
            for em in dt.xpath("./following-sibling::*[1]/em"):
                # NOTE(review): the original read item["s_href"] without ever
                # assigning it; the link is assumed to live at ./a/@href --
                # confirm against the page markup.
                href = em.xpath("./a/@href").extract_first()
                if not href:
                    continue
                # A fresh dict per sub-category, so concurrent requests never
                # share mutable state. urljoin also resolves protocol-relative
                # links ("//list.jd.com/...").
                item = {"b_cate": b_cate, "s_href": response.urljoin(href)}
                yield scrapy.Request(
                    item["s_href"],
                    callback=self.parse_book_list,
                    meta={"item": deepcopy(item)},
                )

    def parse_book_list(self, response):
        """Extract the books on one list page and follow pagination.

        Yields:
            scrapy.Request: a price lookup per book (item in ``meta``), plus
            a request for the next list page when one exists.
            dict: a book item without price when it has no SKU to look up.
        """
        category = response.meta["item"]
        for li in response.xpath("//div[@id='plist']/ul/li"):
            # Copy per book: the requests below are processed asynchronously,
            # so every book needs its own dict.
            book = deepcopy(category)
            # default="" keeps .strip() from failing when a node is missing.
            book["book_name"] = li.xpath(
                ".//div[@class='p-name']/a/em/text()"
            ).extract_first(default="").strip()
            book["book_author"] = li.xpath(
                ".//span[@class='p-bi-name']/span/a/text()"
            ).extract_first()
            book["book_press"] = li.xpath(
                ".//span[@class='p-bi-store']/a/text()"
            ).extract_first()
            book["book_pub_data"] = li.xpath(
                ".//span[@class='p-bi-date']/text()"
            ).extract_first(default="").strip()
            book["book_sku"] = li.xpath("./div/@data-sku").extract_first()

            if not book["book_sku"]:
                # Without a SKU there is no price URL to build; emit the
                # item as-is instead of requesting a bogus URL.
                yield book
                continue

            # The price is not taken from the list markup; it is fetched per
            # SKU from a separate endpoint.
            yield scrapy.Request(
                self._price_url.format(book["book_sku"]),
                callback=self.parse_book_price,
                meta={"item": book},
            )

        # Pagination: follow the "next page" link only when it is present
        # (the original condition was inverted).
        next_url = response.xpath(".//a[@class='pn-next']/@href").extract_first()
        if next_url:
            yield response.follow(
                next_url,
                callback=self.parse_book_list,
                meta={"item": category},
            )

    def parse_book_price(self, response):
        """Attach the price from the price response and yield the item.

        The response body is read as a JSON array whose first element has
        the price under the ``"op"`` key. If the body cannot be decoded or
        lacks that shape, ``book_price`` is set to ``None`` and a warning is
        logged, so the rest of the item is not lost.
        """
        item = response.meta["item"]
        try:
            item["book_price"] = json.loads(response.body.decode())[0]["op"]
        except (ValueError, LookupError, TypeError):
            # ValueError covers invalid JSON and undecodable bytes;
            # LookupError/TypeError cover an unexpected payload shape.
            self.logger.warning("Could not parse price from %s", response.url)
            item["book_price"] = None
        yield item

scrapy框架全站資料爬取

每個網站都有很多頁碼，將網站中某板塊下的全部頁碼對應的頁面資料進行爬取。實現方式有兩種：（1）將所有頁面的 url 新增到 start_urls 列表（不推薦）；（2）自行手動進行請求傳送（推薦）：yield scrapy.Request(url, callback)，其中 callback 專門用做於資料解析。下面我們介紹第二種...

使用scrapy框架爬boss直聘

boss直聘。建立 scrapy 專案：scrapy startproject scrapyproject；建立 spider 檔案：scrapy genspider s_boss zhipin.com。目錄：1. 找介面 url；2. s_boss.py；3. items.py；4. pipelines.py；pag...

scrapy框架爬取王者榮耀面板

建立專案命令：scrapy startproject wangzhephotomax；建立爬蟲：scrapy genspider wangzhecrawl；更改 settings.py 中的設定：USER_AGENT；Obey robots.txt rules：ROBOTSTXT_OBEY = False ...

scrapy框架 JDbook爬蟲

scrapy框架全站資料爬取

使用scrapy框架爬boss直聘

scrapy框架爬取王者榮耀面板

相關推薦