在 items 中建立爬取模型,如下:
# -*- coding: utf-8 -*-
# Scrapy settings for the jobbole project.
# For simplicity, this file contains only settings considered important or
# commonly used. More settings are described in the Scrapy documentation.
# NOTE(review): every setting name below is lowercase. Scrapy settings are
# conventionally upper-case (BOT_NAME, SPIDER_MODULES, ...); presumably the
# casing was lost when this listing was extracted -- confirm before use.
# Project/bot name.
bot_name = 'jobbole'
# Package(s) searched for spiders, and the package new spiders are created in.
spider_modules = ['jobbole.spiders']
newspider_module = 'jobbole.spiders'
# NOTE(review): the right-hand side is missing, so this statement is
# incomplete as written; the pipeline to enable must be filled in.
item_pipelines =
# Obey robots.txt rules.
# NOTE(review): `true` is not defined anywhere in this listing; presumably
# the Python literal `True` was intended -- confirm.
robotstxt_obey = true
# -*- coding: utf-8 -*-
import scrapy
from jobbole.items import jobboleitem
from bs4 import beautifulsoup
# Spider for the jobbole project: `parse` yields one item per response and
# then requests the "next page" link, with itself as the callback.
# NOTE(review): this listing has no indentation and its identifiers are all
# lowercase (`scrapy.spider`, `scrapy.request`, `jobboleitem`); presumably both
# were lost in extraction -- restore before running.
class jobbolespider(scrapy.spider):
# Spider name.
name = 'jobbole'
# NOTE(review): the value is missing, so this statement is incomplete.
allowed_domains =
# NOTE(review): the only start URL is an empty string; presumably the real
# URL was stripped from the listing.
start_urls = [""]
# Callback for each downloaded page: yields an item, then a follow-up request
# for the next listing page when a link to one is found.
def parse(self, response):
item = jobboleitem()
# NOTE(review): `page_urls` is read here but never assigned in this method;
# the line that fills it appears to be missing.
print 'image_urls', item['page_urls']
yield item
new_url = response.xpath('//*[@class="next page-numbers"]//@href').extract_first() # pagination: href of the "next page" link
print 'new_url', new_url
# Only follow the link when the XPath query matched something.
if new_url:
yield scrapy.request(new_url, callback=self.parse)
pipeline 定義如何使用爬取到的鏈結取得所需的爬取內容,如下:
import scrapy
class jobbleitem(scrapy.Item):
    """Item passed from the jobbole spider to the item pipeline.

    Fields:
        page_urls: the article page URLs. The spider reads
            ``item['page_urls']`` and the pipeline iterates over it.
    """

    # Declared explicitly: a scrapy.Item only accepts keys that were
    # declared as fields, so the field used by the spider and the pipeline
    # must exist here.
    page_urls = scrapy.Field()


# The spider imports this class as ``jobboleitem`` while it is defined as
# ``jobbleitem``; keep both spellings importable so neither reference breaks.
jobboleitem = jobbleitem
執行 scrapy crawl projectname,即可在指定的資料夾看到批量儲存好的文章,檔名就是文章標題。
# -*- coding: utf-8 -*-
# define your item pipelines here
## don't forget to add your pipeline to the item_pipelines setting
# see:
from jobbole import settings
import os
import urllib
from bs4 import beautifulsoup
# Item pipeline: for every URL in item['page_urls'] it parses a page with
# BeautifulSoup, derives a file name from a <div>'s text and opens a file of
# that name under a storage directory.
# NOTE(review): this listing has no indentation and several statements are
# truncated or missing (flagged below); it cannot run as written.
class jobbolepipeline(object):
# Called with each item produced by the spider; returns the item unchanged.
def process_item(self, item, spider):
# NOTE(review): two statements (`a=0` and the `dir_path` assignment) are fused
# on one line, and the tuple is never closed -- its second element is missing.
# `a` is not used anywhere else in this method.
a=0dir_path = '%s/%s' % (settings.pages_store, # storage path
print 'dir_path', dir_path
# NOTE(review): this `if` has no body; presumably the line that creates the
# directory was lost -- confirm.
if not os.path.exists(dir_path):
for page_url in item['page_urls']:
# NOTE(review): `html` is never assigned in this method; presumably the line
# that downloads `page_url` is missing -- confirm.
soup1 = beautifulsoup(html)
# NOTE(review): the `attrs=` value is missing, so the <div> being selected is
# unknown.
headitems = soup1.find("div", attrs=).gettext().encode("gb18030",'ignore')#encode the scraped text as gb18030, ignoring characters that cannot be encoded
print headitems
# `list_name` is only printed; nothing else in this method reads it.
list_name = page_url.split('/')
print 'listname',list_name
file_name = str(headitems).strip('\n')+'.txt' #strip newlines from the ends of the scraped text so it can be used as the file name
print 'filename', file_name
file_path = '%s/%s' % (dir_path, file_name)
print 'filepath', file_path
# NOTE(review): this tests the bare `file_name`, not `file_path`, and the `if`
# has no body of its own -- confirm the intended condition.
if os.path.exists(file_name):
with open(file_path, 'wb') as file_writer:
# NOTE(review): `content` is computed but no write to `file_writer` is
# visible; the `attrs=` value is missing here as well.
content = soup1.find("div", attrs=).gettext().encode("gb18030",'ignore')#important: encode the scraped text as gb18030, ignoring characters that cannot be encoded
return item
