Given a blog's root URL, the script crawls the ids of every article on that blog.
# -*- coding:utf8 -*-
import urllib2
import re
import random

# Pool of User-Agent strings; one is picked at random per request
# so the crawler does not always identify itself the same way.
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0',
]

class csdn_spider:
    def __init__(self, url):
        self.myurl = url
        self.datas = []
        print u"csdn spider started...."
    def csdn(self):
        # The list view shows articles as a flat, paged index.
        url = self.myurl + "?viewmode=list"
        agent = random.choice(user_agents)
        req = urllib2.Request(url)
        req.add_header('User-Agent', agent)
        req.add_header('Host', 'blog.csdn.net')
        req.add_header('Accept', '*/*')
        req.add_header('Referer', '')
        mypage = urllib2.urlopen(req).read().decode("utf8")
        # Find the total number of listing pages, then walk them all.
        pagenum = self.page_counter(mypage)
        self.find_data(self.myurl, pagenum)
    def page_counter(self, mypage):
        # The listing page links its last page as /article/list/<n> with
        # the anchor text 尾頁 ("last page"); that <n> is the page count.
        mymatch = re.search(u'/article/list/(\d+?)">尾頁', mypage, re.S)
        if mymatch:
            pagenum = int(mymatch.group(1))
            print u"spider report: the article list has %d pages" % pagenum
        else:
            pagenum = 0
            print u"spider report: could not determine the number of pages"
        return pagenum
    def find_data(self, myurl, pagenum):
        # The output file is named after the last path segment of the
        # blog URL (the author's CSDN user name).
        name = myurl.split("/")
        f = open(name[-1] + '.txt', 'w+')
        for i in range(1, pagenum + 1):
            print u"spider report: loading page %d......" % i
            url = myurl + "/article/list/" + str(i)
            agent = random.choice(user_agents)
            req = urllib2.Request(url)
            req.add_header('User-Agent', agent)
            req.add_header('Host', 'blog.csdn.net')
            req.add_header('Accept', '*/*')
            req.add_header('Referer', url)
            mypage = urllib2.urlopen(req).read().decode("utf8")
            # NOTE: this pattern is a reconstruction (the original listing
            # was truncated here); it assumes article links of the form
            # .../article/details/<id>" and captures the numeric id.
            myitems = re.findall(u'/article/details/(\d+?)">', mypage, re.S)
            for item in myitems:
                f.write(item + '\n')
                self.datas.append(item)
        f.close()
        print u"spider report: finished, %d article ids saved" % len(self.datas)
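
A minimal usage sketch. The blog address below is a hypothetical example; the script assumes the old blog.csdn.net layout with ?viewmode=list and /article/list/<n> paging, so it will not work against the current CSDN site without adapting the regexes:

    # hypothetical blog root; replace with a real CSDN blog URL
    spider = csdn_spider("http://blog.csdn.net/some_user")
    spider.csdn()
    # article ids are written to some_user.txt and kept in spider.datas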