python 多執行緒 gzip壓縮爬蟲

#coding=utf-8

# Python 3 module names. The listing's "htmlparser", "stringio" and "urllib2"
# are the Python 2 modules HTMLParser, StringIO and urllib2, which now live in
# html.parser, io and urllib.request respectively.
import gzip
import html.parser
import io
import os
import queue
import re
import threading
import time
import urllib
import urllib.request
import zlib

class geturllist(html.parser.HTMLParser):
    """HTML parser that collects the ``href`` of every ``<a>`` tag it is fed.

    Each link is printed and pushed onto ``self.urlqueue`` in document order,
    exactly as written in the page (relative links are not resolved).
    """

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        # A maxsize <= 0 makes the queue unbounded.
        self.urlqueue = queue.Queue(-1)

    def handle_starttag(self, tag, attrs):
        """Queue the ``href`` value of an ``<a>`` start tag.

        Args:
            tag: Tag name, already lower-cased by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs; ``value`` is ``None`` for
                an attribute written without a value (``<a href>``).
        """
        if tag != 'a':
            return
        for key, value in attrs:
            # Skip value-less or empty hrefs: there is nothing to fetch and a
            # None entry would only fail later in the downloader.
            if key == 'href' and value:
                print(value)
                self.urlqueue.put(value)

class spider(threading.Thread):
    """Worker thread that downloads queued URLs, harvests links and saves pages.

    URLs are taken from ``parser.urlqueue``. Every page answered with status
    200 is gunzipped when necessary, fed back into ``parser`` (which queues the
    links it finds) and written to ``<path>/<n>.html``.

    Args:
        myname: Thread name.
        parser: Object exposing ``urlqueue`` (a ``queue.Queue`` of URL strings)
            and ``feed(text)``; it may be shared by several spiders.
        path: Directory the downloaded pages are written to.

    Attributes:
        count: Number of pages this spider has tried to save.
        timeout: Seconds after start during which the thread keeps waiting for
            new URLs; once it has passed the thread exits as soon as the queue
            is empty.
    """

    # HTMLParser.feed() mutates internal buffer state, and several spiders are
    # handed the same parser, so feeding is serialised.
    _feed_lock = threading.Lock()
    # File numbers are shared by all spiders; a per-thread counter would make
    # workers writing to the same directory overwrite each other's pages.
    _index_lock = threading.Lock()
    _next_index = 0

    def __init__(self, myname, parser, path='f:\\uuuuu'):
        threading.Thread.__init__(self)
        self.count = 0
        self.name = myname
        self.parser = parser
        self.timeout = 5
        self.dicpath = path

    def run(self):
        """Process queued URLs until the queue is empty and the timeout passed."""
        deadline = time.monotonic() + self.timeout
        urls = self.parser.urlqueue
        while True:
            # A single blocking get() replaces the empty()/get() pair, which
            # could hang forever when another worker took the last URL in
            # between, and replaces the busy-wait on an empty queue.
            # With a timeout of 0 an available item is still returned.
            remaining = max(deadline - time.monotonic(), 0.0)
            try:
                url = urls.get(timeout=remaining)
            except queue.Empty:
                break
            self._crawl(url)

    def _crawl(self, url):
        """Download ``url`` and, on HTTP 200, parse and save its body."""
        # Plain GET that advertises gzip support; passing an (empty) form body
        # would have turned every fetch into a POST.
        request = urllib.request.Request(
            url, headers={"Accept-Encoding": "gzip"})
        opener = urllib.request.build_opener()
        try:
            with opener.open(request) as response:
                if response.getcode() != 200:
                    return
                raw = response.read()
                charset = response.headers.get_content_charset() or "utf-8"
        except Exception as e:
            # Deliberately broad: one bad URL (malformed, unreachable, HTTP
            # error, ...) must not kill the worker thread.
            print("request error" + str(e))
            return
        body = self._gunzip_if_needed(raw)
        self._parse(body, charset)
        self._save(body)

    @staticmethod
    def _gunzip_if_needed(raw):
        """Return ``raw`` gunzipped, or unchanged if it is not valid gzip data."""
        if not raw.startswith(b"\x1f\x8b"):  # gzip magic number
            return raw
        try:
            return gzip.decompress(raw)
        except (OSError, EOFError, zlib.error):
            return raw

    def _parse(self, body, charset):
        """Decode ``body`` and feed it to the shared parser to queue its links."""
        try:
            text = body.decode(charset, errors="replace")
        except LookupError:
            # The server announced a charset Python does not know.
            text = body.decode("utf-8", errors="replace")
        try:
            with self._feed_lock:
                self.parser.feed(text)
        except Exception:
            # Best effort: a page that cannot be parsed is still saved.
            print("open error")

    def _save(self, body):
        """Write ``body`` (bytes) to the next numbered ``.html`` file."""
        try:
            with spider._index_lock:
                index = spider._next_index
                spider._next_index += 1
            filepath = os.path.join(self.dicpath, str(index) + ".html")
            print(filepath)
            self.count += 1
            os.makedirs(self.dicpath, exist_ok=True)
            # Binary mode keeps the page in its original encoding.
            with open(filepath, 'wb') as out:
                out.write(body)
        except OSError:
            print(" filewriting error")

# Page the crawl starts from; must be set before running the script.
starturl = ""


def main(url, workers=3):
    """Seed the link queue from ``url`` and crawl it with ``workers`` threads.

    Args:
        url: Absolute URL of the start page.
        workers: Number of spider threads to run.

    Raises:
        ValueError: If ``url`` is empty.
    """
    if not url:
        raise ValueError(
            "starturl is empty; set it to the page the crawl should start from")

    parser = geturllist()
    # The with-block closes the connection even if reading or parsing fails.
    with urllib.request.urlopen(url) as urldata:
        charset = urldata.headers.get_content_charset() or "utf-8"
        page = urldata.read()
    try:
        text = page.decode(charset, errors="replace")
    except LookupError:
        # The server announced a charset Python does not know.
        text = page.decode("utf-8", errors="replace")
    parser.feed(text)

    # Each spider must be kept in the list, otherwise none is started or joined.
    spiderlist = [spider("the" + str(i) + "spider", parser)
                  for i in range(workers)]
    for t in spiderlist:
        print(t)
        t.start()
    for t in spiderlist:
        t.join()


if __name__ == "__main__":
    main(starturl)

這個爬蟲只是使用了 HTMLParser 進行文字的解析，然後開闢了 3 個執行緒進行操作。這個爬蟲應用起來還有許多的弊端，如動態網頁處理、網頁編碼的處理、網頁加密等，等以後學了爬蟲框架和 BeautifulSoup 之後會有新的更新！加油！！

PS：這個爬蟲我沒有控制 BFS 的深度，所以你懂的，自己加個變數控制一下吧。

開啟gzip壓縮

前端 gzip 壓縮一直都是必備的，簡單又能壓縮不少的檔案體積，用了好久了，今天記錄一下。我們伺服器用的 nginx，進入伺服器下 nginx.conf 檔案：`gzip on;`、`gzip_min_length 1k;`、`gzip_buffers 4 16k;`、`gzip_comp_level 4;`（壓縮程度，1–9），建議...

檔案壓縮（Gzip）

今天頭鐵用 System.IO.Compression 類來寫一下檔案的 gzip 壓縮，結果給自己整暈了。主要是壓縮之後我發現是有內容的，又想著寫一下解壓部分，結果要麼溢位，要麼解壓成功後得到一個啥也沒有的空殼。下面我給大家分享一下壓縮部分吧，我覺得應該也是有問題的，因為他有內容但是明顯不夠，純屬個人看...

python多執行緒 python多執行緒

通常來說，多程序適用於計算密集型任務，多執行緒適用於 IO 密集型任務，如網路爬蟲。關於多執行緒和多程序的區別，請參考這個。下面將使用 python 標準庫的 multiprocessing 包來嘗試多執行緒的操作，在 python 中呼叫多執行緒要使用 multiprocessing.dummy，如果是多程序則去掉...

python 多執行緒 gzip壓縮 爬蟲

開啟gzip壓縮

檔案壓縮（Gzip）

python多執行緒 python多執行緒

相關推薦

python 多執行緒 gzip壓縮爬蟲