import urllib.request
import urllib.parse
import urllib.error
import re
import time
import sys
import queue
import threading

# Shared User-Agent header; the exact UA string in the original listing was
# lost, so a generic desktop Chrome UA is assumed here.
headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
           "(KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

urlque = queue.Queue()
list_url = []
### Fetch the page content of a URL (proxy support is left commented out)
def use_proxy(url):
    try:
        # proxy = urllib.request.ProxyHandler({"http": proxy_addr})
        # opener = urllib.request.build_opener(proxy)
        # urllib.request.install_opener(opener)
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        # print(data)
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception: " + str(e))
        time.sleep(1)
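# The function is named use_proxy, but the ProxyHandler lines are commented
# out, so requests actually go out directly. A minimal sketch of what the
# proxied variant would look like (use_proxy_via and proxy_addr are
# placeholders, not part of the original code):
#
# def use_proxy_via(proxy_addr, url):
#     proxy = urllib.request.ProxyHandler({"http": proxy_addr})
#     opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
#     opener.addheaders = [headers]
#     urllib.request.install_opener(opener)
#     return urllib.request.urlopen(url).read().decode("utf-8")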
### Collect article URLs from the search result pages and push them onto the queue
class get_url(threading.Thread):
    def __init__(self, key, pagestart, pageend, urlque):
        threading.Thread.__init__(self)
        self.pagestart = pagestart
        self.pageend = pageend
        self.key = key
        self.urlque = urlque

    def run(self):
        try:
            keycode = urllib.parse.quote(self.key)
            for page in range(self.pagestart, self.pageend + 1):
                # Sogou WeChat search URL; the literal string was lost in the
                # original listing, so this endpoint is assumed.
                url = "http://weixin.sogou.com/weixin?type=2&query=%s&page=%d" % (keycode, page)
                data = use_proxy(url)
                print("the content of data1 is", data)
                # The pattern string was stripped by the blog engine; result
                # links sit inside <h3> tags, so this reconstruction is assumed.
                listurl_pattern = '<h3>.*?("http://.*?)</h3>'
                result = re.compile(listurl_pattern, re.S).findall(data)
                print(result)
                if len(result) == 0:
                    print("no usable URLs")
                    sys.exit()
                for i in range(len(result)):
                    # Strip 'amp;', cut at the first attribute, drop the quotes
                    res = result[i].replace("amp;", "").split(" ")[0].replace("\"", "")
                    self.urlque.put(res)  # enqueue the article URL
                    # task_done() belongs to the consumer, not the producer
            # return list_url
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            elif hasattr(e, "reason"):
                print(e.reason)
        except Exception as e:
            print("exception:", e)
## Fetch and save the content of each queued article URL
class get_url_content(threading.Thread):
    def __init__(self, urlque):
        threading.Thread.__init__(self)
        self.urlque = urlque

    def run(self):
        # HTML preamble for the output file; the original string literal was
        # lost, so a minimal skeleton is assumed here.
        html1 = '''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
</head>
<body>'''
        fh1 = open("d:\\python-script\\1.html", 'wb')
        fh1.write(html1.encode("utf-8"))
        fh1.close()
        fh = open("d:\\python-script\\1.html", 'ab')
        while True:
            try:
                url = self.urlque.get()
                data_content = use_proxy(url)
                title_pattern = '<title>(.*?)</title>'
                result_title = re.compile(title_pattern, re.S).findall(data_content)
                ## title
                res_title = result_title[0].strip()
                # The closing anchor of this pattern was stripped by the blog
                # engine; '</div>' is assumed here.
                content_pattern = 'id="js_content">(.*?)</div>'
                content = re.compile(content_pattern, re.S).findall(data_content)
                # c = '<br>'
                # for i in content:
                #     ## content: strip <br> tags
                #     c_content = i.replace(c, "").replace("<br/>", "").replace("</br>", "")
                fh.write(res_title.encode("utf-8"))
                for i in content:
                    fh.write(i.strip().encode("utf-8"))
                self.urlque.task_done()  # consumer side of the queue handshake
            except UnicodeEncodeError as e:
                continue
        fh.close()
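# Design note: get() without a timeout blocks forever once the queue drains,
# which is why the fh.close() above is never reached; the contrl thread below
# polls for an empty queue instead. A common alternative (a sketch, not part
# of the original design) is a sentinel pushed by the producer:
#
# self.urlque.put(None)        # producer: signal "no more work"
# ...
# url = self.urlque.get()      # consumer:
# if url is None:
#     break                    # leave the loop, so fh.close() actually runs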
class contrl(threading.Thread):
    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("program is running")
            if self.urlqueue.empty():
                time.sleep(3)
                print("program finished")
                exit()
            time.sleep(3)  # avoid busy-looping while the workers run
if __name__ == '__main__':
    pagestart = 1
    pageend = 2
    key = "人工智慧"  # search keyword: "artificial intelligence"
    url_thread = get_url(key, pagestart, pageend, urlque)
    url_thread.start()
    get_content = get_url_content(urlque)
    get_content.start()
    control = contrl(urlque)
    control.start()