#-*-coding:utf-8-*-
import sys
from htmlparser import htmlparser
# Python 2 only: force the interpreter-wide default string encoding to UTF-8.
# reload(sys) has to run first so that sys.setdefaultencoding is reachable.
reload(sys)
encoding = sys.getdefaultencoding()
if encoding != 'utf-8':
    sys.setdefaultencoding('utf-8')
class contentparser(htmlparser):
    """Collect the paragraph text of a chapter page.

    Text is gathered from ``<p>`` elements that have no ``align`` attribute
    and are not inside an ``<li>`` carrying an ``id`` attribute (flagged via
    ``is_comment``).  An ``<a>`` tag switches collection off until the next
    qualifying ``<p>`` opens.
    """

    def __init__(self):
        self.text = ''        # collected text; every chunk is prefixed with '\n'
        self.is_comment = 0   # 1 while inside an <li id=...> element
        self.is_content = 0   # 1 while data should be collected
        htmlparser.__init__(self)

    def handle_starttag(self, tag, attr):
        """Update the comment/content flags for an opening tag.

        ``attr`` is the list of ``(name, value)`` pairs supplied by the parser.
        """
        if tag == 'li':
            if any(name == 'id' for name, _ in attr):
                self.is_comment = 1
        elif tag == 'a':
            self.is_content = 0
        elif tag == 'p':
            has_align = any(name == 'align' for name, _ in attr)
            if not has_align and not self.is_comment:
                self.is_content = 1

    def handle_endtag(self, tag):
        """Clear the flag that belongs to the closing tag."""
        if tag == 'li':
            self.is_comment = 0
        elif tag == 'p':
            self.is_content = 0

    def handle_data(self, text):
        """Append ``text`` to the result while inside a content paragraph.

        Chunks containing either marker string below are skipped.
        """
        if not self.is_content:
            return
        # NOTE(review): confirm these markers use the same script
        # (traditional vs. simplified) as the pages being parsed.
        if '稱呼' in text or '內容' in text:
            return
        self.text += '\n' + text

    def get_text(self):
        """Return everything collected so far."""
        return self.text
if __name__ == '__main__':
    # Manual smoke test: parse the HTML file named on the command line.
    # Nothing is printed; the extracted text stays available via cp.get_text().
    if len(sys.argv) < 2:
        sys.exit('usage: progress.py <html-file>')
    # "with" guarantees the file is closed even if reading fails.
    with open(sys.argv[1]) as fd:
        html = fd.read()
    cp = contentparser()
    cp.feed(html)
將上面的程式碼儲存成 progress.py(下面的主程式會從中匯入 contentparser)。
import sys,urllib2,time
from progress import contentparser
from htmlparser import htmlparser
# Python 2 only: force the interpreter-wide default string encoding to UTF-8.
# reload(sys) has to run first so that sys.setdefaultencoding is reachable.
reload(sys)
sys.setdefaultencoding('utf-8')
class linkparser(htmlparser):
    """Walk a catalogue page and fetch every chapter it links to.

    Inside ``<div class="mulu">`` the text of a ``<td colspan=...>`` cell
    becomes the current catalogue title (``self.mulu``), which is also the
    file name handed to ``progressing``.  The text of each ``<a>`` inside
    that div is passed, together with its ``href``, to ``progressing``.
    """

    def __init__(self):
        self.link = ''        # href of the most recent <a> inside the catalogue
        self.content = ''     # text of the most recent chapter link
        self.mulu = ''        # current catalogue title / output file name
        self.has_mulu = 0     # 1 while inside a <td colspan=...> title cell
        self.is_mulu = 0      # 1 while inside <div class="mulu">
        self.is_href = 0      # 1 while inside an <a> of the catalogue
        self.start_time = 0   # time.time() when the current title was seen
        self.end_time = 0     # time.time() when the catalogue div closed
        htmlparser.__init__(self)

    def handle_starttag(self, tag, attr):
        """Track entry into the catalogue div, its links and its title cell."""
        if tag == 'div':
            if any(k == 'class' and v == 'mulu' for k, v in attr):
                self.is_mulu = 1
        elif tag == 'a' and self.is_mulu:
            self.is_href = 1
            for k, v in attr:
                if k == 'href':
                    self.link = v
        elif tag == 'td' and self.is_mulu:
            if any(k == 'colspan' for k, _ in attr):
                self.has_mulu = 1

    def handle_endtag(self, tag):
        """Leave the current link/cell, or finish the catalogue and report timing."""
        if tag == 'div' and self.is_mulu and self.mulu:
            # The div only counts as finished once a title has been seen.
            self.is_mulu = 0
            print('end %s' % self.mulu)
            self.mulu = ''
            self.end_time = time.time()
            print('time :  %s' % (self.end_time - self.start_time))
        elif tag == 'a':
            self.is_href = 0
        elif tag == 'td' and self.is_mulu and self.has_mulu:
            self.has_mulu = 0

    def handle_data(self, text):
        """Download a chapter for link text, or record a new catalogue title."""
        if self.is_mulu and self.is_href:
            self.content = text
            progressing(self.link, self.mulu, self.content)
            return
        if self.has_mulu:
            self.mulu = text
            print('begin %s' % self.mulu)
            self.start_time = time.time()
def progressing(url, filename, chaptername):
    """Download one chapter and append it to ``filename``.

    Appends ``chaptername``, then the text returned by
    ``get_chapter_text(url)``, then a newline.
    """
    chapter_text = get_chapter_text(url)
    # The previous '\n'.format(chapter_text) had no replacement field, so the
    # downloaded text was dropped and only a newline reached the file.
    with open(filename, 'a') as out:
        out.write(chaptername)
        out.write(chapter_text)
        out.write('\n')
def get_chapter_text(url):
    """Fetch ``url`` and return the text that ``contentparser`` extracts.

    A parse error is printed rather than raised; whatever text was collected
    before the error is still returned.
    """
    # Imported here because the file-level imports do not provide this name.
    from HTMLParser import HTMLParseError

    response = urllib2.urlopen(urllib2.Request(url))
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    cp = contentparser()
    try:
        cp.feed(html)
    except HTMLParseError as msg:
        print(msg)
    return cp.get_text()
if __name__ == '__main__':
    # Imported here because the file-level imports do not provide this name.
    from HTMLParser import HTMLParseError

    # The catalogue URL literal is empty in this file, so the URL may be
    # supplied as the first command-line argument instead.
    index_url = sys.argv[1] if len(sys.argv) > 1 else ''
    response = urllib2.urlopen(urllib2.Request(index_url))
    try:
        html = response.read()
    finally:
        response.close()

    lp = linkparser()
    try:
        lp.feed(html)
    except HTMLParseError as msg:
        print(msg)
第一個小程式
2.建立pages目錄檔案 作用是放各個頁面的 3.建立頁面 給頁面起名字,並建立四個檔案 1 js 邏輯的實現 2 json 負責標題欄和一些狀態列 3 wxml 頁面文字 4 wxss 頁面樣式 4.把內容元素封裝在view內部,寫法 內容 5.這節課需要用到三個元件 文字 按鈕 1 2 文字 ...
Python 的第一個小程式:Hello World
程式的編寫方式有兩種:編譯式和互動式。黑視窗的方式是互動式,互動式執行程式的方法如下:進入環境——在終端輸入 python 就進入了 Python 直譯器;輸入 exit() 退出 Python 環境。互動式的缺點是寫一行執行一行,無法儲存;編譯式可以統一程式設計,可以儲存和維護程式,所以採用編譯式進行程式編寫。編譯式...
第一個 Python 程式
有人在論壇 上問 將日誌格式化的方法,剛好學python,就拿這個練手了 09 55 54 error1 tmp error log.3 50 times mon jun 28 00 00 53 2009 09 55 54 error1 tmp error log.3 50 times 09 56 ...