#corpor file reader
#author: percylee
#time: 2006/08
class corporfilereader:
    """reader for corpor file, which is labeled just like pku-corpor of
    renmin ribao. e.g. 在/p 1998年/t 來臨/v 之際/f ,/w ..."""
    # NOTE: the class docstring above is printed through ``__doc__`` by the
    # test script at the bottom of this file, so its wording is left as is.
    #
    # Usage: construct with a file path, call read(), then doctitle().
    # Attributes:
    #   fpath       -- path of the corpus file given to the constructor
    #   document    -- full text of the file ('' until read() is called)
    #   docwordlist -- document split on splitstr ([] until read() is called)
    #   title       -- title built by read() ('' until read() is called)
    #   splitstr    -- separator used to split the document into tokens

    def __init__(self, fpath, splitstr=' '):
        """need file path to init corporfilereader

        fpath is the path of the corpus file; splitstr is the separator
        used to split the document into ``word/tag`` tokens (one space by
        default).  Nothing is read from disk until read() is called.
        """
        self.fpath = fpath
        self.title = ''
        self.document = ''
        self.docwordlist = []
        self.splitstr = splitstr

    def __doctitle(self):
        """build the title from the tokens in front of the first '/w' token

        Token [0] is skipped (it is the document id, e.g. '199801-.../m').
        Token [1] always starts the title; the following tokens are added
        until one containing '/w' is met.  The collected tokens are joined
        with a single space.  Returns None when the document has two or
        fewer tokens.
        """
        words = self.docwordlist
        if len(words) <= 2:
            return None
        titlewords = [words[1]]
        for word in words[2:]:
            # a '/w' token ends the title
            if '/w' in word:
                break
            titlewords.append(word)
        return ' '.join(titlewords)

    def read(self):
        """read title and document from corpor file

        Fills document, docwordlist and title.  Errors raised by open()
        or read() (e.g. a missing file) are not caught here.
        """
        # 'with' closes the file even when reading fails
        with open(self.fpath) as corpusfile:
            self.document = corpusfile.read()
        self.docwordlist = self.document.split(self.splitstr)
        self.title = self.__doctitle()

    def doctitle(self):
        """get document title

        Returns '' before read() has been called, and None when the
        document read had two or fewer tokens.
        """
        return self.title
#test class
# Runs only when the file is executed as a script, so importing the module
# does not try to open the hard-coded sample path below.
if __name__ == '__main__':
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print('...in test...')
    corporader = corporfilereader('g://pycode//pkucorpora1.txt')
    # the title is only available after the file has been read
    corporader.read()
    print('create one object of ' + corporader.__doc__)
    # doctitle() returns None for a document with two or fewer tokens;
    # fall back to '' so the concatenation cannot raise TypeError
    print("and read one document which's title is " + (corporader.doctitle() or ''))
# Sample output of the test script above:
# ...in test...
# create one object of reader for corpor file, which is labeled just like pku-corpor of
# renmin ribao. e.g. 在/p 1998年/t 來臨/v 之際/f ,/w ...
# and read one document which's title is 邁向/v 充滿/v 希望/n 的/u 新/a 世紀/n
