# -*- coding: utf-8 -*-
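"""Created on Thu Apr 16 23:18:27 2015
@author: shifeng
"""
'''
Purpose: parse cdr_sample.xml, output the text in the format that DNorm accepts, and write the training-set "label" values to a file.
XML file: see the shared resources on CSDN.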
'''
import codecs
import StringIO
import xml
import xml.dom.minidom
from xml.etree import ElementTree as et
from xml.sax import *
from xml.sax.handler import *

from lxml import etree
# Parse the sample corpus with minidom and keep a handle on its root element.
# In the visible part of this file `root` is only referenced by commented-out
# experiments.
dom = xml.dom.minidom.parse("cdr_sample.xml")
# minidom attribute names are case-sensitive: the root is `documentElement`.
root = dom.documentElement
#print root.nodeName
#print root.nodeValue
#print root.nodeType
#print root.ELEMENT_NODE
#-----------
'''Method 1 (not adopted):
# When the names of the child elements are known, fetch them with getElementsByTagName.
# "collection" is the root node and has four kinds of child elements; since their
# names are known, root.getElementsByTagName(i) retrieves each of them.
colloction_ele = ["source", "date", "key", "document"]
for i in colloction_ele:
    print root.getElementsByTagName(i)[0].nodeName  # get the tag name
#   print root.getElementsByTagName(i)[0].getAttribute
# Each document has three tags.
document_ele = ["id", "passage", "annotation"]
documents = root.getElementsByTagName("document")
#print len(documents)
for i in documents:          # for every document,
    for j in document_ele:   # take out each tag
        print i.getElementsByTagName(j)[0].nodeName         # get the tag name
        print i.getElementsByTagName(j)[0].firstChild.data  # get the data between the tags
        if j == "annotation":
            print i.getElementsByTagName(j)[0].getAttribute("id")  # get a tag attribute
'''#-----------
# -----------
# Convert cdr_sample.xml into train_text.txt: one line per <document>, made of
# the document id, a tab character, and every passage text written back to
# back. Everything that is parsed is also echoed to stdout for inspection.
#
# Each print call below takes a single pre-formatted argument so that the
# output is the same whether the interpreter treats print as a statement
# (Python 2) or as a function (Python 3).


def _show_annotation(annotation):
    """Print the location, mention text and non-"type" infon of one annotation."""
    for field in annotation:
        tag = field.tag
        if tag == "location":
            print("%s %s" % (field.attrib["offset"], field.attrib["length"]))
        elif tag == "text":
            # The annotated mention (called the disease name in the original).
            print(field.text)
        elif tag == "infon" and field.attrib.get("key") != "type":
            # Per the original note, each annotation carries two infon
            # elements; the one whose key is "type" is skipped.
            print("%s %s" % (field.attrib.get("key"), field.text))


def _process_passage(passage, out):
    """Echo the children of one <passage> and append its text to *out*."""
    for node in passage:
        tag = node.tag
        if tag == "offset":
            print("offset: %s" % int(node.text))
        elif tag == "text":
            # An empty element has text None; write it as an empty string
            # instead of failing in out.write().
            text = node.text or ""
            print("%s :: %s" % (tag, text))
            # Title and abstract texts are written consecutively, unseparated.
            out.write(text)
        elif tag == "annotation":
            print(10 * "*")
            _show_annotation(node)


def _process_document(document, out):
    """Write the id, a tab and all passage texts of one <document> to *out*."""
    for child in document:
        tag = child.tag
        if tag == "id":
            text_id = child.text or ""
            print("%s : %s" % (tag, text_id))
            out.write(text_id + "\t")
        elif tag == "passage":
            _process_passage(child, out)
        # Document-level <annotation> children were only bound to an unused
        # name by the original code, so they are ignored here.


# Parse before opening the output so that a missing or malformed XML file does
# not leave an empty train_text.txt behind.
root_2 = et.parse("cdr_sample.xml")
documents = root_2.findall("./document")
# codecs.open writes non-ASCII passage text as UTF-8 instead of relying on the
# interpreter's default encoding; the with-block closes the file even when a
# document fails to process.
with codecs.open("train_text.txt", "w", encoding="utf-8") as write_text:
    for per in documents:
        _process_document(per, write_text)
        write_text.write("\n")  # one output line per document
        print(30 * "*")
# TODO: the "label" mapping is still to be done.
'''doc = etree.parse("cdr_sample.xml")
xml_string = etree.tostring(doc)
root = etree.fromstring(xml_string)
parser = make_parser()
# markdecodehandler
# markdecodehandler
handler = userdecodehandler()
parser.setcontenthandle(handler)
parser.parse(root)
for item in handler.marks:
for j in item.items():
print i,j
print type(doc)
print type(root)
# print doc.tag
print root.tag
# with codecs.open("cdr_sample.xml") as xml:
# text = xml.readlines()
# s_xml = ""
# for i in text:
# i=i.strip("\n")
# s_xml+=i
# print s_xml
# soup = beautifulsoup(s_xml)
# print soup.title
# for i in text:
# print i
'''
PythonXML檔案解析
SAX 是一種基於事件驅動的 API。利用 SAX 解析 XML 文件牽涉到兩個部分:解析器和事件處理器。解析器負責讀取 XML 文件,並向事件處理器傳送事件,如元素開始跟元素結束事件;而事件處理器則負責對事件作出響應,對傳遞的 XML 資料進行處理。適於處理下面的問題:…… 在 Python 中使用 SAX 方式處理 XML 要先引入...
學習筆記 Python XML解析
xml.sax:菜鳥教程、官網文件。Python 由 xml 包(Lib/xml)提供對 XML 的支援。Python 處理 XML 主要有兩種模型,xml.dom 和 xml.sax 分別定義了兩種處理模型的介面。The XML handling submodules are: 手冊 inte ce section pu...
python xml解析和生成
解析使用 xml.etree.ElementTree 模組,生成使用 xml.dom.minidom 模組;ElementTree 比 DOM 快,DOM 生成簡單且會自動格式化。xml version 1.0 encoding utf 8 baspools bas basprovider 0 basprovid...