首先介紹crf++的具體安裝和使用
下面我講一下我的思路:
由於task2根目錄下的_crfpp.so沒有引入
所以正常能執行的是資料夾task2_b-i下的內容,此資料夾下對詞語分類只有b i兩種
首先我用 msr_training.utf8,通過 python 程式 make_crf_train_data.py 轉化成訓練語料需要的格式,即 tag_train_data.utf8。
然後我開始訓練模型,得到 model;再利用 crf 自帶的 python 工具包,對輸入文字分詞,具體實現是通過 python 程式 crf_segment.py。
最後將 msr_test.utf8 分詞,得到 crf_tag_result.utf8。
crf_segment.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# crf_segmenter.py
# usage:python crf_segmenter.py crf_model test_file result_file
# 利用crf自帶的python工具包,對輸入文字進行分詞
import codecs
import sys
import crfpp
def crf_segmenter(input_file, output_file, tagger):
    """Segment Chinese text with a trained CRF++ model.

    Reads input_file (UTF-8) line by line, feeds each character to the
    CRF++ tagger, then reassembles words from the predicted tags and
    writes the space-separated result to output_file (UTF-8).

    input_file  -- path of the raw text to segment
    output_file -- path of the segmented output file to create
    tagger      -- a CRF++ Tagger already loaded with a trained model
    """
    # NOTE(review): the original source was whitespace-mangled (fused
    # keywords such as `forline` / `ifword`); this is the reconstructed,
    # syntactically valid form of the same logic.
    with codecs.open(input_file, 'r', 'utf-8') as input_data, \
         codecs.open(output_file, 'w', 'utf-8') as output_data:
        for line in input_data:
            tagger.clear()
            for word in line.strip():
                word = word.strip()
                if word:
                    # "\to\tb" appends dummy feature/tag columns so each
                    # line matches the column layout CRF++ was trained on.
                    tagger.add((word + "\to\tb").encode('utf-8'))
            tagger.parse()
            size = tagger.size()
            xsize = tagger.xsize()
            for i in range(0, size):
                for j in range(0, xsize):
                    char = tagger.x(i, j).decode('utf-8')
                    tag = tagger.y2(i)
                    # b = word begin, m = middle, e = end, s = single-char
                    # word; spaces are emitted at word boundaries.
                    if tag == 'b':
                        output_data.write(' ' + char)
                    elif tag == 'm':
                        output_data.write(char)
                    elif tag == 'e':
                        output_data.write(char + ' ')
                    else:  # tag == 's'
                        output_data.write(' ' + char + ' ')
            output_data.write('\n')
if __name__ == '__main__':
    # usage: python crf_segmenter.py crf_model test_file result_file
    if len(sys.argv) != 4:
        print("usage: python crf_segmenter.py crf_model test_file result_file")
        sys.exit()
    crf_model = sys.argv[1]
    input_file = sys.argv[2]
    output_file = sys.argv[3]
    # NOTE(review): CRF++'s SWIG Python binding exposes the class as
    # `Tagger` (capital T); the original lowercase `crfpp.tagger(...)`
    # would raise AttributeError — confirm against the local binding.
    tagger = crfpp.Tagger("-m " + crf_model)
    crf_segmenter(input_file, output_file, tagger)
make_crf_train_data.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_crf_train_data.py
# 得到crf++要求的格式的訓練檔案
# 用法:命令列--python dataprocess.py input_file output_file
import sys
import codecs
# 4 tags for character tagging: b(begin), e(end), m(middle), s(single)
def character_4tagging(input_file, output_file):
    """Convert space-segmented text into CRF++ 4-tag training format.

    Each character becomes one "char<TAB>tag" line, with tags
    b (begin), m (middle), e (end), s (single-character word).
    Sentences are separated by a blank line, as CRF++ requires.

    input_file  -- path of the space-segmented corpus (UTF-8)
    output_file -- path of the tagged training file to create (UTF-8)
    """
    with codecs.open(input_file, 'r', 'utf-8') as input_data, \
         codecs.open(output_file, 'w', 'utf-8') as output_data:
        for line in input_data:
            word_list = line.strip().split()
            for word in word_list:
                if len(word) == 1:
                    output_data.write(word + "\ts\n")
                else:
                    output_data.write(word[0] + "\tb\n")
                    for w in word[1:-1]:
                        output_data.write(w + "\tm\n")
                    output_data.write(word[-1] + "\te\n")
            # blank line terminates the sentence for CRF++
            output_data.write("\n")
# 6 tags for character tagging: b(begin), e(end), m(middle), s(single), m1, m2
def character_6tagging(input_file, output_file):
    """Convert space-segmented text into CRF++ 6-tag training format.

    Tags: b (begin), e (end), m (middle), s (single-character word),
    plus m1 for the 2nd character of words of length >= 4 and m2 for
    the 3rd character of words of length >= 5. Sentences are separated
    by a blank line, as CRF++ requires.

    input_file  -- path of the space-segmented corpus (UTF-8)
    output_file -- path of the tagged training file to create (UTF-8)
    """
    with codecs.open(input_file, 'r', 'utf-8') as input_data, \
         codecs.open(output_file, 'w', 'utf-8') as output_data:
        for line in input_data:
            for word in line.strip().split():
                n = len(word)
                if n == 1:
                    output_data.write(word + "\ts\n")
                elif n == 2:
                    output_data.write(word[0] + "\tb\n")
                    output_data.write(word[1] + "\te\n")
                elif n == 3:
                    output_data.write(word[0] + "\tb\n")
                    output_data.write(word[1] + "\tm\n")
                    output_data.write(word[2] + "\te\n")
                else:
                    # n >= 4: b, m1, [m2,] m..., e — consolidates the
                    # original per-length branches into one equivalent path.
                    output_data.write(word[0] + "\tb\n")
                    output_data.write(word[1] + "\tm1\n")
                    if n == 4:
                        output_data.write(word[2] + "\tm\n")
                    else:
                        output_data.write(word[2] + "\tm2\n")
                        for w in word[3:-1]:
                            output_data.write(w + "\tm\n")
                    output_data.write(word[-1] + "\te\n")
            # blank line terminates the sentence for CRF++
            output_data.write("\n")
if __name__ == '__main__':
    # usage: python dataprocess.py input_file output_file
    if len(sys.argv) != 3:
        print("usage: python dataprocess.py inputfile outputfile")
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    # the 4-tag scheme is used by default; character_6tagging is available
    # for the richer b/m1/m2/m/e/s tag set.
    character_4tagging(input_file, output_file)
運用CRF技術進行簡單分詞
input data codecs.open pku training,r utf 8 output data codecs.open pku training out,w utf 8 for line in input data.readlines word list line.strip spl...
CRF中文分詞開源版發布啦
crf中文分詞開源版發布啦 langiner gmail.com 中文分詞經過艱苦的研發,終於發布了。中文分詞是網際網路應用不可缺少的基礎技術之一,也是語音和語言產品必不可少的技術元件。自2003年第一屆國際中文分詞評測以來,由字構詞的分詞方法獲得了壓倒性優勢,國內主要通過crf 開源軟體包來學習該...
CRF 及CRF 安裝與解釋
conditional random field 條件隨機場,一種機器學習技術 模型 crf由john lafferty最早用於nlp技術領域,其在nlp技術領域中主要用於文字標註,並有多種應用場景,例如 本文主要描述如何使用crf技術來進行中文分詞。1.crf把分詞當做字的詞位分類問題,通常定義字...