import sys
import os
import re
from pyltp import sentencesplitter, segmentor, postagger, parser, namedentityrecognizer, sementicrolelabeller
rootdir = 'd:/users/liang/pycharmprojects/analysiscase/'
data = os.path.join(rootdir, 'data/')
model = os.path.join(rootdir, 'model/ltp_data_v3.4.0/')
result = os.path.join(rootdir, 'result/')
# 名字轉換器
def get_seg(filename: str) -> str:
    """Return the segmentation-result file name for ``filename``.

    The first run of decimal digits found in ``filename`` is used as the
    numeric id, e.g. ``'record-12.txtoriginal.txt'`` -> ``'seg-12.txt'``.

    Raises:
        IndexError: if ``filename`` contains no digits.
    """
    # Raw string: in a plain literal '\d' is an invalid escape sequence.
    digit_runs = re.findall(r'\d+', filename)
    return 'seg-{}.txt'.format(digit_runs[0])
def get_postag(filename: str) -> str:
    """Return the POS-tag-result file name for ``filename``.

    The first run of decimal digits found in ``filename`` is used as the
    numeric id, e.g. ``'record-12.txtoriginal.txt'`` -> ``'postag-12.txt'``.

    Raises:
        IndexError: if ``filename`` contains no digits.
    """
    # Raw string: in a plain literal '\d' is an invalid escape sequence.
    digit_runs = re.findall(r'\d+', filename)
    return 'postag-{}.txt'.format(digit_runs[0])
def get_txt(filename: str) -> str:
    """Return ``filename`` with every ``'.txtoriginal'`` substring removed.

    ``process_pre`` uses the result as the name of the annotation file that
    sits next to the raw-text file, e.g.
    ``'record-1.txtoriginal.txt'`` -> ``'record-1.txt'``.
    A name that does not contain ``'.txtoriginal'`` is returned unchanged.
    """
    return filename.replace('.txtoriginal', '')
def get_train(filename: str) -> str:
    """Return the training-data file name derived from ``filename``.

    Every ``'.txtoriginal'`` substring is replaced by ``'_train_data'``, e.g.
    ``'record-1.txtoriginal.txt'`` -> ``'record-1_train_data.txt'``.
    A name that does not contain ``'.txtoriginal'`` is returned unchanged.
    """
    return filename.replace('.txtoriginal', '_train_data')
# 檔案路徑 結果路徑 檔名
# Entity-category label as it appears in the 4th column of the annotation
# file -> tag appended to a matching token in the training data.
# NOTE(review): the '**' key is reproduced verbatim from the original code;
# it looks like a masked label - confirm against the real annotation files.
_ENTITY_TAGS = {
    '症狀和體徵': '#s-nss ',
    '檢查和檢驗': '#s-nii ',
    '疾病和診斷': '#s-ndd ',
    '**': '#s-nt ',
    '身體部位': '#s-npb ',
}


def _segment_and_tag(lines, lexicon_path):
    """Segment and POS-tag ``lines``; return ``(seg_text, postag_text)``.

    In both returned strings every word (or tag) is preceded by a tab and
    every input line is terminated by ``'\\n'``.
    """
    # NOTE(review): `segmentor` / `postagger` are used exactly as imported at
    # the top of this file; upstream pyltp spells the classes `Segmentor` /
    # `Postagger` - confirm the import line.
    # Local names differ from the imported ones on purpose: rebinding
    # `segmentor` inside a function makes it local and the constructor call
    # then fails with UnboundLocalError.
    seg_model = segmentor()
    # NOTE(review): the second argument is the lexicon path; the original code
    # passes the data file name here, which is kept - confirm it is intended.
    seg_model.load_with_lexicon(os.path.join(model, 'cws.model'), lexicon_path)
    try:
        pos_model = postagger()
        pos_model.load(os.path.join(model, 'pos.model'))
        try:
            seg_chunks = []
            pos_chunks = []
            # Models are loaded once and reused for every line.
            for line in lines:
                words = seg_model.segment(line)
                seg_chunks.append(''.join('\t' + word for word in words) + '\n')
                tags = pos_model.postag(words)
                pos_chunks.append(''.join('\t' + tag for tag in tags) + '\n')
        finally:
            pos_model.release()
    finally:
        seg_model.release()
    return ''.join(seg_chunks), ''.join(pos_chunks)


def _entity_tag_lookup(annotation_lines, raw_text):
    """Map annotated entity text to its training tag (first annotation wins).

    Each annotation line is tab separated: ``text, start, end, label`` where
    ``start``/``end`` are inclusive character offsets into ``raw_text``.
    An entity whose label is not in ``_ENTITY_TAGS`` maps to ``''`` so that a
    matching token receives neither an entity tag nor ``'#o '``.
    """
    lookup = {}
    for line in annotation_lines:
        fields = line.split('\t')
        if len(fields) < 4:
            # Blank or truncated line: nothing to match against.
            continue
        start, end, label = int(fields[1]), int(fields[2]), fields[3]
        if label.endswith('\n'):
            label = label[:-1]
        lookup.setdefault(raw_text[start:end + 1], _ENTITY_TAGS.get(label, ''))
    return lookup


def _combine(seg_text, postag_text, entity_tags):
    """Merge words, POS tags and entity tags into the training-data string.

    Every token is written as ``word/postag`` followed by its entity tag, or
    by ``'#o '`` when the token text matches no annotated entity.
    """
    tokens = seg_text.split('\t')
    pos_tags = postag_text.split('\t')
    last = len(tokens) - 1
    pieces = []
    for i, token in enumerate(tokens):
        if i == 0:
            # Text before the first tab is always empty.
            continue
        # NOTE(review): the original skips the word/postag pair and the '#o '
        # marker for the final token; that output format is preserved here.
        is_last = i == last
        if not is_last:
            # Positional lookup: words and tags are aligned one-to-one, so the
            # tag of *this* occurrence is used even when a word repeats.
            pieces.append(token + '/' + pos_tags[i])
        tag = entity_tags.get(token)
        if tag is not None:
            pieces.append(tag)
        elif not is_last:
            pieces.append('#o ')
    return ''.join(pieces)


def process_pre(data_dir, result_dir, result_dir2, filename) -> None:
    """Segment, POS-tag and entity-tag one raw-text file.

    Args:
        data_dir: directory holding the raw text ``filename`` and its
            annotation file ``get_txt(filename)``.
        result_dir: directory receiving ``get_seg(filename)`` and
            ``get_postag(filename)``.
        result_dir2: directory receiving ``get_train(filename)``.
        filename: name of the raw-text file inside ``data_dir``.

    Raises:
        IndexError: if ``filename`` contains no digits (see ``get_seg``).
        ValueError: if an annotation offset is not an integer.
        OSError: if an input file cannot be read or an output written.
    """
    # Resolve every path first so a bad file name fails before model loading.
    seg_path = os.path.join(result_dir, get_seg(filename))
    postag_path = os.path.join(result_dir, get_postag(filename))
    annotation_path = os.path.join(data_dir, get_txt(filename))
    train_path = os.path.join(result_dir2, get_train(filename))

    with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as src:
        lines = src.readlines()
    raw_text = ''.join(lines)

    with open(annotation_path, 'r', encoding='utf-8') as annotations:
        entity_tags = _entity_tag_lookup(annotations, raw_text)

    seg_text, postag_text = _segment_and_tag(lines, filename)

    for out_dir in (result_dir, result_dir2):
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    with open(seg_path, 'w', encoding='utf-8') as out:
        out.write(seg_text)
    with open(postag_path, 'w', encoding='utf-8') as out:
        out.write(postag_text)
    with open(train_path, 'w', encoding='utf-8') as out:
        out.write(_combine(seg_text, postag_text, entity_tags))
def main() -> None:
    """Run ``process_pre`` on every raw-text file found below ``data``.

    Results for a file in ``data/<sub>/`` are written to
    ``result/one/<sub>/`` and ``result/two/<sub>/``.
    """
    for root, _dirs, files in os.walk(data):
        print(root)
        # Sub-path of this folder relative to the data root ('.' for the root
        # itself); works for nested folders and for either path separator.
        relative = os.path.relpath(root, data)
        seg_dir = os.path.normpath(os.path.join(result, 'one', relative))
        train_dir = os.path.normpath(os.path.join(result, 'two', relative))
        # Select raw-text files by name rather than by position: os.walk does
        # not guarantee any ordering of `files`. The annotation file is derived
        # from the raw-text name by dropping '.txtoriginal'.
        for item in sorted(files):
            if '.txtoriginal' not in item:
                continue
            process_pre(root, seg_dir, train_dir, item)


if __name__ == '__main__':
    main()
python進行檔案操作
什麼是檔案?檔案是系統儲存區域的一個命名位置,用來儲存一些資訊,便於後續訪問。它能夠在非易失性儲存器(比如硬碟)中實現持續性儲存。當我們要讀取或者寫入檔案時,需要先開啟檔案;在操作完畢時,需要關閉檔案,以便釋放和檔案操作相關的系統資源。因此,檔案操作主要包括以下步驟:開啟檔案…… python使用內...
用Python寫批處理
import oscommand labelme json to dataset json os.system command 今天在製作使用經labelme標註過的json檔案資料來源的時候,使用到了這個例項,遇到了乙個問題,在command中,如果含有括號的話,切記要記得加引號 coding u...
用批處理進行進製轉換
echo off setlocal enabledelayedexpansion set p a 請輸入要轉換的十進位制數 set aa a set p b 請輸入要轉換的幾進製?set str 0123456789abcde hex set a m a b set a n a b set n st...