LSTM Chinese Word Segmentation

2021-09-26 21:23:27

Preprocessing and training:
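The code below assumes some setup that this excerpt does not show: imports, hyperparameters, the character-to-id mapping, and the 5-tag labeling scheme. A minimal sketch follows; the corpus path comes from the load_data calls further down, while the hyperparameter values and the way the vocabulary is built are assumptions, not necessarily the author's exact choices.

import re
import numpy as np
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM, Dropout, TimeDistributed, Bidirectional
from keras.utils import np_utils

# hyperparameters (assumed values)
maxlen = 32          # truncate or pad every sentence to 32 characters
embedding_size = 128
hidden_size = 64
batch_size = 64
epochs = 2

# character vocabulary and id mappings built from the training corpus;
# id 0 is reserved for padding
text = open('data/msr/msr_training.utf8', encoding='utf-8').read()
vocab = sorted(set(text.replace(' ', '').replace('\n', '')))
char2id = {c: i + 1 for i, c in enumerate(vocab)}
id2char = {i + 1: c for i, c in enumerate(vocab)}

# tagging scheme: s = single-char word, b/m/e = begin/middle/end of a word, x = padding
tags = {'s': 0, 'b': 1, 'm': 2, 'e': 3, 'x': 4}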

def load_data(path):
    data = open(path, encoding='utf-8').read().rstrip('\n')
    # words are space-separated in the corpus; split into short sentences on punctuation
    data = re.split('[,。!?、\n]', data)
    print('Total sentences: %d' % len(data))
    print('Average length: %d' % np.mean([len(d.replace(' ', '')) for d in data]))

    x_data = []
    y_data = []
    for sentence in data:
        sentence = sentence.split(' ')
        x = []
        y = []
        try:
            for s in sentence:
                s = s.strip()
                if len(s) == 0:
                    continue
                elif len(s) == 1:
                    # single-character word
                    x.append(char2id[s])
                    y.append(tags['s'])
                elif len(s) > 1:
                    # multi-character word: begin / middle(s) / end
                    x.append(char2id[s[0]])
                    y.append(tags['b'])
                    for i in range(1, len(s) - 1):
                        x.append(char2id[s[i]])
                        y.append(tags['m'])
                    x.append(char2id[s[-1]])
                    y.append(tags['e'])

            # truncate or pad to the fixed length
            if len(x) > maxlen:
                x = x[:maxlen]
                y = y[:maxlen]
            else:
                for i in range(maxlen - len(x)):
                    x.append(0)
                    y.append(tags['x'])
        except KeyError:
            # skip sentences containing out-of-vocabulary characters
            continue
        else:
            if len(x) > 0:
                x_data.append(x)
                y_data.append(y)

    x_data = np.array(x_data)
    y_data = np_utils.to_categorical(y_data, 5)
    return x_data, y_data

x_train, y_train = load_data('data/msr/msr_training.utf8')
x_test, y_test = load_data('data/msr/msr_test_gold.utf8')
print('x_train size:', x_train.shape)
print('y_train size:', y_train.shape)
print('x_test size:', x_test.shape)
print('y_test size:', y_test.shape)

# two stacked bidirectional LSTM layers with a per-character softmax over the 5 tags
X = Input(shape=(maxlen,), dtype='int32', name='input')
embedding = Embedding(input_dim=len(vocab) + 1, output_dim=embedding_size,
                      input_length=maxlen, mask_zero=True)(X)
blstm = Bidirectional(LSTM(hidden_size, return_sequences=True), merge_mode='concat')(embedding)
blstm = Dropout(0.6)(blstm)
blstm = Bidirectional(LSTM(hidden_size, return_sequences=True), merge_mode='concat')(blstm)
blstm = Dropout(0.6)(blstm)
output = TimeDistributed(Dense(5, activation='softmax'))(blstm)

model = Model(X, output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)
print(model.evaluate(x_train, y_train, batch_size=batch_size))
print(model.evaluate(x_test, y_test, batch_size=batch_size))
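Training takes long enough that the weights are worth persisting. The standard Keras save/load calls work here; the file name is just a placeholder, not from the original text.

model.save('msr_bilstm.h5')  # persists architecture + weights
# later, e.g. in a separate inference script:
# from keras.models import load_model
# model = load_model('msr_bilstm.h5')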

def viterbi(nodes):
    # nodes[k] maps each tag to its predicted score at position k.
    # Only these tag bigrams are legal under the s/b/m/e scheme;
    # all legal transitions are weighted equally here.
    trans = {'be': 0.5, 'bm': 0.5, 'eb': 0.5, 'es': 0.5,
             'me': 0.5, 'mm': 0.5, 'sb': 0.5, 'ss': 0.5}
    # a sentence can only start with b or s
    paths = {'b': nodes[0]['b'], 's': nodes[0]['s']}
    for l in range(1, len(nodes)):
        paths_ = paths.copy()
        paths = {}
        for i in nodes[l].keys():
            nows = {}
            for j in paths_.keys():
                if j[-1] + i in trans.keys():
                    nows[j + i] = paths_[j] + nodes[l][i] + trans[j[-1] + i]
            # keep only the best path ending in tag i
            nows = sorted(nows.items(), key=lambda x: x[1], reverse=True)
            paths[nows[0][0]] = nows[0][1]
    paths = sorted(paths.items(), key=lambda x: x[1], reverse=True)
    return paths[0][0]
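A quick sanity check on hand-made scores (the numbers below are made up, not model output): for a two-character input whose first character strongly prefers b and whose second prefers e, the decoder should return the legal sequence 'be' rather than an illegal one like 'bb'.

toy_nodes = [{'s': 0.1, 'b': 0.8, 'm': 0.05, 'e': 0.05},
             {'s': 0.1, 'b': 0.1, 'm': 0.1, 'e': 0.7}]
print(viterbi(toy_nodes))  # -> 'be'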

def cut_words(data):
    # split the input into short sentences on punctuation, as in training
    data = re.split('[,。!?、\n]', data)
    sens = []
    xs = []
    for sentence in data:
        sen = []
        x = []
        sentence = list(sentence)
        for s in sentence:
            s = s.strip()
            if not s == '' and s in char2id:
                sen.append(s)
                x.append(char2id[s])
        # truncate or pad to the fixed length used in training
        if len(x) > maxlen:
            sen = sen[:maxlen]
            x = x[:maxlen]
        else:
            for i in range(maxlen - len(x)):
                x.append(0)
        if len(sen) > 0:
            sens.append(sen)
            xs.append(x)

    xs = np.array(xs)
    ys = model.predict(xs)

    results = ''
    for i in range(ys.shape[0]):
        # keep the four real tags and drop the padding tag before decoding
        nodes = [dict(zip(['s', 'b', 'm', 'e'], d[:4])) for d in ys[i]]
        ts = viterbi(nodes)
        # insert a separator after each word-final (s or e) character
        for j in range(len(sens[i])):
            if ts[j] in ['s', 'e']:
                results += sens[i][j] + '/'
            else:
                results += sens[i][j]
    return results[:-1]

# sample inputs; the last one is a classic ambiguity test, where 的和尚未
# can be mis-segmented around 和尚 ("monk")
print(cut_words('中國共產黨第十九次全國代表大會,是在全面建成小康社會決勝階段、中國特色社會主義進入新時代的關鍵時期召開的一次十分重要的大會。'))
print(cut_words('把這本書推薦給,具有一定程式設計基礎,希望了解資料分析、人工智慧等知識領域,進一步提升個人技術能力的社會各界人士。'))
print(cut_words('結婚的和尚未結婚的。'))

Source: Zhang Honglun, 《深度有趣》.
