from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools
import math
import os
import numpy as np
import tensorflow as tf
from tensorflow.contrib import skflow
## Training data
CORPUS_FILENAME = "ner/train_data.txt"
MAX_DOC_LENGTH = 50
# Lines are kept whole here; splitting into characters is handled by the
# tokenizer below.
def tokenizer(iterator):
    # Yield each document as a list of individual characters
    for value in iterator:
        yield list(value)
def training_data(path):
    X = []
    fp = open(path, "r")
    for line in fp.readlines():
        line = line.strip()
        if not line:
            continue
        # Note: the string is split into individual characters here
        origin = list(line.decode("utf-8"))
        if len(origin) >= MAX_DOC_LENGTH:
            origin = origin[:MAX_DOC_LENGTH - 1]
        # Append a special token marking the end of the sentence
        X.append(origin + ["<EOS/>"])
    return np.array(X)
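# Example (hypothetical input): the line u"北京市" becomes
# [u"北", u"京", u"市", u"<EOS/>"].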
def iter_docs(docs):
    for doc in docs:
        n_parts = int(math.ceil(float(len(doc)) / MAX_DOC_LENGTH))
        for part in range(n_parts):
            offset_begin = part * MAX_DOC_LENGTH
            offset_end = offset_begin + MAX_DOC_LENGTH
            inp = np.zeros(MAX_DOC_LENGTH, dtype=np.int32)
            out = np.zeros(MAX_DOC_LENGTH, dtype=np.int32)
            # The output is just the input shifted by one character
            # (next-character targets)
            inp[:min(offset_end - offset_begin, len(doc) - offset_begin)] = \
                doc[offset_begin:offset_end]
            out[:min(offset_end - offset_begin, len(doc) - offset_begin - 1)] = \
                doc[offset_begin + 1:offset_end + 1]
            yield inp, out
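# Illustration (values hypothetical): for doc = [5, 8, 2] and MAX_DOC_LENGTH = 4,
# iter_docs would yield inp = [5, 8, 2, 0] and out = [8, 2, 0, 0] -- the
# zero-padded next-character targets.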
def unpack_xy(iter_obj):
    # Duplicate the iterator so inputs and targets can be consumed separately
    X, y = itertools.tee(iter_obj)
    return (item[0] for item in X), (item[1] for item in y)
# Map unicode characters (and <EOS/>) to integers 1..K; characters appearing
# fewer than 2 times are dropped (mapped to 0)
vocab_processor = skflow.preprocessing.VocabularyProcessor(
    MAX_DOC_LENGTH, min_frequency=2, tokenizer_fn=tokenizer)
datao = training_data(CORPUS_FILENAME)
# Build the character-to-id mapping
vocab_processor.fit(datao)
# Save the vocabulary; the online serving process needs it
fp = open('ner/vocab.txt', 'w')
for k, v in vocab_processor.vocabulary_._mapping.iteritems():
    fp.write("%s\t%d\n" % (k.encode("utf-8"), v))
fp.close()
n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)
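# A minimal loader sketch (not in the original script) for the serving side:
# it rebuilds the character-to-id mapping from the ner/vocab.txt dump written
# above. The tab-separated format mirrors the write loop; `load_vocab` is an
# assumed helper name.
def load_vocab(path='ner/vocab.txt'):
    mapping = {}
    for line in open(path, 'r'):
        k, v = line.rstrip('\n').split('\t')
        mapping[k.decode('utf-8')] = int(v)
    return mapping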
## Model
HIDDEN_SIZE = 874
def get_language_model(hidden_size):
    """Returns a language model with the given hidden size."""
    def language_model(X, y):
        # Turn character ids into one-hot vectors
        inputs = skflow.ops.one_hot_matrix(X, n_words)
        # Alternatively, use learned embeddings:
        # inputs = skflow.ops.categorical_variable(X,
        #     n_classes=n_words, embedding_size=50, name='words')
        # Split into a list of tensors so the RNN can be unrolled
        inputs = skflow.ops.split_squeeze(1, MAX_DOC_LENGTH, inputs)
        target = skflow.ops.split_squeeze(1, MAX_DOC_LENGTH, y)
        # The encoder cell of the RNN; GRUCell can be swapped for an LSTM cell
        encoder_cell = tf.nn.rnn_cell.OutputProjectionWrapper(
            tf.nn.rnn_cell.GRUCell(hidden_size), n_words)
        # Keep only the outputs; the final state is discarded
        output, _ = tf.nn.rnn(encoder_cell, inputs, dtype=tf.float32)
        # skflow's built-in sequence classifier; the loss is the sum of
        # per-step cross-entropies
        return skflow.ops.sequence_classifier(output, target)
    return language_model
def exp_decay(global_step):
    return tf.train.exponential_decay(0.001, global_step, 5000, 0.5, staircase=True)
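# With staircase=True the learning rate halves every 5000 global steps:
# 0.001 for steps 0-4999, then 0.0005, then 0.00025, and so on.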
## Training model
model_path = "ner/address_logs"
if os.path.exists(model_path):
    estimator = skflow.TensorFlowEstimator.restore(model_path)
else:
    estimator = skflow.TensorFlowEstimator(model_fn=get_language_model(HIDDEN_SIZE),
                                           n_classes=n_words,
                                           optimizer='Adam',
                                           learning_rate=exp_decay,
                                           steps=16273,
                                           batch_size=64,
                                           continue_training=True)
# Continuously train; checkpoint after each pass, save and stop on Ctrl-C
while True:
    try:
        # Reshuffle the training sentences on each pass
        perm = np.random.permutation(len(datao))
        datao = datao[perm]
        data = vocab_processor.transform(datao)
        X, y = unpack_xy(iter_docs(data))
        estimator.fit(X, y, logdir=model_path)
        estimator.save(model_path)
    except KeyboardInterrupt:
        estimator.save(model_path)
        break
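# Sketch (an assumption, not part of the original script): after training, the
# estimator could score new sentences prepared the same way as the training
# data; the test file path below is hypothetical.
# test_docs = training_data("ner/test_data.txt")
# test_X, _ = unpack_xy(iter_docs(vocab_processor.transform(test_docs)))
# predictions = estimator.predict(np.array(list(test_X)))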