-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
58 lines (41 loc) · 1.69 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
import numpy as np
import argparse
# Command-line interface: the only option is the path to the training corpus.
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--data", required=True, help="path to the data file")
args = vars(ap.parse_args())

tokenizer = Tokenizer()

# Read the whole corpus up front. The context manager closes the file handle
# as soon as the text is in memory instead of leaving it open.
# NOTE(review): the file is still decoded with the platform default encoding;
# consider passing an explicit encoding if corpora move between machines.
with open(args["data"]) as data_file:
    data = data_file.read()

# Each line of the file, lower-cased, is treated as one training text.
corpus = data.lower().split("\n")
tokenizer.fit_on_texts(corpus)
# One more than the number of distinct words; presumably this leaves room for
# index 0, which the padding below fills in -- confirm against Tokenizer docs.
total_words = len(tokenizer.word_index) + 1

# Expand every tokenized line into all of its prefixes of length >= 2, e.g.
# [a, b, c] -> [a, b], [a, b, c]. Lines with fewer than two tokens add nothing.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(corpus):
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Fail with a clear message instead of max() complaining about an empty
# sequence when the corpus yields no usable n-grams.
if not input_sequences:
    raise ValueError(
        "no training sequences could be built: the data file needs at "
        "least one line containing two or more tokens"
    )

# pad sequences
# Pad at the front so the last column always holds each prefix's final token.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(pad_sequences(
    input_sequences, maxlen=max_sequence_len, padding='pre'))

# create predictors and label
# Everything but the last column is the model input; the last column is the
# word to predict, one-hot encoded over the vocabulary.
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
def create_model(total_words: int, max_sequence_len: int):
    """Build and compile the next-word prediction network.

    The model stacks an embedding layer, a bidirectional LSTM, and a softmax
    layer with one output unit per vocabulary entry, then prints its summary.

    Args:
        total_words: Vocabulary size; used as the embedding input dimension
            and as the number of softmax outputs.
        max_sequence_len: Length of the padded sequences including the label
            position; the embedding input length is one less than this.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))
    model.add(Bidirectional(LSTM(150)))
    model.add(Dense(total_words, activation="softmax"))
    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line after the summary.
    model.summary()
    return model
# Build the network sized to the fitted vocabulary and padded sequence length.
model = create_model(total_words, max_sequence_len)

# Train on the full set of n-gram prefixes; the returned history object holds
# the per-epoch metrics reported by fit().
history = model.fit(
    x=xs,
    y=ys,
    epochs=50,
    verbose=1,
)

# Persist the trained model to the working directory.
model.save("model.h5")