train.py

import pathlib

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model

from layers.decoder import TransformerDecoder
from layers.embedding import PositionalEmbedding
from layers.encoder import TransformerEncoder
from preprocessing import (
    build_vectorizers,
    create_dataset,
    download_dataset,
    split_dataset,
)

tf.keras.utils.set_random_seed(42)


def main():
    url = "http://www.manythings.org/anki/tur-eng.zip"
    dataset_path = pathlib.Path("dataset", "tur.txt")

    # Hyperparameters
    vocab_size = 30000  # vocabulary size of the vectorizers
    embed_dim = 256  # output dimension of the embedding layers
    max_length = 100  # maximum sentence length in tokens
    dense_units = 2048  # units in the feed-forward (dense) layers
    num_heads = 8  # attention heads per MultiHeadAttention layer
    n = 1  # number of stacked encoder/decoder layers
    dropout = 0.5  # dropout rate before the output layer
    num_parallel_calls = 4  # parallel calls for dataset preprocessing

    # Download the corpus and split it into train/validation/test pairs.
    download_dataset(url, "dataset")
    train_pairs, val_pairs, test_pairs = split_dataset(dataset_path)

    # Build the source/target vectorizers on the training pairs only.
    source_vectorizer, target_vectorizer = build_vectorizers(
        train_pairs, vocab_size, max_length
    )

    # Build a tf.data pipeline for each split.
    train_ds = create_dataset(
        train_pairs,
        (source_vectorizer, target_vectorizer),
        num_parallel_calls=num_parallel_calls,
    )
    val_ds = create_dataset(
        val_pairs,
        (source_vectorizer, target_vectorizer),
        num_parallel_calls=num_parallel_calls,
    )
    test_ds = create_dataset(
        test_pairs,
        (source_vectorizer, target_vectorizer),
        num_parallel_calls=num_parallel_calls,
    )

    # Build and compile the transformer.
    transformer = get_model(
        vocab_size, embed_dim, max_length, dense_units, num_heads, n, dropout
    )
    transformer.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
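    # Note: sparse_categorical_crossentropy consumes integer token ids directly,
    # so the vectorized targets need no one-hot encoding.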

    # Model results after the last epoch of training:
    # loss: 0.0607 - accuracy: 0.7840 - val_loss: 0.1576 - val_accuracy: 0.6381
    transformer.fit(train_ds, epochs=10, validation_data=val_ds)

    # Model results on the test set:
    # loss: 0.1584 - accuracy: 0.6382
    transformer.evaluate(test_ds)

    transformer.save("model/translator.h5")
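
    # To reload the saved model elsewhere, Keras needs the custom layer classes
    # passed via `custom_objects` (a sketch; assumes the layer classes are
    # importable exactly as at the top of this file):
    #
    #   from tensorflow.keras.models import load_model
    #   translator = load_model(
    #       "model/translator.h5",
    #       custom_objects={
    #           "PositionalEmbedding": PositionalEmbedding,
    #           "TransformerEncoder": TransformerEncoder,
    #           "TransformerDecoder": TransformerDecoder,
    #       },
    #   )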


def get_model(
    vocab_size, embed_dim, max_length, dense_units, num_heads, n=1, dropout=0.5
):
    """
    Builds and returns the model according to the given parameters.

    Parameters
    ----------
    vocab_size : int
        Vocabulary size of the dataset
    embed_dim : int
        Output dimension of the embedding layer
    max_length : int
        Maximum length for sentences
    dense_units : int
        Number of units used inside the dense layers
    num_heads : int
        Number of heads for MultiHeadAttention
    n : int, default=1
        Number of layers stacked for both the encoder and the decoder
    dropout : float, default=0.5
        Dropout rate applied before the output layer

    Returns
    -------
    An instance of tf.keras.models.Model
    """
    # Encoder: embed the source tokens and pass them through n encoder blocks.
    encoder_inputs = Input(shape=(None,), name="english")
    embedding = PositionalEmbedding(vocab_size, embed_dim, max_length)(encoder_inputs)
    encoder_outputs = TransformerEncoder(num_heads, embed_dim, dense_units)(embedding)
    for _ in range(n - 1):
        encoder_outputs = TransformerEncoder(num_heads, embed_dim, dense_units)(
            encoder_outputs
        )

    # Decoder: embed the target tokens and pass them, together with the
    # encoder outputs, through n decoder blocks.
    decoder_inputs = Input(shape=(None,), name="turkish")
    embedding = PositionalEmbedding(vocab_size, embed_dim, max_length)(decoder_inputs)
    decoder_outputs = TransformerDecoder(num_heads, embed_dim, dense_units)(
        embedding, encoder_outputs
    )
    for _ in range(n - 1):
        decoder_outputs = TransformerDecoder(num_heads, embed_dim, dense_units)(
            decoder_outputs, encoder_outputs
        )

    # Renamed to `x` to avoid shadowing the `dropout` rate argument.
    x = Dropout(dropout)(decoder_outputs)
    dense_outputs = Dense(vocab_size, activation="softmax")(x)
    transformer = Model([encoder_inputs, decoder_inputs], dense_outputs)
    return transformer
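
# Example usage of get_model (a sketch; the tiny sizes are illustrative and
# not the training configuration used in main()):
#   model = get_model(
#       vocab_size=1000, embed_dim=64, max_length=50, dense_units=256, num_heads=4
#   )
#   model.summary()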


if __name__ == "__main__":
    main()