-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathspeech_model.py
253 lines (185 loc) · 10.7 KB
/
speech_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import abc
import math
import tensorflow as tf
from tensorflow.contrib.layers import xavier_initializer
from speechInput import BaseInputLoader
class SpeechModel:
def __init__(self, input_loader: BaseInputLoader, input_size: int, num_classes: int):
self.input_loader = input_loader
self.input_size = input_size
self.convolution_count = 0
self.global_step = tf.Variable(0, trainable=False)
self.inputs, self.sequence_lengths, self.labels = input_loader.get_inputs()
self.logits = self._create_network(num_classes)
tf.summary.image('logits', tf.expand_dims(tf.transpose(self.logits, (1, 2, 0)), 3))
tf.summary.histogram('logits', self.logits)
def add_training_ops(self, learning_rate: bool = 1e-3, learning_rate_decay_factor: float = 0, max_gradient_norm: float = 5.0, momentum: float = 0.9):
"""
Hyperparamaters for training
learning rate = 1e-3 (need to experiment with => this gave the best results so far)
learning rate decay factor = the factor to multiply the learning rate with when you need to decrease it
max gradient norm = maximum gradient norm to apply, otherwise clipping is needed
momentum = momentum for the optimizer (default set to 0.9)
"""
self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32, name='learning_rate')
self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * learning_rate_decay_factor)
"""
Variable summaries
"""
tf.summary.scalar('learning_rate', self.learning_rate)
# Loss and optimizer
if self.labels is not None:
with tf.name_scope('training'):
self.cost = tf.nn.ctc_loss(self.labels, self.logits, self.sequence_lengths // 2) #using ctc loss (refer to ctc and wav2letter paper)
self.avg_loss = tf.reduce_mean(self.cost, name='average_loss')
tf.summary.scalar('loss', self.avg_loss)
optimizer = tf.train.AdamOptimizer(self.learning_rate, epsilon=1e-3) #using AdamOptimizer {gave the best results}
gvs = optimizer.compute_gradients(self.avg_loss)
gradients, trainables = zip(*gvs)
clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient_norm, name='clip_gradients')
self.update = optimizer.apply_gradients(zip(clipped_gradients, trainables), global_step=self.global_step, name='apply_gradients')
def add_decoding_ops(self, language_model: str = None, lm_weight: float = 0.8, word_count_weight: float = 0.0,
valid_word_count_weight: float = 2.3):
"""
For decoding the sound wave and predicting.
language model : we are not using any langugage model but we can use models like kenlm if needed and use
a beam search decoder instead of a greedy decoder for ctc
word count weight : Weight added for each word
valid word count weight : weight added for each word in vocabulary
lm weight : weight multiplied with the language model scoring
"""
with tf.name_scope('decoding'):
# These are only needed if we use a language model
self.lm_weight = tf.placeholder_with_default(lm_weight, shape=(), name='language_model_weight')
self.word_count_weight = tf.placeholder_with_default(word_count_weight, shape=(), name='word_count_weight')
self.valid_word_count_weight = tf.placeholder_with_default(valid_word_count_weight, shape=(), name='valid_word_count_weight')
# use a ctc greedy decoder to determine the decoded string
"""
If merge repeated = True, this is how greedy decoder works:
Sequence: A B B * B * B ( where * is the blank label ) becomes A B B B
If merge repeated = False, it becomes A B B B B (so merging is not done)
For my case, I want it to merge to decode the string.
"""
self.decoded, self.log_probabilities = tf.nn.ctc_greedy_decoder(self.logits, self.sequence_lengths // 2, merge_repeated=True)
def finalize(self, log_dir: str, run_name: str, run_type: str):
self.init = tf.global_variables_initializer()
self.saver = tf.train.Saver(tf.global_variables()) # Saves and restores variables to and from checkpoints
# Create summary writers for logging
self.merged_summaries = tf.summary.merge_all()
self.summary_writer = tf.summary.FileWriter('{}/{}_{}'.format(log_dir, run_name, run_type))
def _convolution(self, value, filter_width, stride, input_channels, out_channels, apply_non_linearity=True):
"""
Builds the convolutional layers (wav2letter paper)
value: input tensor to apply the convolution on (A 3D tensor => float16, float32, float64)
filter width : width of the filter (kernel)
stride : striding of the filter (kernel) (An integer => no. of entries by which the filter is moved right at each step)
input channels : number of input channels
output channels : number of output channels
apply non linearity : whether to apply a non linearity
Returns the convolutional output (adds biases)
"""
layer_id = self.convolution_count
self.convolution_count += 1
with tf.variable_scope('convolution_layer_{}'.format(layer_id)) as layer:
# Creates variables filter and bias
filters = tf.get_variable('filters', shape=[filter_width, input_channels, out_channels], dtype=tf.float32, initializer=xavier_initializer())
bias = tf.Variable(tf.constant(0.0, shape=[out_channels]), name='bias')
# Applies convolution (using a 1d standard convNN) padding = 'SAME',
# Builds a 1-D convolution given 3-D input and filter tensors.
convolution_out = tf.nn.conv1d(value, filters, stride, 'SAME', use_cudnn_on_gpu=True, name='convolution')
with tf.name_scope('summaries'):
# add depth of 1 makes the space [filter width, input channels, 1, output channels]
kernel_with_depth = tf.expand_dims(filters, 2)
# tf.image_summary [batch_Size = output chanels, height = filter width, width = input channels, channels = 1]
kernel_transposed = tf.transpose(kernel_with_depth, [3, 0, 1, 2])
# This will display random 3 filters from all the output channels
tf.summary.image(layer.name + 'filters', kernel_transposed, max_outputs=3)
tf.summary.histogram(layer.name + 'filters', filters)
tf.summary.image(layer.name + 'bias', tf.reshape(bias, [1, 1, out_channels, 1]))
tf.summary.histogram(layer.name + 'bias', bias)
convolution_out = tf.nn.bias_add(convolution_out, bias)
if apply_non_linearity:
# uses relu activation function (best results) to add non-linearity
activations = tf.nn.relu(convolution_out, name='activation')
tf.summary.histogram(layer.name + 'activation', activations)
return activations, out_channels
else:
return convolution_out, out_channels
def init_session(self, sess, init_variables=True):
"""
Initializes a new session for the model.
"""
if init_variables:
sess.run(self.init)
self.summary_writer.add_graph(sess.graph)
def step(self, sess, loss=True, update=True, decode=False, return_label=False, summary=False, feed_dict=None):
"""
sess = tensorflow session
update = if the network is to be trained
loss = if outputing the average loss is required
decode = if decoding is to be done
return label = is the label needs to be return
summary = if the summary needs to be generated
feed dict = additional tensors that can be fed
"""
output_feed = []
if loss:
output_feed.append(self.avg_loss)
if decode:
output_feed.append(self.decoded)
if return_label:
output_feed.append(self.labels)
if update:
output_feed.append(self.update)
if summary:
output_feed.append(self.merged_summaries)
input_feed_dict = self.input_loader.get_feed_dict() or {}
if feed_dict is not None:
input_feed_dict.update(feed_dict)
return sess.run(output_feed, feed_dict=input_feed_dict)
@abc.abstractclassmethod
def _create_network(self, num_classes):
raise NotImplementedError()
def restore(self, session, checkpoint_directory: str, reset_learning_rate: float = None):
ckpt = tf.train.get_checkpoint_state(checkpoint_directory)
if ckpt and ckpt.model_checkpoint_path:
self.saver.restore(session, ckpt.model_checkpoint_path)
self.init_session(session, init_variables=False)
if reset_learning_rate:
session.run(self.learning_rate.assign(reset_learning_rate))
else:
raise FileNotFoundError('No checkpoint for evaluation found')
def restore_or_create(self, session, checkpoint_directory: str, reset_learning_rate: float = None):
try:
self.restore(session, checkpoint_directory, reset_learning_rate)
except FileNotFoundError:
self.init_session(session, init_variables=True)
class Wav2LetterModel(SpeechModel):
def __init__(self, input_loader: BaseInputLoader, input_size: int, num_classes: int):
super().__init__(input_loader, input_size, num_classes)
def _create_network(self, num_classes):
# The first layer scales up from 128 (input size) to 250 channels.
# One striding layer of output size [batch size, maxtime/2, 250]
outputs, channels = self._convolution(self.inputs, 48, 2, self.input_size, 250)
# 7 layers without striding of output size [batch size, max time/2, 250]
for layer_idx in range(7):
outputs, channels = self._convolution(outputs, 7, 1, channels, channels)
# 1 layer with high kernel width and output size [batch size, max time/2, 2000]
outputs, channels = self._convolution(outputs, 32, 1, channels, channels * 8)
# 1 fully connected layer of output size [batch size, max time/2, 2000]
outputs, channels = self._convolution(outputs, 1, 1, channels, channels)
# 1 fully connected layer of output size [batch size, max time/2, number of classes]
# Last layer must have non linearity
outputs, channels = self._convolution(outputs, 1, 1, channels, num_classes, False)
# transpose logits to size [max time/2, batch size, number of classes]
return tf.transpose(outputs, (1, 0, 2))
def create_default_model(command, input_size: int, speech_input: BaseInputLoader) -> SpeechModel:
model = Wav2LetterModel(input_loader=speech_input, input_size=input_size, num_classes=29)
if command == 'train':
model.add_training_ops(learning_rate=flags.learning_rate, learning_rate_decay_factor=flags.learning_rate_decay_factor, max_gradient_norm=flags.max_gradient_norm, momentum=flags.momentum)
model.add_decoding_ops()
else:
model.add_training_ops()
model.add_decoding_ops(language_model=None, lm_weight=0.8, word_count_weight=0.0, valid_word_count_weight=2.3)
model.finalize(log_dir='log', run_name='best-weights', run_type='train')
return model