multi_gpus.py
# coding: utf-8
import os
import sys
import logging
import argparse
import tensorflow as tf

from model import CRNN

## logging config
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)-8s %(asctime)-11s L%(lineno)-4d] %(message)s',
    datefmt='%m-%d %H:%M')
class MultiGpusCRNN(object):

    def __init__(self, n_gpus, optimizer, **kwargs):
        self._n_gpus = n_gpus
        self._optimizer = optimizer
        self._models, self.objective, self.loss = self._multi_models(**kwargs)
        self.predict, self.log_prob = self._predict(self._models)
        self.error = self._error(self._models)

    def feed_dict(self, inputs):
        ## Route one input batch to each GPU tower; every element of `inputs`
        ## must expose `.images` and `.labels`.
        fd = {}
        for i in range(self._n_gpus):
            fd[self._models[i].image] = inputs[i].images
            fd[self._models[i].label] = inputs[i].labels
        return fd
    def _predict(self, models):
        ## Collect the per-tower prediction and log-probability ops.
        pred = [m.predict for m in models]
        prob = [m.log_prob for m in models]
        return pred, prob

    def _error(self, models):
        ## Average the per-tower error across all GPUs.
        error = tf.reduce_mean([m.error for m in models])
        return error
    def _multi_models(self, **kwargs):
        models = []
        ## Build one model tower per GPU and collect its loss and gradients.
        tower_losses = []
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(self._n_gpus):
                with tf.device('/gpu:%d' % i):
                    crnn = CRNN(**kwargs)
                    ## The first tower creates the variables; all later towers reuse them.
                    tf.get_variable_scope().reuse_variables()
                    ## Calculate the gradients for the batch of data on this tower.
                    grads = self._optimizer.compute_gradients(crnn.loss)
                    models.append(crnn)
                    tower_losses.append(crnn.loss)
                    ## Keep track of the gradients across all towers.
                    tower_grads.append(grads)
        loss = tf.reduce_mean(tower_losses)
        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = self._average_gradients(tower_grads)
        # Apply the averaged gradients to adjust the shared variables.
        apply_grad = self._optimizer.apply_gradients(grads)
        return models, apply_grad, loss
    def _average_gradients(self, tower_grads):
        """Calculate the average gradient for each shared variable across all towers.

        Note that this function provides a synchronization point across all towers.

        Args:
            tower_grads: List of lists of (gradient, variable) tuples. The outer
                list is over individual towers. The inner list is over the
                gradients computed for each variable on that tower.

        Returns:
            List of pairs of (gradient, variable) where the gradient has been
            averaged across all towers.
        """
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            # Note that each grad_and_vars looks like the following:
            #   ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN))
            grads = []
            for g, _ in grad_and_vars:
                # Add a 0th dimension to the gradient to represent the tower.
                expanded_g = tf.expand_dims(g, 0)
                # Append on a 'tower' dimension which we will average over below.
                grads.append(expanded_g)
            # Average over the 'tower' dimension.
            grad = tf.concat(axis=0, values=grads)
            grad = tf.reduce_mean(grad, 0)
            # The Variables are redundant because they are shared across towers,
            # so we just return the first tower's pointer to the Variable.
            v = grad_and_vars[0][1]
            grad_and_var = (grad, v)
            average_grads.append(grad_and_var)
        return average_grads
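

## --------------------------------------------------------------------------
## Illustrative usage sketch (an addition, not part of the original script).
## It assumes the CRNN class accepts its constructor options via **kwargs and
## exposes the `image`, `label`, `loss`, `predict`, `log_prob`, and `error`
## attributes read by the wrapper above. The `get_batches(n_gpus)` helper is
## hypothetical: it stands in for a data pipeline that returns one batch
## object per GPU, each with `.images` and `.labels`.
## --------------------------------------------------------------------------
def train_demo(get_batches, n_gpus=2, n_steps=1000):
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
    model = MultiGpusCRNN(n_gpus, optimizer)  # CRNN kwargs omitted in this sketch
    # Soft placement lets ops without a GPU kernel fall back to the CPU.
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(n_steps):
            batches = get_batches(n_gpus)  # hypothetical data helper
            _, loss = sess.run([model.objective, model.loss],
                               feed_dict=model.feed_dict(batches))
            if step % 100 == 0:
                logging.info('step %d, loss %.4f', step, loss)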