# autodiff.py -- forked from dlsys-course/assignment1-2018
import numpy as np
class Node(object):
"""Node in a computation graph."""
def __init__(self):
"""Constructor, new node is indirectly created by Op object __call__ method.
Instance variables
------------------
self.inputs: the list of input nodes.
self.op: the associated op object,
e.g. add_op object if this node is created by adding two other nodes.
self.const_attr: the add or multiply constant,
e.g. self.const_attr=5 if this node is created by x+5.
self.name: node name for debugging purposes.
"""
self.inputs = []
self.op = None
self.const_attr = None
self.name = ""
def __add__(self, other):
"""Adding two nodes return a new node."""
if isinstance(other, Node):
new_node = add_op(self, other)
else:
# Add by a constant stores the constant in the new node's const_attr field.
# 'other' argument is a constant
new_node = add_byconst_op(self, other)
return new_node
def __mul__(self, other):
"""TODO: Your code here"""
if isinstance(other, Node):
new_node = mul_op(self, other)
else:
new_node = mul_byconst_op(self, other)
return new_node
# Allow left-hand-side add and multiply.
__radd__ = __add__
__rmul__ = __mul__
def __str__(self):
"""Allow print to display node name."""
return self.name
__repr__ = __str__
def Variable(name):
"""User defined variables in an expression.
e.g. x = Variable(name = "x")
"""
placeholder_node = placeholder_op()
placeholder_node.name = name
return placeholder_node
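
# A minimal usage sketch (illustrative only; the names x, y, z below are
# hypothetical): building an expression creates graph nodes lazily, and no
# numeric computation happens until an Executor runs the graph.
#
#   x = Variable(name="x")
#   y = x + 3    # AddByConstOp node, via Node.__add__
#   z = 2 * y    # MulByConstOp node, via Node.__rmul__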
class Op(object):
"""Op represents operations performed on nodes."""
def __call__(self):
"""Create a new node and associate the op object with the node.
Returns
-------
The new node object.
"""
new_node = Node()
new_node.op = self
return new_node
def compute(self, node, input_vals):
"""Given values of input nodes, compute the output value.
Parameters
----------
node: node that performs the compute.
input_vals: values of input nodes.
Returns
-------
An output value of the node.
"""
raise NotImplementedError
def gradient(self, node, output_grad):
"""Given value of output gradient, compute gradient contributions to each input node.
Parameters
----------
node: node that performs the gradient.
output_grad: value of output gradient summed from children nodes' contributions
Returns
-------
A list of gradient contributions to each input node respectively.
"""
raise NotImplementedError
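
# Note: compute() works on concrete NumPy values at execution time, while
# gradient() returns new symbolic nodes. The backward pass is therefore itself
# a computation graph that an Executor can evaluate (see gradients() below).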
class AddOp(Op):
"""Op to element-wise add two nodes."""
def __call__(self, node_A, node_B):
new_node = Op.__call__(self)
new_node.inputs = [node_A, node_B]
new_node.name = "(%s+%s)" % (node_A.name, node_B.name)
return new_node
def compute(self, node, input_vals):
"""Given values of two input nodes, return result of element-wise addition."""
assert len(input_vals) == 2
return input_vals[0] + input_vals[1]
def gradient(self, node, output_grad):
"""Given gradient of add node, return gradient contributions to each input."""
return [output_grad, output_grad]
class AddByConstOp(Op):
"""Op to element-wise add a nodes by a constant."""
def __call__(self, node_A, const_val):
new_node = Op.__call__(self)
new_node.const_attr = const_val
new_node.inputs = [node_A]
new_node.name = "(%s+%s)" % (node_A.name, str(const_val))
return new_node
def compute(self, node, input_vals):
"""Given values of input node, return result of element-wise addition."""
assert len(input_vals) == 1
return input_vals[0] + node.const_attr
def gradient(self, node, output_grad):
"""Given gradient of add node, return gradient contribution to input."""
return [output_grad]
class MulOp(Op):
"""Op to element-wise multiply two nodes."""
def __call__(self, node_A, node_B):
new_node = Op.__call__(self)
new_node.inputs = [node_A, node_B]
new_node.name = "(%s*%s)" % (node_A.name, node_B.name)
return new_node
def compute(self, node, input_vals):
"""Given values of two input nodes, return result of element-wise multiplication."""
"""TODO: Your code here"""
assert len(input_vals) == 2
return input_vals[0] * input_vals[1]
def gradient(self, node, output_grad):
"""Given gradient of multiply node, return gradient contributions to each input."""
"""TODO: Your code here"""
return [node.inputs[1] * output_grad, node.inputs[0] * output_grad]
class MulByConstOp(Op):
"""Op to element-wise multiply a nodes by a constant."""
def __call__(self, node_A, const_val):
new_node = Op.__call__(self)
new_node.const_attr = const_val
new_node.inputs = [node_A]
new_node.name = "(%s*%s)" % (node_A.name, str(const_val))
return new_node
def compute(self, node, input_vals):
"""Given values of input node, return result of element-wise multiplication."""
"""TODO: Your code here"""
assert len(input_vals) == 1
return input_vals[0] * node.const_attr
def gradient(self, node, output_grad):
"""Given gradient of multiplication node, return gradient contribution to input."""
"""TODO: Your code here"""
return [node.const_attr * output_grad]
class MatMulOp(Op):
"""Op to matrix multiply two nodes."""
def __call__(self, node_A, node_B, trans_A=False, trans_B=False):
"""Create a new node that is the result a matrix multiple of two input nodes.
Parameters
----------
node_A: lhs of matrix multiply
node_B: rhs of matrix multiply
trans_A: whether to transpose node_A
trans_B: whether to transpose node_B
Returns
-------
        Returns a node that is the result of a matrix multiply of two input nodes.
"""
new_node = Op.__call__(self)
new_node.matmul_attr_trans_A = trans_A
new_node.matmul_attr_trans_B = trans_B
new_node.inputs = [node_A, node_B]
new_node.name = "MatMul(%s,%s,%s,%s)" % (node_A.name, node_B.name, str(trans_A), str(trans_B))
return new_node
def compute(self, node, input_vals):
"""Given values of input nodes, return result of matrix multiplication."""
"""TODO: Your code here"""
assert len(input_vals) == 2
A = input_vals[0].T if node.matmul_attr_trans_A else input_vals[0]
B = input_vals[1].T if node.matmul_attr_trans_B else input_vals[1]
return np.matmul(A, B)
    def gradient(self, node, output_grad):
        """Given gradient of multiply node, return gradient contributions to each input.
        Useful formula: if Y=AB, then dA=dY B^T, dB=A^T dY.
        The transposed cases below follow by transposing these formulas.
        """
        tA, tB = node.matmul_attr_trans_A, node.matmul_attr_trans_B
        A, B, dY = node.inputs[0], node.inputs[1], output_grad
        if not tA and not tB:  # Y = A B: dA = dY B^T, dB = A^T dY
            return [matmul_op(dY, B, trans_B=True), matmul_op(A, dY, trans_A=True)]
        if tA and not tB:      # Y = A^T B: dA = B dY^T, dB = A dY
            return [matmul_op(B, dY, trans_B=True), matmul_op(A, dY)]
        if not tA and tB:      # Y = A B^T: dA = dY B, dB = dY^T A
            return [matmul_op(dY, B), matmul_op(dY, A, trans_A=True)]
        # Y = A^T B^T: dA = B^T dY^T, dB = dY^T A^T
        return [matmul_op(B, dY, trans_A=True, trans_B=True),
                matmul_op(dY, A, trans_A=True, trans_B=True)]
class PlaceholderOp(Op):
"""Op to feed value to a nodes."""
def __call__(self):
"""Creates a variable node."""
new_node = Op.__call__(self)
return new_node
def compute(self, node, input_vals):
"""No compute function since node value is fed directly in Executor."""
assert False, "placeholder values provided by feed_dict"
def gradient(self, node, output_grad):
"""No gradient function since node has no inputs."""
return None
class ZerosLikeOp(Op):
"""Op that represents a constant np.zeros_like."""
def __call__(self, node_A):
"""Creates a node that represents a np.zeros array of same shape as node_A."""
new_node = Op.__call__(self)
new_node.inputs = [node_A]
new_node.name = "Zeroslike(%s)" % node_A.name
return new_node
def compute(self, node, input_vals):
"""Returns zeros_like of the same shape as input."""
assert(isinstance(input_vals[0], np.ndarray))
return np.zeros(input_vals[0].shape)
def gradient(self, node, output_grad):
return [zeroslike_op(node.inputs[0])]
class OnesLikeOp(Op):
"""Op that represents a constant np.ones_like."""
def __call__(self, node_A):
"""Creates a node that represents a np.ones array of same shape as node_A."""
new_node = Op.__call__(self)
new_node.inputs = [node_A]
new_node.name = "Oneslike(%s)" % node_A.name
return new_node
def compute(self, node, input_vals):
"""Returns ones_like of the same shape as input."""
assert(isinstance(input_vals[0], np.ndarray))
return np.ones(input_vals[0].shape)
    def gradient(self, node, output_grad):
        # The ones output is constant w.r.t. the input, so the gradient is zero.
        return [zeroslike_op(node.inputs[0])]
# Create global singletons of operators.
add_op = AddOp()
mul_op = MulOp()
add_byconst_op = AddByConstOp()
mul_byconst_op = MulByConstOp()
matmul_op = MatMulOp()
placeholder_op = PlaceholderOp()
oneslike_op = OnesLikeOp()
zeroslike_op = ZerosLikeOp()
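
# Ops are stateless: all per-node state (inputs, const_attr, transpose flags)
# lives on the Node itself, so one shared instance of each Op suffices.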
class Executor(object):
"""Executor computes values for a given subset of nodes in a computation graph."""
def __init__(self, eval_node_list):
"""
Parameters
----------
eval_node_list: list of nodes whose values need to be computed.
"""
self.eval_node_list = eval_node_list
def run(self, feed_dict):
"""Computes values of nodes in eval_node_list given computation graph.
Parameters
----------
        feed_dict: dict mapping variable nodes to values supplied by the user.
Returns
-------
A list of values for nodes in eval_node_list.
"""
node_to_val_map = dict(feed_dict)
# Traverse graph in topological sort order and compute values for all nodes.
topo_order = find_topo_sort(self.eval_node_list)
"""TODO: Your code here"""
for node in topo_order:
if node in node_to_val_map:
continue
input_vals = [node_to_val_map[input] for input in node.inputs]
val = node.op.compute(node, input_vals)
node_to_val_map[node] = val
# Collect node values.
node_val_results = [node_to_val_map[node] for node in self.eval_node_list]
return node_val_results
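
# Illustrative sketch of forward evaluation (the names below are hypothetical):
#
#   x = Variable(name="x")
#   y = x * x + 1
#   executor = Executor([y])
#   y_val, = executor.run(feed_dict={x: np.ones(3)})   # -> array([2., 2., 2.])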
def gradients(output_node, node_list):
"""Take gradient of output node with respect to each node in node_list.
Parameters
----------
output_node: output node that we are taking derivative of.
node_list: list of nodes that we are taking derivative wrt.
Returns
-------
    A list of gradient nodes, one for each node in node_list respectively;
    evaluate them with an Executor to obtain numeric gradient values.
"""
# a map from node to a list of gradient contributions from each output node
node_to_output_grads_list = {}
# Special note on initializing gradient of output_node as oneslike_op(output_node):
# We are really taking a derivative of the scalar reduce_sum(output_node)
# instead of the vector output_node. But this is the common case for loss function.
node_to_output_grads_list[output_node] = [oneslike_op(output_node)]
# a map from node to the gradient of that node
node_to_output_grad = {}
# Traverse graph in reverse topological order given the output_node that we are taking gradient wrt.
reverse_topo_order = reversed(find_topo_sort([output_node]))
"""TODO: Your code here"""
for node in reverse_topo_order:
output_grads_list = node_to_output_grads_list[node]
output_grad = sum_node_list(output_grads_list)
node_to_output_grad[node] = output_grad
grads = node.op.gradient(node, output_grad)
for i in range(len(node.inputs)):
input_node = node.inputs[i]
if input_node not in node_to_output_grads_list:
node_to_output_grads_list[input_node] = []
node_to_output_grads_list[input_node].append(grads[i])
# Collect results for gradients requested.
grad_node_list = [node_to_output_grad[node] for node in node_list]
return grad_node_list
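
# Illustrative sketch of reverse-mode differentiation (names are hypothetical):
#
#   x = Variable(name="x")
#   y = x * x
#   grad_x, = gradients(y, [x])          # symbolic node for dy/dx
#   executor = Executor([y, grad_x])
#   y_val, grad_val = executor.run(feed_dict={x: 3 * np.ones(2)})
#   # y_val -> [9., 9.], grad_val -> [6., 6.]   (d(x*x)/dx = 2x)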
##############################
####### Helper Methods #######
##############################
def find_topo_sort(node_list):
"""Given a list of nodes, return a topological sort list of nodes ending in them.
A simple algorithm is to do a post-order DFS traversal on the given nodes,
going backwards based on input edges. Since a node is added to the ordering
after all its predecessors are traversed due to post-order DFS, we get a topological
sort.
"""
visited = set()
topo_order = []
for node in node_list:
topo_sort_dfs(node, visited, topo_order)
return topo_order
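
# E.g., for z = (x + y) * x, find_topo_sort([z]) returns an order in which
# x and y appear before (x+y), and (x+y) appears before z.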
def topo_sort_dfs(node, visited, topo_order):
"""Post-order DFS"""
if node in visited:
return
visited.add(node)
for n in node.inputs:
topo_sort_dfs(n, visited, topo_order)
topo_order.append(node)
def sum_node_list(node_list):
"""Custom sum function in order to avoid create redundant nodes in Python sum implementation."""
from operator import add
from functools import reduce
return reduce(add, node_list)
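
# A minimal smoke test (not part of the original assignment file) that
# exercises the forward pass, the matrix-multiply gradients, and the Executor
# end to end.
if __name__ == "__main__":
    x = Variable(name="x")
    w = Variable(name="w")
    y = matmul_op(w, x)                      # y = w @ x
    grad_w, grad_x = gradients(y, [w, x])
    executor = Executor([y, grad_w, grad_x])
    w_val = np.array([[1., 2.], [3., 4.]])
    x_val = np.array([[5.], [6.]])
    y_val, gw, gx = executor.run(feed_dict={w: w_val, x: x_val})
    # With dY initialized to ones: grad_w = dY @ x^T and grad_x = w^T @ dY.
    assert np.array_equal(y_val, np.matmul(w_val, x_val))
    assert np.array_equal(gw, np.matmul(np.ones_like(y_val), x_val.T))
    assert np.array_equal(gx, np.matmul(w_val.T, np.ones_like(y_val)))
    print("smoke test passed")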