# gradcheck.py
import torch
from torch._six import container_abcs, istuple
import torch.testing
from itertools import product
import warnings


def zero_gradients(x):
    if isinstance(x, torch.Tensor):
        if x.grad is not None:
            x.grad.detach_()
            x.grad.data.zero_()
    elif isinstance(x, container_abcs.Iterable):
        for elem in x:
            zero_gradients(elem)


def make_jacobian(input, num_out):
    if isinstance(input, torch.Tensor):
        if not input.is_floating_point():
            return None
        if not input.requires_grad:
            return None
        return torch.zeros(input.nelement(), num_out, dtype=input.dtype)
    elif isinstance(input, container_abcs.Iterable) and not isinstance(input, str):
        jacobians = list(filter(
            lambda x: x is not None, (make_jacobian(elem, num_out) for elem in input)))
        if not jacobians:
            return None
        return type(input)(jacobians)
    else:
        return None


def iter_tensors(x, only_requiring_grad=False):
    if isinstance(x, torch.Tensor):
        if x.requires_grad or not only_requiring_grad:
            yield x
    elif isinstance(x, container_abcs.Iterable) and not isinstance(x, str):
        for elem in x:
            for result in iter_tensors(elem, only_requiring_grad):
                yield result


def get_numerical_jacobian(fn, input, target=None, eps=1e-3):
    """
    input: input to `fn`
    target: the Tensors wrt whom Jacobians are calculated (default=`input`)

    Note that `target` may not even be part of `input` to `fn`, so be
    **very careful** in this function not to clone `target`.
    """
    if target is None:
        target = input
    output_size = fn(input).numel()
    jacobian = make_jacobian(target, output_size)

    # It's much easier to iterate over flattened lists of tensors.
    # These are references to the same objects in jacobian, so any changes
    # will be reflected in it as well.
    x_tensors = [t for t in iter_tensors(target, True)]
    j_tensors = [t for t in iter_tensors(jacobian)]

    # TODO: compare structure
    for x_tensor, d_tensor in zip(x_tensors, j_tensors):
        # need .data here to get around the version check, because without .data
        # the following code would bump the version counter without changing content
        if x_tensor.is_sparse:
            def get_stride(size):
                dim = len(size)
                tmp = 1
                stride = [0] * dim
                for i in reversed(range(dim)):
                    stride[i] = tmp
                    tmp *= size[i]
                return stride

            x_nnz = x_tensor._nnz()
            x_size = list(x_tensor.size())
            x_indices = x_tensor._indices().t()
            x_values = x_tensor._values().data
            x_stride = get_stride(x_size)

            for i in range(x_nnz):
                x_value = x_values[i]
                for x_idx in product(*[range(m) for m in x_values.size()[1:]]):
                    indices = x_indices[i].tolist() + list(x_idx)
                    d_idx = sum(indices[k] * x_stride[k] for k in range(len(x_size)))
                    orig = x_value[x_idx].item()
                    x_value[x_idx] = orig - eps
                    outa = fn(input).clone()
                    x_value[x_idx] = orig + eps
                    outb = fn(input).clone()
                    x_value[x_idx] = orig
                    r = (outb - outa) / (2 * eps)
                    d_tensor[d_idx] = r.detach().reshape(-1)
        elif x_tensor.layout == torch._mkldnn:
            if len(input) != 1:
                raise ValueError('gradcheck currently only supports functions with 1 input, but got: ',
                                 len(input))
            x_tensor = x_tensor.data
            for d_idx, x_idx in enumerate(product(*[range(m) for m in x_tensor.size()])):
                # this is really inefficient, but without indexing implemented, there's
                # not really a better way than converting back and forth
                x_tensor_dense = x_tensor.to_dense()
                orig = x_tensor_dense[x_idx].item()
                x_tensor_dense[x_idx] = orig - eps
                x_tensor_mkl = x_tensor_dense.to_mkldnn()
                outa = fn([x_tensor_mkl])
                x_tensor_dense[x_idx] = orig + eps
                x_tensor_mkl = x_tensor_dense.to_mkldnn()
                outb = fn([x_tensor_mkl])
                r = (outb - outa) / (2 * eps)
                d_tensor[d_idx] = r.detach().reshape(-1)
        else:
            x_tensor = x_tensor.data
            for d_idx, x_idx in enumerate(product(*[range(m) for m in x_tensor.size()])):
                orig = x_tensor[x_idx].item()
                x_tensor[x_idx] = orig - eps
                outa = fn(input).clone()
                x_tensor[x_idx] = orig + eps
                outb = fn(input).clone()
                x_tensor[x_idx] = orig
                r = (outb - outa) / (2 * eps)
                d_tensor[d_idx] = r.detach().reshape(-1)

    return jacobian
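

# Illustrative usage sketch (not part of the original module): drives
# get_numerical_jacobian on a hypothetical single-input function. Note that
# `fn` receives the whole input tuple, mirroring how gradcheck calls it.
def _example_numerical_jacobian():
    x = torch.randn(2, 3, dtype=torch.double, requires_grad=True)

    def fn(inputs):
        # `inputs` is the same tuple that is passed to get_numerical_jacobian
        return (inputs[0] ** 2).sum()

    # Returns a tuple with one (x.nelement(), output.numel()) matrix per input
    # that requires grad, filled in by central finite differences.
    return get_numerical_jacobian(fn, (x,), eps=1e-6)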


def get_analytical_jacobian(input, output):
    # it is easier to call to_dense() on the sparse output than
    # to modify the analytical jacobian
    if output.is_sparse:
        raise ValueError('Sparse output is not supported at gradcheck yet. '
                         'Please call to_dense() on the output of fn for gradcheck.')
    if output.layout == torch._mkldnn:
        raise ValueError('MKLDNN output is not supported at gradcheck yet. '
                         'Please call to_dense() on the output of fn for gradcheck.')
    diff_input_list = list(iter_tensors(input, True))
    jacobian = make_jacobian(input, output.numel())
    jacobian_reentrant = make_jacobian(input, output.numel())
    grad_output = torch.zeros_like(output)
    flat_grad_output = grad_output.view(-1)
    reentrant = True
    correct_grad_sizes = True

    for i in range(flat_grad_output.numel()):
        flat_grad_output.zero_()
        flat_grad_output[i] = 1
        for jacobian_c in (jacobian, jacobian_reentrant):
            grads_input = torch.autograd.grad(output, diff_input_list, grad_output,
                                              retain_graph=True, allow_unused=True)
            for jacobian_x, d_x, x in zip(jacobian_c, grads_input, diff_input_list):
                if d_x is not None and d_x.size() != x.size():
                    correct_grad_sizes = False
                elif jacobian_x.numel() != 0:
                    if d_x is None:
                        jacobian_x[:, i].zero_()
                    else:
                        d_x_dense = d_x.to_dense() if not d_x.layout == torch.strided else d_x
                        assert jacobian_x[:, i].numel() == d_x_dense.numel()
                        jacobian_x[:, i] = d_x_dense.contiguous().view(-1)

    for jacobian_x, jacobian_reentrant_x in zip(jacobian, jacobian_reentrant):
        if jacobian_x.numel() != 0 and (jacobian_x - jacobian_reentrant_x).abs().max() != 0:
            reentrant = False

    return jacobian, reentrant, correct_grad_sizes


def _as_tuple(x):
    if istuple(x):
        return x
    elif isinstance(x, list):
        return tuple(x)
    else:
        return x,


def _differentiable_outputs(x):
    return tuple(o for o in _as_tuple(x) if o.requires_grad)


def gradcheck(func, inputs, eps=1e-6, atol=1e-5, rtol=1e-3, raise_exception=True, check_sparse_nnz=False):
    r"""Check gradients computed via small finite differences against analytical
    gradients w.r.t. tensors in :attr:`inputs` that are of floating point type
    and with ``requires_grad=True``.

    The check between numerical and analytical gradients uses :func:`~torch.allclose`.

    .. note::
        The default values are designed for :attr:`input` of double precision.
        This check will likely fail if :attr:`input` is of less precision, e.g.,
        ``FloatTensor``.

    .. warning::
        If any checked tensor in :attr:`input` has overlapping memory, i.e.,
        different indices pointing to the same memory address (e.g., from
        :func:`torch.expand`), this check will likely fail because the numerical
        gradients computed by point perturbation at such indices will change
        values at all other indices that share the same memory address.

    Args:
        func (function): a Python function that takes Tensor inputs and returns
            a Tensor or a tuple of Tensors
        inputs (tuple of Tensor or Tensor): inputs to the function
        eps (float, optional): perturbation for finite differences
        atol (float, optional): absolute tolerance
        rtol (float, optional): relative tolerance
        raise_exception (bool, optional): whether to raise an exception if
            the check fails. The exception gives more information about the
            exact nature of the failure, which is helpful when debugging gradchecks.
        check_sparse_nnz (bool, optional): if ``True``, gradcheck allows SparseTensor
            inputs, and for any SparseTensor input, gradcheck will perform its check
            at nnz positions only.

    Returns:
        True if all differences satisfy allclose condition
    """
    def fail_test(msg):
        if raise_exception:
            raise RuntimeError(msg)
        return False

    tupled_inputs = _as_tuple(inputs)

    if any(t.is_sparse for t in tupled_inputs if isinstance(t, torch.Tensor)) and not check_sparse_nnz:
        return fail_test('gradcheck expects all tensor inputs to be dense when check_sparse_nnz is set to False.')

    # Make sure that gradients are saved for all inputs
    any_input_requiring_grad = False
    some_input_not_requiring_grad = False
    for inp in tupled_inputs:
        if isinstance(inp, torch.Tensor):
            if inp.requires_grad:
                if inp.dtype != torch.float64:
                    warnings.warn(
                        'At least one of the inputs that requires gradient '
                        'is not of double precision floating point. '
                        'This check will likely fail unless all inputs are '
                        'of double precision floating point. ')
                any_input_requiring_grad = True
            else:
                some_input_not_requiring_grad = True
            inp.retain_grad()
    if not any_input_requiring_grad:
        raise ValueError(
            'gradcheck expects at least one input tensor to require gradient, '
            'but none of them has requires_grad=True.')
    if some_input_not_requiring_grad:
        raise ValueError(
            'gradcheck expects that if at least one input tensor requires gradient, '
            'then all other inputs should also have requires_grad=True.')

    func_out = func(*tupled_inputs)
    output = _differentiable_outputs(func_out)

    if not output:
        for i, o in enumerate(func_out):
            def fn(input):
                return _as_tuple(func(*input))[i]
            numerical = get_numerical_jacobian(fn, tupled_inputs, eps=eps)
            for n in numerical:
                if len(torch.nonzero(n)) > 0:
                    return fail_test('Numerical gradient for function expected to be zero')
        return True

    for i, o in enumerate(output):
        if not o.requires_grad:
            continue

        def fn(input):
            return _as_tuple(func(*input))[i]

        analytical, reentrant, correct_grad_sizes = get_analytical_jacobian(tupled_inputs, o)
        numerical = get_numerical_jacobian(fn, tupled_inputs, eps=eps)

        if not correct_grad_sizes:
            return fail_test('Analytical gradient has incorrect size')

        for j, (a, n) in enumerate(zip(analytical, numerical)):
            if a.numel() != 0 or n.numel() != 0:
                if not torch.allclose(a, n, rtol, atol):
                    return fail_test('Jacobian mismatch for output %d with respect to input %d,\n'
                                     'numerical:%s\nanalytical:%s\n' % (i, j, n, a))

        if not reentrant:
            return fail_test('Backward is not reentrant, i.e., running backward with same '
                             'input and grad_output multiple times gives different values, '
                             'although analytical gradient matches numerical gradient')

    # check if the backward multiplies by grad_output
    output = _differentiable_outputs(func(*tupled_inputs))
    if any([o.requires_grad for o in output]):
        diff_input_list = list(iter_tensors(tupled_inputs, True))
        if not diff_input_list:
            raise RuntimeError("no Tensors requiring grad found in input")
        grads_input = torch.autograd.grad(output, diff_input_list, [torch.zeros_like(o) for o in output],
                                          allow_unused=True)
        for gi, i in zip(grads_input, diff_input_list):
            if gi is None:
                continue
            if isinstance(gi, torch.Tensor) and gi.layout != torch.strided:
                if gi.layout != i.layout:
                    return fail_test('grad is incorrect layout')
                if gi.layout == torch.sparse_coo:
                    if gi.sparse_dim() != i.sparse_dim():
                        return fail_test('grad is sparse tensor, but has incorrect sparse_dim')
                    if gi.dense_dim() != i.dense_dim():
                        return fail_test('grad is sparse tensor, but has incorrect dense_dim')
                gi = gi.to_dense()
                i = i.to_dense()
            if not gi.eq(0).all():
                return fail_test('backward not multiplied by grad_output')
            if gi.type() != i.type():
                return fail_test("grad is incorrect type")
            if gi.size() != i.size():
                return fail_test('grad is incorrect size')

    return True
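

# Illustrative usage sketch (not part of the original module): a minimal
# gradcheck call on a hypothetical elementwise function. Double-precision
# inputs are used so the default tolerances apply; on a mismatch gradcheck
# raises a RuntimeError (or returns False when raise_exception=False).
def _example_gradcheck():
    def fn(x, y):
        return (x * y + x.sin()).sum()

    x = torch.randn(4, 5, dtype=torch.double, requires_grad=True)
    y = torch.randn(4, 5, dtype=torch.double, requires_grad=True)
    return gradcheck(fn, (x, y), eps=1e-6, atol=1e-4)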


def gradgradcheck(func, inputs, grad_outputs=None, eps=1e-6, atol=1e-5, rtol=1e-3,
                  gen_non_contig_grad_outputs=False, raise_exception=True):
    r"""Check gradients of gradients computed via small finite differences
    against analytical gradients w.r.t. tensors in :attr:`inputs` and
    :attr:`grad_outputs` that are of floating point type and with
    ``requires_grad=True``.

    This function checks that backpropagating through the gradients computed
    with the given :attr:`grad_outputs` is correct.

    The check between numerical and analytical gradients uses :func:`~torch.allclose`.

    .. note::
        The default values are designed for :attr:`input` and
        :attr:`grad_outputs` of double precision. This check will likely fail if
        they are of less precision, e.g., ``FloatTensor``.

    .. warning::
        If any checked tensor in :attr:`input` and :attr:`grad_outputs` has
        overlapping memory, i.e., different indices pointing to the same memory
        address (e.g., from :func:`torch.expand`), this check will likely fail
        because the numerical gradients computed by point perturbation at such
        indices will change values at all other indices that share the same
        memory address.

    Args:
        func (function): a Python function that takes Tensor inputs and returns
            a Tensor or a tuple of Tensors
        inputs (tuple of Tensor or Tensor): inputs to the function
        grad_outputs (tuple of Tensor or Tensor, optional): The gradients with
            respect to the function's outputs.
        eps (float, optional): perturbation for finite differences
        atol (float, optional): absolute tolerance
        rtol (float, optional): relative tolerance
        gen_non_contig_grad_outputs (bool, optional): if :attr:`grad_outputs` is
            ``None`` and :attr:`gen_non_contig_grad_outputs` is ``True``, the
            randomly generated gradient outputs are made to be noncontiguous
        raise_exception (bool, optional): whether to raise an exception if
            the check fails. The exception gives more information about the
            exact nature of the failure, which is helpful when debugging gradchecks.

    Returns:
        True if all differences satisfy allclose condition
    """
    tupled_inputs = _as_tuple(inputs)

    if grad_outputs is None:
        # If grad_outputs is not specified, create random Tensors of the same
        # shape, type, and device as the outputs
        def randn_like(x):
            y = torch.testing.randn_like(x if x.is_floating_point() else x.double())
            if gen_non_contig_grad_outputs:
                y = torch.testing.make_non_contiguous(y)
            return y.requires_grad_()
        outputs = _as_tuple(func(*tupled_inputs))
        tupled_grad_outputs = tuple(randn_like(x) for x in outputs)
    else:
        tupled_grad_outputs = _as_tuple(grad_outputs)

    num_outputs = len(tupled_grad_outputs)

    def new_func(*args):
        input_args = args[:-num_outputs]
        grad_outputs = args[-num_outputs:]
        outputs = _differentiable_outputs(func(*input_args))
        input_args = tuple(x for x in input_args if isinstance(x, torch.Tensor) and x.requires_grad)
        grad_inputs = torch.autograd.grad(outputs, input_args, grad_outputs, create_graph=True)
        return grad_inputs

    return gradcheck(new_func, tupled_inputs + tupled_grad_outputs, eps, atol, rtol, raise_exception)
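

# Illustrative usage sketch (not part of the original module): a gradgradcheck
# call on a hypothetical function with a nontrivial second derivative. When
# grad_outputs is left as None, random double-precision gradient outputs are
# generated internally.
if __name__ == '__main__':
    def _fn(x):
        return (x ** 3).sum()

    _x = torch.randn(3, dtype=torch.double, requires_grad=True)
    assert gradgradcheck(_fn, (_x,), eps=1e-6, atol=1e-4)
    print('gradgradcheck passed for the example function')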