You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Describe the problem
2 out of 4 tests failed with differences exceeding the margin of error.
Error message
============================ test session starts =============================
platform linux -- Python 3.7.10, pytest-6.2.5, py-1.10.0, pluggy-1.0.0
rootdir: /home/minghanz/repos/torch-batch-svd
plugins: anyio-3.3.0
collected 4 items
test.py F..F [100%]
================================== FAILURES ==================================
_________________________________ test_float _________________________________
def test_float():
torch.manual_seed(0)
a = torch.randn(N, H, W).cuda()
b = a.clone()
a.requires_grad = True
b.requires_grad = True
U, S, V = svd(a)
loss = U.sum() + S.sum() + V.sum()
loss.backward()
u, s, v = torch.svd(b[0], some=True, compute_uv=True)
loss0 = u.sum() + s.sum() + v.sum()
loss0.backward()
# eigenvectors are only precise up to sign
testing.assert_allclose(U[0].abs(), u.abs())
testing.assert_allclose(S[0].abs(), s.abs())
testing.assert_allclose(V[0].abs(), v.abs())
> testing.assert_allclose(a, U @ torch.diag_embed(S) @ V.transpose(-2, -1))
test.py:28:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
actual = tensor([[[-1.1258, -1.1524, -0.2506],
[-0.4339, 0.8487, 0.6920],
[-0.3160, -2.1152, 0.3223],
...77],
[ 1.4223, 0.2985, 0.0924],
[-1.0208, 0.3279, 0.0111]]], device='cuda:0', requires_grad=True)
expected = tensor([[[-1.1258, -1.1528, -0.2509],
[-0.4338, 0.8488, 0.6922],
[-0.3161, -2.1155, 0.3225],
....4220, 0.2986, 0.0924],
[-1.0208, 0.3279, 0.0110]]], device='cuda:0',
grad_fn=<UnsafeViewBackward>)
rtol = 0.0001, atol = 1e-05, equal_nan = True
msg = 'With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error...difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).'
def assert_allclose(actual, expected, rtol=None, atol=None, equal_nan=True, msg='') -> None:
if not isinstance(actual, torch.Tensor):
actual = torch.tensor(actual)
if not isinstance(expected, torch.Tensor):
expected = torch.tensor(expected, dtype=actual.dtype)
if expected.shape != actual.shape:
raise AssertionError("expected tensor shape {0} doesn't match with actual tensor "
"shape {1}!".format(expected.shape, actual.shape))
if rtol is None or atol is None:
if rtol is not None or atol is not None:
raise ValueError("rtol and atol must both be specified or both be unspecified")
rtol, atol = _get_default_tolerance(actual, expected)
result, debug_msg = _compare_tensors_internal(actual, expected,
rtol=rtol, atol=atol,
equal_nan=equal_nan)
if result:
return
if msg is None or msg == '':
msg = debug_msg
> raise AssertionError(msg)
E AssertionError: With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).
../../../anaconda3/envs/pytorch3d/lib/python3.7/site-packages/torch/testing/_core.py:270: AssertionError
_____________________________ test_multiple_gpus _____________________________
def test_multiple_gpus():
num_gpus = torch.cuda.device_count()
for gpu_idx in range(num_gpus):
device = torch.device('cuda:{}'.format(gpu_idx))
torch.manual_seed(0)
a = torch.randn(N, H, W).to(device)
b = a.clone()
a.requires_grad = True
b.requires_grad = True
U, S, V = svd(a)
loss = U.sum() + S.sum() + V.sum()
loss.backward()
u, s, v = torch.svd(b[0], some=True, compute_uv=True)
loss0 = u.sum() + s.sum() + v.sum()
loss0.backward()
# eigenvectors are only precise up to sign
testing.assert_allclose(U[0].abs(), u.abs())
testing.assert_allclose(S[0].abs(), s.abs())
testing.assert_allclose(V[0].abs(), v.abs())
testing.assert_allclose(a,
> U @ torch.diag_embed(S) @ V.transpose(-2, -1))
test.py:104:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
actual = tensor([[[-1.1258, -1.1524, -0.2506],
[-0.4339, 0.8487, 0.6920],
[-0.3160, -2.1152, 0.3223],
...77],
[ 1.4223, 0.2985, 0.0924],
[-1.0208, 0.3279, 0.0111]]], device='cuda:0', requires_grad=True)
expected = tensor([[[-1.1258, -1.1528, -0.2509],
[-0.4338, 0.8488, 0.6922],
[-0.3161, -2.1155, 0.3225],
....4220, 0.2986, 0.0924],
[-1.0208, 0.3279, 0.0110]]], device='cuda:0',
grad_fn=<UnsafeViewBackward>)
rtol = 0.0001, atol = 1e-05, equal_nan = True
msg = 'With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error...difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).'
def assert_allclose(actual, expected, rtol=None, atol=None, equal_nan=True, msg='') -> None:
if not isinstance(actual, torch.Tensor):
actual = torch.tensor(actual)
if not isinstance(expected, torch.Tensor):
expected = torch.tensor(expected, dtype=actual.dtype)
if expected.shape != actual.shape:
raise AssertionError("expected tensor shape {0} doesn't match with actual tensor "
"shape {1}!".format(expected.shape, actual.shape))
if rtol is None or atol is None:
if rtol is not None or atol is not None:
raise ValueError("rtol and atol must both be specified or both be unspecified")
rtol, atol = _get_default_tolerance(actual, expected)
result, debug_msg = _compare_tensors_internal(actual, expected,
rtol=rtol, atol=atol,
equal_nan=equal_nan)
if result:
return
if msg is None or msg == '':
msg = debug_msg
> raise AssertionError(msg)
E AssertionError: With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).
../../../anaconda3/envs/pytorch3d/lib/python3.7/site-packages/torch/testing/_core.py:270: AssertionError
========================== short test summary info ===========================
FAILED test.py::test_float - AssertionError: With rtol=0.0001 and atol=1e-0...
FAILED test.py::test_multiple_gpus - AssertionError: With rtol=0.0001 and a...
======================== 2 failed, 2 passed in 2.45s =========================
Environments
OS: ubuntu 20.04.3 LTS
CUDA version: 11.1.74 (reported by conda list cudatoolkit)
PyTorch version: 1.9.0
Python version: 3.7.10
How to Reproduce
Just run `python -m pytest test.py`.
The text was updated successfully, but these errors were encountered:
However, I recently found that PyTorch float32 matrix multiplication on CUDA 11 gives slightly different results depending on whether the operation runs on the CPU or on the GPU. The CPU result is consistent with the NumPy multiplication result, so I suspect that the problem is on the PyTorch side. I am just providing this information from my side; I'd appreciate it if you could confirm what the problem actually is.
Describe the problem
2 out of 4 tests failed with differences exceeding the margin of error.
Error message
Environments
(reported by `conda list cudatoolkit`)
How to Reproduce
Just run `python -m pytest test.py`.
The text was updated successfully, but these errors were encountered: