|
| 1 | +import pytest |
| 2 | +import torch |
| 3 | + |
| 4 | +import ntops.torch |
| 5 | +from tests.skippers import skip_if_cuda_not_available |
| 6 | + |
| 7 | + |
| 8 | +def _torch_rotary_position_embedding(input, sin_table, cos_table, interleaved=True): |
| 9 | + batch_size, seq_len, num_heads, emb_dim = input.shape |
| 10 | + |
| 11 | + assert emb_dim % 2 == 0, "The embedding dimension must be even." |
| 12 | + |
| 13 | + sin_table = sin_table[None, :, None, :] |
| 14 | + cos_table = cos_table[None, :, None, :] |
| 15 | + |
| 16 | + if interleaved: |
| 17 | + pair_wise_input = input.view(batch_size, seq_len, num_heads, emb_dim // 2, 2) |
| 18 | + input_0, input_1 = pair_wise_input[..., 0], pair_wise_input[..., 1] |
| 19 | + input_0_rotated = input_0 * cos_table - input_1 * sin_table |
| 20 | + input_1_rotated = input_0 * sin_table + input_1 * cos_table |
| 21 | + |
| 22 | + return torch.stack((input_0_rotated, input_1_rotated), dim=-1).view(input.shape) |
| 23 | + else: |
| 24 | + input_0 = input[..., : input.shape[-1] // 2] |
| 25 | + input_1 = input[..., input.shape[-1] // 2 :] |
| 26 | + input_0_rotated = input_0 * cos_table - input_1 * sin_table |
| 27 | + input_1_rotated = input_0 * sin_table + input_1 * cos_table |
| 28 | + |
| 29 | + return torch.cat((input_0_rotated, input_1_rotated), dim=-1) |
| 30 | + |
| 31 | + |
| 32 | +def _generate_sin_and_cos_tables( |
| 33 | + seq_len, emb_dim, base=10000, dtype=torch.float32, device="cuda" |
| 34 | +): |
| 35 | + assert emb_dim % 2 == 0, "The embedding dimension must be even." |
| 36 | + |
| 37 | + theta = base ** ( |
| 38 | + -2 * (torch.arange(emb_dim // 2, dtype=dtype, device=device) / emb_dim) |
| 39 | + ) |
| 40 | + |
| 41 | + positions = torch.arange(seq_len, dtype=dtype, device=device).unsqueeze(1) |
| 42 | + sin_table = torch.sin(positions * theta) |
| 43 | + cos_table = torch.cos(positions * theta) |
| 44 | + |
| 45 | + return sin_table, cos_table |
| 46 | + |
| 47 | + |
@skip_if_cuda_not_available
@pytest.mark.parametrize(
    "dtype, atol, rtol", ((torch.float32, 0.001, 0), (torch.float16, 0.001, 0.001))
)
@pytest.mark.parametrize("inplace", (False, True))
@pytest.mark.parametrize("interleaved", (False, True))
@pytest.mark.parametrize("emb_dim", (32, 64))
@pytest.mark.parametrize("num_heads", (1, 8))
@pytest.mark.parametrize("seq_len", (1, 128))
@pytest.mark.parametrize("batch_size", (1, 4))
def test_cuda(
    batch_size, seq_len, num_heads, emb_dim, interleaved, inplace, dtype, atol, rtol
):
    """Compare the ntops RoPE kernel against the pure-PyTorch reference."""
    device = "cuda"

    input = torch.randn(
        batch_size, seq_len, num_heads, emb_dim, dtype=dtype, device=device
    )
    sin_table, cos_table = _generate_sin_and_cos_tables(
        seq_len, emb_dim, dtype=dtype, device=device
    )

    # When inplace=True the kernel mutates its argument, so hand it a clone
    # and keep the original untouched for the reference computation.
    kernel_input = input.clone() if inplace else input
    ninetoothed_output = ntops.torch.rotary_position_embedding(
        kernel_input,
        sin_table,
        cos_table,
        interleaved=interleaved,
        inplace=inplace,
    )
    reference_output = _torch_rotary_position_embedding(
        input, sin_table, cos_table, interleaved=interleaved
    )

    assert torch.allclose(ninetoothed_output, reference_output, atol=atol, rtol=rtol)
0 commit comments