
Misaligned address error when using std::aligned_storage #429

Closed
Badhi opened this issue Sep 11, 2023 · 4 comments
Labels
needs triage Issues that require the team's attention

Comments


Badhi commented Sep 11, 2023

I'm trying to use cuda::barrier with a completion function, as shown in this example. Following is a modified version of the given example, and it errors out with a misaligned address.

#include <cuda/barrier>
#include <cooperative_groups.h>
#include <cstdint>      // uint32_t
#include <cstdio>       // fprintf, printf
#include <cstdlib>      // exit, EXIT_FAILURE
#include <new>          // placement new
#include <type_traits>  // std::aligned_storage
#include <functional>
namespace cg = cooperative_groups;
#define CHECK_CUDA(call)                                                  \
    {                                                                     \
        cudaError_t err = call;                                           \
        if (cudaSuccess != err) {                                         \
            fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err));         \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    }


struct Comp {
    __device__ void operator()() {
        int sum = 0;
        for (int i = 0; i < 128; ++i) sum += smem[i];
        *acc += sum;
    }
    int* acc;
    uint32_t* smem;
};

using barrier_t = cuda::barrier<cuda::thread_scope_block, Comp>;
__global__ void psum(int* data, int n, int* acc) {
  auto block = cg::this_thread_block();

  __shared__ uint32_t smem[128];
  Comp c{acc, smem};
  using barrier_t = cuda::barrier<cuda::thread_scope_block, Comp>;
  __shared__ std::aligned_storage<sizeof(barrier_t), alignof(barrier_t)>
      bar_storage;
  __shared__ unsigned int blockQueuePos;

  // Initialize barrier:
  barrier_t* bar = (barrier_t*)&bar_storage;
  if (cg::this_thread_block().thread_rank() == 0) {
    new (bar) barrier_t{cg::this_thread_block().size(), c};
    printf("block %d, bar :%p\n", blockIdx.x, bar);
    printf("block %d, sizeof bar :%lu\n", blockIdx.x, sizeof(barrier_t));
    printf("block %d, blockQueuePos: %p\n", blockIdx.x, &blockQueuePos);
  }
  cg::this_thread_block().sync();

  for (int i = 0; i < 10; i++)
    bar->arrive_and_wait();
}

int main() {
    int* data;
    int* acc;
    cudaMallocManaged(&data, 128 * sizeof(int));
    cudaMallocManaged(&acc, 1* sizeof(int));
    psum<<<128, 128>>>(data, 128, acc);
    CHECK_CUDA(cudaDeviceSynchronize());
}

Following is the stack trace.

For CUDA 12.2:

#0  0x00007ffdcf7d25f0 in cuda::std::__4::__atomic_base_storage<long, cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> > >::__atomic_base_storage (this=<unavailable>, __a=...) at /cuda-12.2/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/atomic:1295
#1  0x00007ffdcf7d1960 in cuda::std::__4::__atomic_base_storage<long, cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> > >::__atomic_base_storage(cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> >&&) ()
#2  0x00007ffdcf7d0d90 in cuda::std::__4::__atomic_base_core<long, false, cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> > >::__atomic_base_core (this=0x7ffe00000604, __a=...) at /cuda-12.2/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/atomic:1308
#3  0x00007ffdcf7d0160 in cuda::std::__4::__atomic_base_core<long, false, cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> > >::__atomic_base_core(cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> >&&) ()
#4  0x00007ffdcf7cf590 in cuda::std::__4::__atomic_base_arithmetic<long, false, cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> > >::__atomic_base_arithmetic (this=0x7ffe00000604, __a=...) at /cuda-12.2/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/atomic:1558
#5  0x00007ffdcf7ce960 in cuda::std::__4::__atomic_base_arithmetic<long, false, cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> > >::__atomic_base_arithmetic(cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> >&&) ()
#6  0x00007ffdcf7cdd90 in cuda::std::__4::__atomic_base_bitwise<long, false, cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> > >::__atomic_base_bitwise (this=0x7ffe00000604, __a=...) at /cuda-12.2/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/atomic:1660
#7  0x00007ffdcf7cd160 in cuda::std::__4::__atomic_base_bitwise<long, false, cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> > >::__atomic_base_bitwise(cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> >&&) ()
#8  0x00007ffdcf7cc550 in cuda::std::__4::__atomic_base<long, 2, cuda::std::__4::__atomic_base_bitwise<long, false, cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> > > >::__atomic_base (this=0x7ffe00000604, __a=<error reading variable: Unknown storage specifier (read) 0x10000>)
    at /cuda-12.2/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/atomic:1758
#9  0x00007ffdcf7cb2a0 in cuda::std::__4::__barrier_base<Comp, 2>::__barrier_base (this=0x7ffe00000604, __expected=128, __completion=...)
    at /cuda-12.2/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/barrier:309
#10 0x00007ffdcf7ca390 in cuda::std::__4::__barrier_base<Comp, 2>::__barrier_base(long, Comp) ()
#11 0x00007ffdcf7c9370 in cuda::__4::barrier<(cuda::std::__4::__detail::thread_scope)2, Comp>::barrier (this=0x7ffe00000604, __expected=128, __completion=...)
    at /cuda-12.2/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/barrier.h:59
#12 0x00007ffdcf76eb70 in psum<<<(128,1,1),(128,1,1)>>> (data=0x7ffdb0000000, n=128, acc=0x7ffdb0000200) at ./test_simple.cu:41

For CUDA 11.5:

#0  cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false>::__cxx_atomic_base_heterogeneous_impl (this=0x2aaab1000204, __value=128) at /cuda-11.5/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h:151
#1  0x0000000000da0ae0 in cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false>::__cxx_atomic_base_heterogeneous_impl(long) ()
#2  0x0000000000ddd320 in cuda::std::__4::__cxx_atomic_impl<long, 2, cuda::std::__4::__detail::__cxx_atomic_base_heterogeneous_impl<long, 2, false> >::__cxx_atomic_impl (this=0x2aaab1000204, value=128) at /cuda-11.5/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/atomic:1064
#3  0x0000000000de0820 in cuda::std::__4::__atomic_base<long, 2, false>::__atomic_base (this=0x2aaab1000204, __d=128) at /cuda-11.5/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/atomic:1363
#4  0x0000000000de13e0 in cuda::std::__4::__atomic_base<long, 2, false>::__atomic_base(long) ()
#5  0x0000000000de2120 in cuda::std::__4::__atomic_base<long, 2, true>::__atomic_base (this=0x2aaab1000204, __d=128) at /cuda-11.5/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/atomic:1397
#6  0x0000000000de8000 in cuda::std::__4::__barrier_base<Comp, 2>::__barrier_base (this=0x2aaab1000204, __expected=128, __completion=...) at /cuda-11.5/bin/../targets/x86_64-linux/include/cuda/std/detail/libcxx/include/barrier:219
#7  0x0000000000da7b00 in cuda::std::__4::__barrier_base<Comp, 2>::__barrier_base(long, Comp) ()
#8  0x0000000000d9dce0 in cuda::__4::barrier<(cuda::std::__4::__detail::thread_scope)2, Comp>::barrier (this=0x2aaab1000204, __expected=128, __completion=...) at /cuda-11.5/bin/../targets/x86_64-linux/include/cuda/std/barrier:53
#9  0x0000000000dd8680 in psum<<<(128,1,1),(128,1,1)>>> (data=0x2aaada000000, n=128, acc=0x2aaada000200) at ./test_simple.cu:41
@jrhemstad jrhemstad transferred this issue from NVIDIA/libcudacxx Sep 11, 2023
@github-actions github-actions bot added the needs triage Issues that require the team's attention label Sep 11, 2023
@github-actions
Contributor

Hi @Badhi!

Thanks for submitting this issue - the CCCL team has been notified and we'll get back to you as soon as we can!
In the meantime, feel free to add any relevant information to this issue.

@jrhemstad
Collaborator

Hey @Badhi, I moved this to our new repo.

I tried reproducing the issue you're seeing, but I can't get the same misaligned-address error. What GPU are you running on?

https://godbolt.org/z/1W8nrsdvd

@griwes
Collaborator

griwes commented Sep 11, 2023

The example in the programming guide has a bug: it uses the trait itself instead of the type the trait provides (i.e., it is missing ::type after aligned_storage).

@Badhi
Author

Badhi commented Sep 11, 2023

@jrhemstad, apologies. The compile command I used was:

nvcc -gencode arch=compute_80,code=sm_80

@griwes, thank you. Using ::type fixed the issue.

@Badhi Badhi closed this as completed Sep 11, 2023