
[BUG]: Can't get correct result when using cub in CUDA 12.0 #668

Closed
YuanRisheng opened this issue Nov 7, 2023 · 9 comments
Labels
bug Something isn't working right.

Comments

Type of Bug

Something else

Component

CUB

Describe the bug

I get an incorrect result when I use cub::DeviceReduce::Reduce in CUDA 12.0. The error occurs only when building a shared target. This bug was solved before in NVIDIA/cub#719 by using CCCL, but I found the problem is easily reproduced when I add the CUDA include path to my CMakeLists.txt.

How to Reproduce

This is my code:

test_main.cc

#include <iostream>

extern float Reduce1024x100();
int main() {
  auto ret = Reduce1024x100();
  std::cout << ret << std::endl;
  return 0;
}

test_functor.h

#include <iostream>
#include <stdexcept>  // std::runtime_error, used by CUDA_CHECK
#include <string>     // std::string / std::to_string, used by CUDA_CHECK
#include <vector>
#include "cub/cub.cuh"
#include "cuda.h"

#define CUDA_CHECK(__x)                                       \
  do {                                                        \
    auto __cond = (__x);                                      \
    if (__cond != cudaSuccess) {                              \
      auto __msg = std::string(#__x) + " " + __FILE__ + ":" + \
                   std::to_string(__LINE__) + ": " +          \
                   cudaGetErrorString(__cond) + " , code " +  \
                   std::to_string(__cond);                    \
      throw std::runtime_error(__msg);                        \
    }                                                         \
  } while (0)

template <typename Tx, typename Ty = Tx>
struct IdentityFunctor {
  __host__ inline IdentityFunctor() {}

  __device__ explicit inline IdentityFunctor(int n) {}

  __device__ inline Ty operator()(const Tx x) const {
    return static_cast<Ty>(x);
  }
};

template <typename T>
struct AddFunctor {
  inline T initial() { return static_cast<T>(0.0f); }

  __device__ T operator()(const T a, const T b) const { return b + a; }
};

template <typename T>
T *CudaMalloc(size_t n) {
  if (n == 0) return nullptr;
  T *p = nullptr;
  CUDA_CHECK(cudaMalloc(&p, n * sizeof(T)));
  return p;
}

test_moduleA.cu

#include "test_functor.h"

void CudaFree(void *p) {
  if (p == nullptr) return;
  CUDA_CHECK(cudaFree(p));
}
template <typename T>
T TestMain(const std::vector<T> &cpu) {
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  size_t n = cpu.size();
  auto *gpu = CudaMalloc<T>(n);
  CUDA_CHECK(cudaMemcpyAsync(
      gpu, cpu.data(), n * sizeof(T), cudaMemcpyHostToDevice, stream));
  auto *gpu_ret = CudaMalloc<T>(1);
  auto addf = AddFunctor<float>();
  auto trans = IdentityFunctor<float, float>(n);
  cub::TransformInputIterator<float,
                              IdentityFunctor<float, float>,
                              const float *>
      trans_x(gpu, trans);
  size_t tmp_bytes;
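  // First call with a null d_temp_storage pointer only computes the
  // required temporary storage size in tmp_bytes; no reduction runs yet.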
  CUDA_CHECK(cub::DeviceReduce::Reduce(
      nullptr, tmp_bytes, trans_x, gpu_ret, n, addf, 0.0f, stream));
  std::cout << "tmp_bytes:" << tmp_bytes << std::endl;
  uint8_t *gpu_tmp = CudaMalloc<uint8_t>(tmp_bytes);
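  // Second call with the allocated workspace performs the actual reduction.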
  CUDA_CHECK(cub::DeviceReduce::Reduce(
      gpu_tmp, tmp_bytes, trans_x, gpu_ret, n, addf, 0.0f, stream));

  T cpu_ret;
  CUDA_CHECK(cudaMemcpyAsync(
      &cpu_ret, gpu_ret, sizeof(T), cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaStreamDestroy(stream));

  CudaFree(gpu);
  CudaFree(gpu_ret);
  CudaFree(gpu_tmp);
  return cpu_ret;
}

float Reduce1024x100() {
  std::cout << "CUB version : " << CUB_VERSION << std::endl;
  std::vector<float> data(1024 * 100, 1);
  auto ret = TestMain(data);
  return ret;
}

void UseModuleA() {}

test_moduleB.cu

#include "test_functor.h"
extern void UseModuleA();

float NoUse() {
  cudaStream_t stream;
  auto *gpu = CudaMalloc<float>(1);
  auto *gpu_ret = CudaMalloc<float>(1);
  auto addf = AddFunctor<float>();
  auto trans = IdentityFunctor<float, float>();
  cub::TransformInputIterator<float,
                              IdentityFunctor<float, float>,
                              const float *>
      trans_x(gpu, trans);
  size_t tmp_arg = 0;
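  // d_temp_storage is null, so this call only queries the workspace size;
  // note that the stream is never initialized in this function.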
  cub::DeviceReduce::Reduce(
      nullptr, tmp_arg, trans_x, gpu_ret, tmp_arg, addf, 0.0f, stream);

  UseModuleA();
  return 1;
}

CMakeLists.txt:

cmake_minimum_required(VERSION 3.18)

include_directories("/usr/local/cuda/include")   # I need add cuda path in my project, but when I open this code, I will get error result. It seems that the CCCL becomes unavaiable.

set(CPM_DOWNLOAD_VERSION 0.34.0)

if(CPM_SOURCE_CACHE)
  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
else()
  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
  file(DOWNLOAD
       https://github.com/TheLartians/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
       ${CPM_DOWNLOAD_LOCATION}
  )
endif()

include(${CPM_DOWNLOAD_LOCATION})

# This will automatically clone CCCL from GitHub and make the exported cmake targets available
CPMAddPackage(
    NAME CCCL
    GITHUB_REPOSITORY nvidia/cccl
    GIT_TAG main # Fetches the latest commit on the main branch
)

project(test LANGUAGES CUDA CXX)
set(CMAKE_CUDA_ARCHITECTURES "80")

add_library(test_moduleA SHARED test_moduleA.cu)
add_library(test_moduleB SHARED test_moduleB.cu)
target_link_libraries(test_moduleB test_moduleA CCCL::CCCL)
add_executable(test_main test_main.cc)
target_link_libraries(test_main test_moduleB)

Please use cmake -DCMAKE_BUILD_TYPE=Release .. when building the targets.

Please help me deal with this issue. Thank you!

Expected behavior

The error result: (screenshot)

The expected result: (screenshot; the correct sum of 1024 × 100 ones is 102400)

Reproduction link

No response

Operating System

No response

nvidia-smi output

(screenshot)

NVCC version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Jan__6_16:45:21_PST_2023
Cuda compilation tools, release 12.0, V12.0.140
Build cuda_12.0.r12.0/compiler.32267302_0

github-actions bot (Contributor) commented Nov 7, 2023

Hi @YuanRisheng!

Thanks for submitting this issue - the CCCL team has been notified and we'll get back to you as soon as we can!
In the meantime, feel free to add any relevant information to this issue.

@YuanRisheng (Author) commented Nov 7, 2023

@gevtushenko Please help me again. I have reproduced this bug (NVIDIA/cub#719): I get an incorrect result when I add include_directories("/usr/local/cuda/include") to my project's CMakeLists.

@miscco (Collaborator) commented Nov 7, 2023

The issue is that you are pulling in the CCCL sources from your CTK. That means we cannot do anything here, as that version has already shipped. You need to actually pull in the CCCL sources from GitHub and use them in your project.

@jrhemstad (Collaborator) commented

@YuanRisheng to provide more detail, when you add:

include_directories("/usr/local/cuda/include")

it will override the include paths that are configured when you do:

target_link_libraries(test_moduleB test_moduleA CCCL::CCCL)

So instead of picking up the version of CCCL downloaded from GitHub, it will use the version included in your CUDA Toolkit. The version in your CTK does not have the fix you need.

Can you tell us more about why you need to do:

include_directories("/usr/local/cuda/include")

That's generally considered an anti-pattern in cmake these days and using target properties is preferred.

Furthermore, this shouldn't be needed because when you do:

project(test LANGUAGES CUDA CXX)

CMake will automatically detect and setup the include paths to your CTK.
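
A minimal sketch of what that implies for the CMakeLists above (my illustration, not part of the original thread): drop the global include_directories line entirely and rely on the CUDA language support plus the CCCL::CCCL usage requirements.

# Sketch: the original CMakeLists without the global include_directories line.
# project(... CUDA ...) lets CMake configure the CTK include paths, and the
# CCCL::CCCL usage requirements supply the CPM-fetched CCCL headers.
project(test LANGUAGES CUDA CXX)
set(CMAKE_CUDA_ARCHITECTURES "80")

add_library(test_moduleA SHARED test_moduleA.cu)
add_library(test_moduleB SHARED test_moduleB.cu)
target_link_libraries(test_moduleB test_moduleA CCCL::CCCL)
add_executable(test_main test_main.cc)
target_link_libraries(test_main test_moduleB)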

@YuanRisheng (Author) commented

> That's generally considered an anti-pattern in cmake these days and using target properties is preferred.

@jrhemstad Thank you for your reply. I need to add:

include_directories("/usr/local/cuda/include")

because my project uses other header files from the CUDA Toolkit path (my actual project is large and needs headers such as /usr/local/cuda/include/cupti.h). But in my environment it seems I can only pick one of CCCL and the CTK (whose cub has the bug). How can I write my CMake code so that it can use both CCCL and the CTK, or do I need to update the CTK in my environment?

@jrhemstad (Collaborator) commented Nov 8, 2023

do I need update CTK in my environment?

You do not need to update your CTK. Upgrading CCCL without upgrading your CTK is well supported.

Let's use your test_moduleA.cu as an example. When you are building a CUDA application using libraries from the CUDA Toolkit, the best practice is to do the following:

project(test LANGUAGES CUDA) # Enable the CUDA language in cmake
find_package(CUDAToolkit REQUIRED) # Bring in the CTK library targets

With this, cmake does a number of things for you automatically. Most importantly, it defines a bunch of targets for various components of the CTK, like CUDA::cublas for cuBLAS or CUDA::cupti for cupti.

So the best practice for doing what you're trying to do is the following:

project(test LANGUAGES CUDA CXX)
find_package(CUDAToolkit REQUIRED)
add_library(test_moduleA SHARED test_moduleA.cu)

CPMAddPackage(
    NAME CCCL
    GITHUB_REPOSITORY nvidia/cccl
    GIT_TAG main # Fetches the latest commit on the main branch
)

target_link_libraries(test_moduleA CUDA::cupti CCCL::CCCL)

Note how in addition to linking the CCCL::CCCL target, it also links the CUDA::cupti target. If there are other CTK components you're relying on, you can link those as well.

@YuanRisheng (Author) commented

> You do not need to update your CTK. Upgrading CCCL without upgrading your CTK is well supported.

@jrhemstad Thanks for your solution! It inspired me, and I came up with a simpler one. I only need to add the code below:

include_directories(BEFORE "${CCCL_SOURCE_DIR}/cub")
include_directories(BEFORE "${CCCL_SOURCE_DIR}/libcudacxx/include")
include_directories(BEFORE "${CCCL_SOURCE_DIR}/thrust")

This makes the CCCL paths override the CUDA path and fixes my bug. Thanks a lot!

@jrhemstad (Collaborator) commented

@YuanRisheng while that will likely work, I recommend against using include_directories like this. The best practice in modern cmake is to always use target properties.
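
For reference, a target-scoped sketch of the same override (a hypothetical adaptation, reusing the target names and the ${CCCL_SOURCE_DIR} variable from the CMakeLists above):

# Target-property version of the global include_directories(BEFORE ...) fix:
# prepend the CPM-fetched CCCL headers only for the targets that compile CUB code.
foreach(tgt test_moduleA test_moduleB)
  target_include_directories(${tgt} BEFORE PRIVATE
      "${CCCL_SOURCE_DIR}/cub"
      "${CCCL_SOURCE_DIR}/libcudacxx/include"
      "${CCCL_SOURCE_DIR}/thrust")
endforeach()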

@YuanRisheng (Author) commented

> @YuanRisheng while that will likely work, I recommend against using include_directories like this. The best practice in modern cmake is to always use target properties.

@jrhemstad I tried that, but got errors where my source code couldn't find some header files. Maybe I need to refactor my CMake code. I will try this best practice again later. Thanks for your suggestion!
