From 5a4881b5001f8103b6e20bfd200a8083f13e07f2 Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko
Date: Wed, 21 Aug 2024 07:45:55 -0700
Subject: [PATCH] Implement C API for device reduction (#2256)

* Implement C device reduce
* Format
* Fix device-specific module loading
* Don't need a context at build step now
* Address review feedback
---
 CMakeLists.txt          |   5 +
 CMakePresets.json       |   1 +
 c/CMakeLists.txt        |  20 +
 c/include/cccl/reduce.h |  59 +++
 c/include/cccl/types.h  |  85 ++++
 c/src/reduce.cu         | 857 ++++++++++++++++++++++++++++++++++++++++
 c/test/CMakeLists.txt   |   9 +
 c/test/c2h.h            | 310 +++++++++++++++
 c/test/test_main.cpp    |  55 +++
 c/test/test_reduce.cpp  | 285 +++++++++++++
 10 files changed, 1686 insertions(+)
 create mode 100644 c/CMakeLists.txt
 create mode 100644 c/include/cccl/reduce.h
 create mode 100644 c/include/cccl/types.h
 create mode 100644 c/src/reduce.cu
 create mode 100644 c/test/CMakeLists.txt
 create mode 100644 c/test/c2h.h
 create mode 100644 c/test/test_main.cpp
 create mode 100644 c/test/test_reduce.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 198727dc5d..015b216a8d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,6 +38,7 @@ option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." ${CCCL_TOPLEVEL_P
 option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." ${CCCL_TOPLEVEL_PROJECT})
 option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." ${CCCL_TOPLEVEL_PROJECT})
 option(CCCL_ENABLE_BENCHMARKS "Enable CUDA C++ Core Library benchmarks." OFF)
+option(CCCL_ENABLE_C "Enable CUDA C Core Library." OFF)
 
 option(CCCL_ENABLE_UNSTABLE "Enable targets and developer build options for unstable projects." OFF)
 
@@ -77,6 +78,10 @@ if (CCCL_ENABLE_UNSTABLE)
   add_subdirectory(cudax)
 endif()
 
+if (CCCL_ENABLE_C)
+  add_subdirectory(c)
+endif()
+
 if (CCCL_ENABLE_TESTING)
   add_subdirectory(test)
 endif()
diff --git a/CMakePresets.json b/CMakePresets.json
index 61cb88eca8..bd9374778b 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -38,6 +38,7 @@
       "CCCL_ENABLE_TESTING": true,
       "CCCL_ENABLE_EXAMPLES": true,
       "CCCL_ENABLE_BENCHMARKS": true,
+      "CCCL_ENABLE_C": true,
       "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true,
       "CUB_ENABLE_TESTING": true,
       "CUB_ENABLE_EXAMPLES": true,
diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt
new file mode 100644
index 0000000000..3e3783903b
--- /dev/null
+++ b/c/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.30)
+
+project(cccl.c LANGUAGES CUDA CXX)
+
+add_library(cccl.c SHARED src/reduce.cu)
+set_property(TARGET cccl.c PROPERTY POSITION_INDEPENDENT_CODE ON)
+set_property(TARGET cccl.c PROPERTY CXX_STANDARD 20)
+set_property(TARGET cccl.c PROPERTY CUDA_STANDARD 20)
+
+find_package(CUDAToolkit REQUIRED)
+
+# TODO Use static versions of cudart, nvrtc, and nvJitLink
+target_link_libraries(cccl.c PRIVATE CUDA::cudart
+                                     CUDA::nvrtc
+                                     CUDA::nvJitLink
+                                     CUDA::cuda_driver)
+target_compile_definitions(cccl.c PRIVATE NVRTC_GET_TYPE_NAME=1 CCCL_C_EXPERIMENTAL=1)
+target_include_directories(cccl.c PUBLIC "include")
+
+add_subdirectory(test)
diff --git a/c/include/cccl/reduce.h b/c/include/cccl/reduce.h
new file mode 100644
index 0000000000..5047625a85
--- /dev/null
+++ b/c/include/cccl/reduce.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#ifndef CCCL_C_EXPERIMENTAL
+#  warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning."
+#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv
+
+#  include <cuda.h>
+
+#  include <cccl/types.h>
+
+struct cccl_device_reduce_build_result_t
+{
+  int cc;
+  void* cubin;
+  size_t cubin_size;
+  CUlibrary library;
+  CUkernel single_tile_kernel;
+  CUkernel single_tile_second_kernel;
+  CUkernel reduction_kernel;
+};
+
+// TODO return a union of nvtx/cuda/nvrtc errors or a string?
+extern "C" CCCL_C_API CUresult cccl_device_reduce_build(
+  cccl_device_reduce_build_result_t* build,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc_major,
+  int cc_minor,
+  const char* cub_path,
+  const char* thrust_path,
+  const char* libcudacxx_path,
+  const char* ctk_path) noexcept;
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce(
+  cccl_device_reduce_build_result_t build,
+  void* d_temp_storage,
+  size_t* temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  CUstream stream) noexcept;
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr);
+
+#endif // CCCL_C_EXPERIMENTAL
diff --git a/c/include/cccl/types.h b/c/include/cccl/types.h
new file mode 100644
index 0000000000..6b19848de4
--- /dev/null
+++ b/c/include/cccl/types.h
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#ifndef CCCL_C_EXPERIMENTAL
+#  warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning."
+#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv
+
+#  if defined(_WIN32)
+#    define CCCL_C_API __declspec(dllexport)
+#  else
+#    define CCCL_C_API __attribute__((visibility("default")))
+#  endif
+
+enum class cccl_type_enum
+{
+  INT8    = 0,
+  INT16   = 1,
+  INT32   = 2,
+  INT64   = 3,
+  UINT8   = 4,
+  UINT16  = 5,
+  UINT32  = 6,
+  UINT64  = 7,
+  FLOAT32 = 8,
+  FLOAT64 = 9,
+  STORAGE = 10
+};
+
+struct cccl_type_info
+{
+  int size;
+  int alignment;
+  cccl_type_enum type;
+};
+
+enum class cccl_op_kind_t
+{
+  stateless,
+  stateful
+};
+
+struct cccl_op_t
+{
+  cccl_op_kind_t type;
+  const char* name;
+  const char* ltoir;
+  int ltoir_size;
+  int size;
+  int alignment;
+  void* state;
+};
+
+enum class cccl_iterator_kind_t
+{
+  pointer,
+  iterator
+};
+
+struct cccl_value_t
+{
+  cccl_type_info type;
+  void* state;
+};
+
+struct cccl_iterator_t
+{
+  int size;
+  int alignment;
+  cccl_iterator_kind_t type;
+  cccl_op_t advance;
+  cccl_op_t dereference;
+  cccl_type_info value_type;
+  void* state;
+};
+
+#endif // CCCL_C_EXPERIMENTAL
diff --git a/c/src/reduce.cu b/c/src/reduce.cu
new file mode 100644
index 0000000000..4ddcdf2f1a
--- /dev/null
+++ b/c/src/reduce.cu
@@ -0,0 +1,857 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cub/grid/grid_even_share.cuh>
+#include <cub/util_debug.cuh>
+#include <cub/util_device.cuh>
+
+#include <cuda/std/functional>
+#include <type_traits>
+
+#include <cstring>
+#include <format>
+#include <iostream>
+#include <memory>
+
+#include <cccl/reduce.h>
+#include <nvJitLink.h>
+#include <nvrtc.h>
+
+void check(nvrtcResult result)
+{
+  if (result != NVRTC_SUCCESS)
+  {
+    throw std::runtime_error(std::string("NVRTC error: ") + nvrtcGetErrorString(result));
+  }
+}
+
+void check(CUresult result)
+{
+  if (result != CUDA_SUCCESS)
+  {
+    const char* str = nullptr;
+    cuGetErrorString(result, &str);
+    throw std::runtime_error(std::string("CUDA error: ") + str);
+  }
+}
+
+void check(nvJitLinkResult result)
+{
+  if (result != NVJITLINK_SUCCESS)
+  {
+    throw std::runtime_error(std::string("nvJitLink error: ") + std::to_string(result));
+  }
+}
+
+struct op_wrapper;
+struct device_reduce_policy;
+using TransformOpT = ::cuda::std::__identity;
+using OffsetT      = unsigned long long;
+static_assert(std::is_same_v<std::size_t, OffsetT>, "OffsetT must be size_t");
+
+struct nothing_t
+{};
+
+struct runtime_tuning_policy
+{
+  int block_size;
+  int items_per_thread;
+  int vector_load_length;
+};
+
+struct storage_t;
+struct input_iterator_state_t;
+struct output_iterator_t;
+
+char const* cccl_type_enum_to_string(cccl_type_enum type)
+{
+  switch (type)
+  {
+    case cccl_type_enum::INT8:
+      return "::cuda::std::int8_t";
+    case cccl_type_enum::INT16:
+      return "::cuda::std::int16_t";
+    case cccl_type_enum::INT32:
+      return "::cuda::std::int32_t";
+    case cccl_type_enum::INT64:
+      return "::cuda::std::int64_t";
+    case cccl_type_enum::UINT8:
+      return "::cuda::std::uint8_t";
+    case cccl_type_enum::UINT16:
+      return "::cuda::std::uint16_t";
+    case cccl_type_enum::UINT32:
+      return "::cuda::std::uint32_t";
+    case cccl_type_enum::UINT64:
+      return "::cuda::std::uint64_t";
+    case cccl_type_enum::FLOAT32:
+      return "float";
+    case cccl_type_enum::FLOAT64:
+      return "double";
+    case cccl_type_enum::STORAGE:
+      return "storage_t";
+  }
+  return "unknown";
+}
+
+std::string cccl_type_enum_to_name(cccl_type_enum type, bool is_pointer = false)
+{
+  std::string result;
+
+  if (is_pointer)
+  {
+    switch (type)
+    {
+      case cccl_type_enum::INT8:
+        check(nvrtcGetTypeName<::cuda::std::int8_t*>(&result));
+        break;
+      case cccl_type_enum::INT16:
+        check(nvrtcGetTypeName<::cuda::std::int16_t*>(&result));
+        break;
+      case cccl_type_enum::INT32:
+        check(nvrtcGetTypeName<::cuda::std::int32_t*>(&result));
+        break;
+      case cccl_type_enum::INT64:
+        check(nvrtcGetTypeName<::cuda::std::int64_t*>(&result));
+        break;
+      case cccl_type_enum::UINT8:
+        check(nvrtcGetTypeName<::cuda::std::uint8_t*>(&result));
+        break;
+      case cccl_type_enum::UINT16:
+        check(nvrtcGetTypeName<::cuda::std::uint16_t*>(&result));
+        break;
+      case cccl_type_enum::UINT32:
+        check(nvrtcGetTypeName<::cuda::std::uint32_t*>(&result));
+        break;
+      case cccl_type_enum::UINT64:
+        check(nvrtcGetTypeName<::cuda::std::uint64_t*>(&result));
+        break;
+      case cccl_type_enum::FLOAT32:
+        check(nvrtcGetTypeName<float*>(&result));
+        break;
+      case cccl_type_enum::FLOAT64:
+        check(nvrtcGetTypeName<double*>(&result));
+        break;
+      case cccl_type_enum::STORAGE:
+        check(nvrtcGetTypeName<storage_t*>(&result));
+        break;
+    }
+  }
+  else
+  {
+    switch (type)
+    {
+      case cccl_type_enum::INT8:
+        check(nvrtcGetTypeName<::cuda::std::int8_t>(&result));
+        break;
+      case cccl_type_enum::INT16:
+        check(nvrtcGetTypeName<::cuda::std::int16_t>(&result));
+        break;
+      case cccl_type_enum::INT32:
+        check(nvrtcGetTypeName<::cuda::std::int32_t>(&result));
+        break;
+      case cccl_type_enum::INT64:
+        check(nvrtcGetTypeName<::cuda::std::int64_t>(&result));
+        break;
+      case cccl_type_enum::UINT8:
+        check(nvrtcGetTypeName<::cuda::std::uint8_t>(&result));
+        break;
+      case cccl_type_enum::UINT16:
+        check(nvrtcGetTypeName<::cuda::std::uint16_t>(&result));
+        break;
+      case cccl_type_enum::UINT32:
+        check(nvrtcGetTypeName<::cuda::std::uint32_t>(&result));
+        break;
+      case cccl_type_enum::UINT64:
+        check(nvrtcGetTypeName<::cuda::std::uint64_t>(&result));
+        break;
+      case cccl_type_enum::FLOAT32:
+        check(nvrtcGetTypeName<float>(&result));
+        break;
+      case cccl_type_enum::FLOAT64:
+        check(nvrtcGetTypeName<double>(&result));
+        break;
+      case cccl_type_enum::STORAGE:
+        check(nvrtcGetTypeName<storage_t>(&result));
+        break;
+    }
+  }
+
+  return result;
+}
+
+struct reduce_tuning_t
+{
+  int cc;
+  int block_size;
+  int items_per_thread;
+  int vector_load_length;
+};
+
+template <int N>
+reduce_tuning_t find_tuning(int cc, const reduce_tuning_t (&tunings)[N])
+{
+  for (const reduce_tuning_t& tuning : tunings)
+  {
+    if (cc >= tuning.cc)
+    {
+      return tuning;
+    }
+  }
+
+  return tunings[N - 1];
+}
+
+runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type, cccl_type_info input_type)
+{
+  reduce_tuning_t chain[] = {{60, 256, 16, 4}, {35, 256, 20, 4}};
+
+  auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain);
+
+  // Implement part of MemBoundScaling
+  items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2));
+  block_size       = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
+
+  return {block_size, items_per_thread, vector_load_length};
+}
+
+cccl_type_info get_accumulator_type(cccl_op_t op, cccl_iterator_t input_it, cccl_value_t init)
+{
+  // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet
+  // so switching back to the old accumulator type logic for now
+  return init.type;
+}
+
+cudaError_t InvokeSingleTile(
+  void* d_temp_storage,
+  std::size_t& temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc,
+  CUfunction single_tile_kernel,
+  CUstream stream)
+{
+  const runtime_tuning_policy policy = get_policy(cc, d_in.value_type, d_in.value_type);
+
+  cudaError error = cudaSuccess;
+  do
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      break;
+    }
+
+    nothing_t nothing{};
+    TransformOpT transform_op{};
+    void* op_state = op.type == cccl_op_kind_t::stateless ? &nothing : op.state;
+    void* in_ptr   = d_in.type == cccl_iterator_kind_t::pointer ? &d_in.state : d_in.state;
+    void* out_ptr  = d_out.type == cccl_iterator_kind_t::pointer ? &d_out.state : d_out.state;
+    void* args[]   = {in_ptr, out_ptr, &num_items, op_state, init.state, &transform_op};
+
+    check(cuLaunchKernel(single_tile_kernel, 1, 1, 1, policy.block_size, 1, 1, 0, stream, args, 0));
+
+    // Check for failure to launch
+    error = CubDebug(cudaPeekAtLastError());
+    if (cudaSuccess != error)
+    {
+      break;
+    }
+  } while (0);
+
+  return error;
+}
+
+cudaError_t InvokePasses(
+  void* d_temp_storage,
+  std::size_t& temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc,
+  CUfunction reduce_kernel,
+  CUfunction single_tile_kernel,
+  CUdevice device,
+  CUstream stream)
+{
+  const cccl_type_info accum_t       = get_accumulator_type(op, d_in, init);
+  const runtime_tuning_policy policy = get_policy(cc, accum_t, d_in.value_type);
+
+  cudaError error = cudaSuccess;
+  do
+  {
+    void* in_ptr  = d_in.type == cccl_iterator_kind_t::pointer ? &d_in.state : d_in.state;
+    void* out_ptr = d_out.type == cccl_iterator_kind_t::pointer ? &d_out.state : d_out.state;
+
+    // Get SM count
+    int sm_count;
+    check(cuDeviceGetAttribute(&sm_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+
+    // Init regular kernel configuration
+    const auto tile_size = policy.block_size * policy.items_per_thread;
+
+    int sm_occupancy = 1;
+    check(cuOccupancyMaxActiveBlocksPerMultiprocessor(&sm_occupancy, reduce_kernel, policy.block_size, 0));
+
+    int reduce_device_occupancy = sm_occupancy * sm_count;
+
+    // Even-share work distribution
+    int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0);
+    cub::GridEvenShare<OffsetT> even_share;
+    even_share.DispatchInit(num_items, max_blocks, tile_size);
+
+    // Temporary storage allocation requirements
+    void* allocations[1]       = {};
+    size_t allocation_sizes[1] = {
+      max_blocks * static_cast<size_t>(accum_t.size) // bytes needed for privatized block reductions
+    };
+
+    // Alias the temporary allocations from the single storage blob (or
+    // compute the necessary size of the blob)
+    error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
+    if (cudaSuccess != error)
+    {
+      break;
+    }
+
+    if (d_temp_storage == nullptr)
+    {
+      // Return if the caller is simply requesting the size of the storage
+      // allocation
+      return cudaSuccess;
+    }
+
+    // Get grid size for device_reduce_sweep_kernel
+    OffsetT reduce_grid_size = even_share.grid_size;
+
+    // Invoke DeviceReduceKernel
+    // reduce_kernel<<<reduce_grid_size, policy.block_size, 0, stream>>>(
+    //   d_in, d_block_reductions, num_items, even_share, ReductionOpT{}, TransformOpT{});
+
+    nothing_t nothing{};
+    void* op_state = op.type == cccl_op_kind_t::stateless ? &nothing : op.state;
+
+    TransformOpT transform_op{};
+    void* reduce_args[] = {in_ptr, &allocations[0], &num_items, &even_share, op_state, &transform_op};
+
+    check(cuLaunchKernel(reduce_kernel, reduce_grid_size, 1, 1, policy.block_size, 1, 1, 0, stream, reduce_args, 0));
+
+    // Check for failure to launch
+    error = CubDebug(cudaPeekAtLastError());
+    if (cudaSuccess != error)
+    {
+      break;
+    }
+
+    // single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS>>>(
+    //   d_block_reductions, d_out, reduce_grid_size, ReductionOpT{}, 0, TransformOpT{});
+
+    void* single_tile_kernel_args[] = {&allocations[0], out_ptr, &reduce_grid_size, op_state, init.state, &transform_op};
+
+    check(cuLaunchKernel(single_tile_kernel, 1, 1, 1, policy.block_size, 1, 1, 0, stream, single_tile_kernel_args, 0));
+
+    // Check for failure to launch
+    error = CubDebug(cudaPeekAtLastError());
+    if (cudaSuccess != error)
+    {
+      break;
+    }
+  } while (0);
+
+  return error;
+}
+
+cudaError_t Invoke(
+  void* d_temp_storage,
+  std::size_t& temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  OffsetT num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc,
+  CUfunction single_tile_kernel,
+  CUfunction single_tile_second_kernel,
+  CUfunction reduce_kernel,
+  CUdevice device,
+  CUstream stream)
+{
+  const cccl_type_info accum_t = get_accumulator_type(op, d_in, init);
+  runtime_tuning_policy policy = get_policy(cc, accum_t, d_in.value_type);
+
+  // Force kernel code-generation in all compiler passes
+  if (num_items <= (policy.block_size * policy.items_per_thread))
+  {
+    // Small, single tile size
+    return InvokeSingleTile(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, op, init, cc, single_tile_kernel, stream);
+  }
+  else
+  {
+    // Multi-tile pass
+    return InvokePasses(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      num_items,
+      op,
+      init,
+      cc,
+      reduce_kernel,
+      single_tile_second_kernel,
+      device,
+      stream);
+  }
+}
+
+std::string get_input_iterator_name()
+{
+  std::string iterator_t;
+  check(nvrtcGetTypeName<input_iterator_state_t>(&iterator_t));
+  return iterator_t;
+}
+
+std::string get_output_iterator_name()
+{
+  std::string iterator_t;
+  check(nvrtcGetTypeName<output_iterator_t>(&iterator_t));
+  return iterator_t;
+}
+
+std::string get_single_tile_kernel_name(
+  cccl_iterator_t input_it, cccl_iterator_t output_it, cccl_op_t op, cccl_value_t init, bool is_second_kernel)
+{
+  std::string chained_policy_t;
+  check(nvrtcGetTypeName<device_reduce_policy>(&chained_policy_t));
+
+  const cccl_type_info accum_t  = get_accumulator_type(op, input_it, init);
+  const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type);
+  const std::string input_iterator_t =
+    is_second_kernel ? cccl_type_enum_to_name(accum_t.type, true)
+    : input_it.type == cccl_iterator_kind_t::pointer //
+      ? cccl_type_enum_to_name(input_it.value_type.type, true) //
+      : get_input_iterator_name();
+  const std::string output_iterator_t =
+    output_it.type == cccl_iterator_kind_t::pointer //
+      ? cccl_type_enum_to_name(output_it.value_type.type, true) //
+      : get_output_iterator_name();
+  const std::string init_t = cccl_type_enum_to_name(init.type.type);
+
+  std::string offset_t;
+  check(nvrtcGetTypeName<OffsetT>(&offset_t));
+
+  std::string reduction_op_t;
+  check(nvrtcGetTypeName<op_wrapper>(&reduction_op_t));
+
+  return std::format(
+    "cub::DeviceReduceSingleTileKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}>",
+    chained_policy_t,
+    input_iterator_t,
+    output_iterator_t,
+    offset_t,
+    reduction_op_t,
+    init_t,
+    accum_cpp_t);
+}
+
+std::string get_device_reduce_kernel_name(cccl_op_t op, cccl_iterator_t input_it, cccl_value_t init)
+{
+  std::string chained_policy_t;
+  check(nvrtcGetTypeName<device_reduce_policy>(&chained_policy_t));
+
+  const std::string input_iterator_t =
+    input_it.type == cccl_iterator_kind_t::pointer //
+      ? cccl_type_enum_to_name(input_it.value_type.type, true) //
+      : get_input_iterator_name();
+
+  const std::string accum_t = cccl_type_enum_to_name(get_accumulator_type(op, input_it, init).type);
+
+  std::string offset_t;
+  check(nvrtcGetTypeName<OffsetT>(&offset_t));
+
+  std::string reduction_op_t;
+  check(nvrtcGetTypeName<op_wrapper>(&reduction_op_t));
+
+  std::string transform_op_t;
+  check(nvrtcGetTypeName<TransformOpT>(&transform_op_t));
+
+  return std::format(
+    "cub::DeviceReduceKernel<{0}, {1}, {2}, {3}, {4}, {5}>",
+    chained_policy_t,
+    input_iterator_t,
+    offset_t,
+    reduction_op_t,
+    accum_t,
+    transform_op_t);
+}
+
+bool try_push_context()
+{
+  CUcontext context = nullptr;
+
+  check(cuCtxGetCurrent(&context));
+
+  if (context == nullptr)
+  {
+    const int default_device = 0;
+    check(cuDevicePrimaryCtxRetain(&context, default_device));
+    check(cuCtxPushCurrent(context));
+
+    return true;
+  }
+
+  return false;
+}
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce_build(
+  cccl_device_reduce_build_result_t* build,
+  cccl_iterator_t input_it,
+  cccl_iterator_t output_it,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc_major,
+  int cc_minor,
+  const char* cub_path,
+  const char* thrust_path,
+  const char* libcudacxx_path,
+  const char* ctk_path) noexcept
+{
+  CUresult error = CUDA_SUCCESS;
+
+  try
+  {
+    nvrtcProgram prog{};
+    const char* name = "test";
+
+    const int cc                       = cc_major * 10 + cc_minor;
+    const cccl_type_info accum_t       = get_accumulator_type(op, input_it, init);
+    const std::string accum_cpp        = cccl_type_enum_to_string(accum_t.type);
+    const runtime_tuning_policy policy = get_policy(cc, accum_t, input_it.value_type);
+    const std::string input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type);
+    const std::string offset_t        = cccl_type_enum_to_string(cccl_type_enum::UINT64);
+
+    const std::string input_iterator_src =
+      input_it.type == cccl_iterator_kind_t::pointer
+        ? std::string{}
+        : std::format(
+            "extern \"C\" __device__ {3} {4}(const void *self_ptr);\n"
+            "extern \"C\" __device__ void {5}(void *self_ptr, {0} offset);\n"
+            "struct __align__({2}) input_iterator_state_t {{\n"
+            "  using iterator_category = cuda::std::random_access_iterator_tag;\n"
+            "  using value_type = {3};\n"
+            "  using difference_type = {0};\n"
+            "  using pointer = {3}*;\n"
+            "  using reference = {3}&;\n"
+            "  __device__ value_type operator*() const {{ return {4}(this); }}\n"
+            "  __device__ input_iterator_state_t& operator+=(difference_type diff) {{\n"
+            "    {5}(this, diff);\n"
+            "    return *this;\n"
+            "  }}\n"
+            "  __device__ value_type operator[](difference_type diff) const {{\n"
+            "    return *(*this + diff);\n"
+            "  }}\n"
+            "  __device__ input_iterator_state_t operator+(difference_type diff) const {{\n"
+            "    input_iterator_state_t result = *this;\n"
+            "    result += diff;\n"
+            "    return result;\n"
+            "  }}\n"
+            "  char data[{1}];\n"
+            "}};\n",
+            offset_t, // 0
+            input_it.size, // 1
+            input_it.alignment, // 2
+            input_it_value_t, // 3
+            input_it.dereference.name, // 4
+            input_it.advance.name); // 5
+
+    const std::string output_iterator_src =
+      output_it.type == cccl_iterator_kind_t::pointer
+        ? std::string{}
+        : std::format(
+            "extern \"C\" __device__ void {2}(const void *self_ptr, {1} x);\n"
+            "extern \"C\" __device__ void {3}(void *self_ptr, {0} offset);\n"
+            "struct __align__({5}) output_iterator_state_t{{\n"
+            "  char data[{4}];\n"
+            "}};\n"
+            "struct output_iterator_proxy_t {{\n"
+            "  __device__ output_iterator_proxy_t operator=({1} x) {{\n"
+            "    {2}(&state, x);\n"
+            "    return *this;\n"
+            "  }}\n"
+            "  output_iterator_state_t state;\n"
+            "}};\n"
+            "struct output_iterator_t {{\n"
+            "  using iterator_category = cuda::std::random_access_iterator_tag;\n"
+            "  using difference_type = {0};\n"
+            "  using value_type = void;\n"
+            "  using pointer = output_iterator_proxy_t*;\n"
+            "  using reference = output_iterator_proxy_t;\n"
+            "  __device__ output_iterator_proxy_t operator*() const {{ return {{state}}; }}\n"
+            "  __device__ output_iterator_t& operator+=(difference_type diff) {{\n"
+            "    {3}(&state, diff);\n"
+            "    return *this;\n"
+            "  }}\n"
+            "  __device__ output_iterator_proxy_t operator[](difference_type diff) const {{\n"
+            "    output_iterator_t result = *this;\n"
+            "    result += diff;\n"
+            "    return {{ result.state }};\n"
+            "  }}\n"
+            "  __device__ output_iterator_t operator+(difference_type diff) const {{\n"
+            "    output_iterator_t result = *this;\n"
+            "    result += diff;\n"
+            "    return result;\n"
+            "  }}\n"
+            "  output_iterator_state_t state;\n"
+            "}};",
+            offset_t, // 0
+            accum_cpp, // 1
+            output_it.dereference.name, // 2
+            output_it.advance.name, // 3
+            output_it.size, // 4
+            output_it.alignment); // 5
+
+    const std::string op_src =
+      op.type == cccl_op_kind_t::stateless
+        ? std::format(
+            "extern \"C\" __device__ {0} {1}({0} lhs, {0} rhs);\n"
+            "struct op_wrapper {{\n"
+            "  __device__ {0} operator()({0} lhs, {0} rhs) const {{\n"
+            "    return {1}(lhs, rhs);\n"
+            "  }}\n"
+            "}};\n",
+            accum_cpp,
+            op.name)
+        : std::format(
+            "struct __align__({2}) op_state {{\n"
+            "  char data[{3}];\n"
+            "}};"
+            "extern \"C\" __device__ {0} {1}(op_state *state, {0} lhs, {0} rhs);\n"
+            "struct op_wrapper {{\n"
+            "  op_state state;\n"
+            "  __device__ {0} operator()({0} lhs, {0} rhs) {{\n"
+            "    return {1}(&state, lhs, rhs);\n"
+            "  }}\n"
+            "}};\n",
+            accum_cpp,
+            op.name,
+            op.alignment,
+            op.size);
+
+    const std::string src = std::format(
+      "#include <cub/device/dispatch/dispatch_reduce.cuh>\n"
+      "#include <cuda/std/iterator>\n"
+      "struct __align__({1}) storage_t {{\n"
+      "  char data[{0}];\n"
+      "}};\n"
+      "{4}\n"
+      "{5}\n"
+      "struct agent_policy_t {{\n"
+      "  static constexpr int ITEMS_PER_THREAD = {2};\n"
+      "  static constexpr int BLOCK_THREADS = {3};\n"
+      "  static constexpr int VECTOR_LOAD_LENGTH = {7};\n"
+      "  static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM = cub::BLOCK_REDUCE_WARP_REDUCTIONS;\n"
+      "  static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_LDG;\n"
+      "}};\n"
+      "struct device_reduce_policy {{\n"
+      "  struct ActivePolicy {{\n"
+      "    using ReducePolicy = agent_policy_t;\n"
+      "    using SingleTilePolicy = agent_policy_t;\n"
+      "  }};\n"
+      "}};\n"
+      "{6};\n",
+      input_it.value_type.size, // 0
+      input_it.value_type.alignment, // 1
+      policy.items_per_thread, // 2
+      policy.block_size, // 3
+      input_iterator_src, // 4
+      output_iterator_src, // 5
+      op_src, // 6
+      policy.vector_load_length); // 7
+
+    check(nvrtcCreateProgram(&prog, src.c_str(), name, 0, nullptr, nullptr));
+
+    std::string single_tile_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, false);
+    check(nvrtcAddNameExpression(prog, single_tile_kernel_name.c_str()));
+
+    std::string single_tile_second_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, true);
+    check(nvrtcAddNameExpression(prog, single_tile_second_kernel_name.c_str()));
+
+    std::string reduction_kernel_name = get_device_reduce_kernel_name(op, input_it, init);
+    check(nvrtcAddNameExpression(prog, reduction_kernel_name.c_str()));
+
+    const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);
+
+    constexpr int num_args = 7;
+    const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
+
+    std::size_t log_size{};
+    nvrtcResult compile_result = nvrtcCompileProgram(prog, num_args, args);
+
+    check(nvrtcGetProgramLogSize(prog, &log_size));
+
+    std::unique_ptr<char[]> log{new char[log_size]};
+    check(nvrtcGetProgramLog(prog, log.get()));
+
+    if (log_size > 1)
+    {
+      std::cerr << log.get() << std::endl;
+    }
+
+    const char* single_tile_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, single_tile_kernel_name.c_str(), &single_tile_kernel_lowered_name));
+
+    const char* single_tile_second_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, single_tile_second_kernel_name.c_str(), &single_tile_second_kernel_lowered_name));
+
+    const char* reduction_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, reduction_kernel_name.c_str(), &reduction_kernel_lowered_name));
+
+    // Copy lowered names to a std::unique_ptr to ensure they can be used after
+    // the program is destroyed
+
+    std::unique_ptr<char[]> single_tile_kernel_lowered_name_ptr{new char[strlen(single_tile_kernel_lowered_name) + 1]};
+    strcpy(single_tile_kernel_lowered_name_ptr.get(), single_tile_kernel_lowered_name);
+
+    std::unique_ptr<char[]> single_tile_second_kernel_lowered_name_ptr{
+      new char[strlen(single_tile_second_kernel_lowered_name) + 1]};
+    strcpy(single_tile_second_kernel_lowered_name_ptr.get(), single_tile_second_kernel_lowered_name);
+
+    std::unique_ptr<char[]> reduction_kernel_lowered_name_ptr{new char[strlen(reduction_kernel_lowered_name) + 1]};
+    strcpy(reduction_kernel_lowered_name_ptr.get(), reduction_kernel_lowered_name);
+
+    check(compile_result);
+
+    std::size_t ltoir_size{};
+    check(nvrtcGetLTOIRSize(prog, &ltoir_size));
+    std::unique_ptr<char[]> ltoir{new char[ltoir_size]};
+    check(nvrtcGetLTOIR(prog, ltoir.get()));
+    check(nvrtcDestroyProgram(&prog));
+
+    nvJitLinkHandle handle;
+    const char* lopts[] = {"-lto", arch.c_str()};
+    check(nvJitLinkCreate(&handle, 2, lopts));
+
+    check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, ltoir.get(), ltoir_size, name));
+    check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, op.ltoir, op.ltoir_size, name));
+
+    if (input_it.type == cccl_iterator_kind_t::iterator)
+    {
+      check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, input_it.advance.ltoir, input_it.advance.ltoir_size, name));
+      check(nvJitLinkAddData(
+        handle, NVJITLINK_INPUT_LTOIR, input_it.dereference.ltoir, input_it.dereference.ltoir_size, name));
+    }
+
+    if (output_it.type == cccl_iterator_kind_t::iterator)
+    {
+      check(
+        nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, output_it.advance.ltoir, output_it.advance.ltoir_size, name));
+      check(nvJitLinkAddData(
+        handle, NVJITLINK_INPUT_LTOIR, output_it.dereference.ltoir, output_it.dereference.ltoir_size, name));
+    }
+
+    check(nvJitLinkComplete(handle));
+
+    std::size_t cubin_size{};
+    check(nvJitLinkGetLinkedCubinSize(handle, &cubin_size));
+    std::unique_ptr<char[]> cubin{new char[cubin_size]};
+    check(nvJitLinkGetLinkedCubin(handle, cubin.get()));
+    check(nvJitLinkDestroy(&handle));
+
+    check(cuLibraryLoadData(&build->library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
+    check(cuLibraryGetKernel(&build->single_tile_kernel, build->library, single_tile_kernel_lowered_name_ptr.get()));
+    check(cuLibraryGetKernel(
+      &build->single_tile_second_kernel, build->library, single_tile_second_kernel_lowered_name_ptr.get()));
+    check(cuLibraryGetKernel(&build->reduction_kernel, build->library, reduction_kernel_lowered_name_ptr.get()));
+
+    build->cc         = cc;
+    build->cubin      = cubin.release();
+    build->cubin_size = cubin_size;
+  }
+  catch (...)
+  {
+    error = CUDA_ERROR_UNKNOWN;
+  }
+
+  return error;
+}
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce(
+  cccl_device_reduce_build_result_t build,
+  void* d_temp_storage,
+  size_t* temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  CUstream stream) noexcept
+{
+  bool pushed    = false;
+  CUresult error = CUDA_SUCCESS;
+  try
+  {
+    pushed = try_push_context();
+
+    CUdevice cu_device;
+    check(cuCtxGetDevice(&cu_device));
+
+    Invoke(
+      d_temp_storage,
+      *temp_storage_bytes,
+      d_in,
+      d_out,
+      num_items,
+      op,
+      init,
+      build.cc,
+      (CUfunction) build.single_tile_kernel,
+      (CUfunction) build.single_tile_second_kernel,
+      (CUfunction) build.reduction_kernel,
+      cu_device,
+      stream);
+  }
+  catch (...)
+  {
+    error = CUDA_ERROR_UNKNOWN;
+  }
+
+  if (pushed)
+  {
+    CUcontext dummy;
+    cuCtxPopCurrent(&dummy);
+  }
+
+  return error;
+}
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr)
+{
+  try
+  {
+    if (bld_ptr == nullptr)
+    {
+      return CUDA_ERROR_INVALID_VALUE;
+    }
+
+    std::unique_ptr<char[]> cubin(reinterpret_cast<char*>(bld_ptr->cubin));
+    check(cuLibraryUnload(bld_ptr->library));
+  }
+  catch (...)
+  {
+    return CUDA_ERROR_UNKNOWN;
+  }
+
+  return CUDA_SUCCESS;
+}
diff --git a/c/test/CMakeLists.txt b/c/test/CMakeLists.txt
new file mode 100644
index 0000000000..6a8599500e
--- /dev/null
+++ b/c/test/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(cccl.c.test.reduce test_reduce.cpp test_main.cpp)
+
+target_link_libraries(cccl.c.test.reduce PRIVATE cccl.c CUDA::cudart CUDA::nvrtc Catch2::Catch2)
+
+target_compile_definitions(cccl.c.test.reduce PRIVATE CCCL_C_EXPERIMENTAL
+                           TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub"
+                           TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust"
+                           TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include"
+                           TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}")
diff --git a/c/test/c2h.h b/c/test/c2h.h
new file mode 100644
index 0000000000..e2b26895a8
--- /dev/null
+++ b/c/test/c2h.h
@@ -0,0 +1,310 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <iterator>
+#include <memory>
+#include <random>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <catch2/catch.hpp>
+
+static std::string inspect_sass(const void* cubin, size_t cubin_size)
+{
+  namespace fs = std::filesystem;
+
+  fs::path temp_dir = fs::temp_directory_path();
+
+  fs::path temp_in_filename  = temp_dir / "temp_in_file.cubin";
+  fs::path temp_out_filename = temp_dir / "temp_out_file.sass";
+
+  std::ofstream temp_in_file(temp_in_filename, std::ios::binary);
+  if (!temp_in_file)
+  {
+    throw std::runtime_error("Failed to create temporary file.");
+  }
+
+  temp_in_file.write(static_cast<const char*>(cubin), cubin_size);
+  temp_in_file.close();
+
+  std::string command = "nvdisasm -gi ";
+  command += temp_in_filename;
+  command += " > ";
+  command += temp_out_filename;
+
+  if (std::system(command.c_str()) != 0)
+  {
+    throw std::runtime_error("Failed to execute command.");
+  }
+
+  if (!fs::remove(temp_in_filename))
+  {
+    throw std::runtime_error("Failed to remove temporary file.");
+  }
+
+  std::ifstream temp_out_file(temp_out_filename, std::ios::binary);
+  if (!temp_out_file)
+  {
+    throw std::runtime_error("Failed to open temporary file.");
+  }
+
+  const std::string sass{std::istreambuf_iterator<char>(temp_out_file), std::istreambuf_iterator<char>()};
+  if (!fs::remove(temp_out_filename))
+  {
+    throw std::runtime_error("Failed to remove temporary file.");
+  }
+
+  return sass;
+}
+
+static std::string compile(const std::string& source)
+{
+  // compile source to LTO-IR using nvrtc
+
+  nvrtcProgram prog;
+  REQUIRE(NVRTC_SUCCESS == nvrtcCreateProgram(&prog, source.c_str(), "op.cu", 0, nullptr, nullptr));
+
+  const char* options[] = {"--std=c++17", "-rdc=true", "-dlto"};
+  REQUIRE(NVRTC_SUCCESS == nvrtcCompileProgram(prog, 3, options));
+
+  std::size_t ltoir_size{};
+  REQUIRE(NVRTC_SUCCESS == nvrtcGetLTOIRSize(prog, &ltoir_size));
+
+  std::unique_ptr<char[]> ltoir(new char[ltoir_size]);
+
+  REQUIRE(NVRTC_SUCCESS == nvrtcGetLTOIR(prog, ltoir.get()));
+  REQUIRE(NVRTC_SUCCESS == nvrtcDestroyProgram(&prog));
+
+  return std::string(ltoir.get(), ltoir_size);
+}
+
+template <class T>
+std::vector<T> generate(std::size_t num_items)
+{
+  std::random_device rnd_device;
+  std::mt19937 mersenne_engine{rnd_device()}; // Generates random integers
+  std::uniform_int_distribution<T> dist{T{1}, T{42}};
+  std::vector<T> vec(num_items);
+  std::generate(vec.begin(), vec.end(), [&]() {
+    return dist(mersenne_engine);
+  });
+  return vec;
+}
+
+template <class T>
+cccl_type_info get_type_info()
+{
+  cccl_type_info info;
+  info.size      = sizeof(T);
+  info.alignment = alignof(T);
+
+  if constexpr (std::is_same_v<T, char>)
+  {
+    info.type = cccl_type_enum::INT8;
+  }
+  else if constexpr (std::is_same_v<T, std::int32_t>)
+  {
+    info.type = cccl_type_enum::INT32;
+  }
+  else if constexpr (std::is_same_v<T, std::uint32_t>)
+  {
+    info.type = cccl_type_enum::UINT32;
+  }
+  else if constexpr (std::is_same_v<T, std::int64_t>)
+  {
+    info.type = cccl_type_enum::INT64;
+  }
+  else if constexpr (std::is_same_v<T, std::uint64_t>)
+  {
+    info.type = cccl_type_enum::UINT64;
+  }
+  else if constexpr (!std::is_integral_v<T>)
+  {
+    info.type = cccl_type_enum::STORAGE;
+  }
+  else
+  {
+    static_assert(sizeof(T) == 0, "Unsupported type");
+  }
+
+  return info;
+}
+
+static std::string get_op(cccl_type_enum t)
+{
+  switch (t)
+  {
+    case cccl_type_enum::INT8:
+      return "extern \"C\" __device__ char op(char a, char b) { return a + b; }";
+    case cccl_type_enum::INT32:
+      return "extern \"C\" __device__ int op(int a, int b) { return a + b; }";
+    case cccl_type_enum::UINT32:
+      return "extern \"C\" __device__ unsigned int op(unsigned int a, unsigned int b) { return a + b; }";
+    case cccl_type_enum::INT64:
+      return "extern \"C\" __device__ long long op(long long a, long long b) { return a + b; }";
+    case cccl_type_enum::UINT64:
+      return "extern \"C\" __device__ unsigned long long op(unsigned long long a, unsigned long long b) { "
+             "  return a + b; "
+             "}";
+    default:
+      throw std::runtime_error("Unsupported type");
+  }
+  return "";
+}
+
+template <class T>
+struct pointer_t
+{
+  T* ptr{};
+
+  pointer_t(int num_items)
+  {
+    REQUIRE(cudaSuccess == cudaMalloc(&ptr, num_items * sizeof(T)));
+  }
+
+  pointer_t(const std::vector<T>& vec)
+  {
+    REQUIRE(cudaSuccess == cudaMalloc(&ptr, vec.size() * sizeof(T)));
+    REQUIRE(cudaSuccess == cudaMemcpy(ptr, vec.data(), vec.size() * sizeof(T), cudaMemcpyHostToDevice));
+  }
+
+  ~pointer_t()
+  {
+    if (ptr)
+    {
+      REQUIRE(cudaSuccess == cudaFree(ptr));
+      ptr = nullptr;
+    }
+  }
+
+  T operator[](int i) const
+  {
+    T value{};
+    REQUIRE(cudaSuccess == cudaMemcpy(&value, ptr + i, sizeof(T), cudaMemcpyDeviceToHost));
+    return value;
+  }
+
+  operator cccl_iterator_t()
+  {
+    cccl_iterator_t it;
+    it.size       = sizeof(T);
+    it.alignment  = alignof(T);
+    it.type       = cccl_iterator_kind_t::pointer;
+    it.state      = ptr;
+    it.value_type = get_type_info<T>();
+    return it;
+  }
+};
+
+struct operation_t
+{
+  std::string name;
+  std::string code;
+
+  operator cccl_op_t()
+  {
+    cccl_op_t op;
+    op.type       = cccl_op_kind_t::stateless;
+    op.name       = name.c_str();
+    op.ltoir      = code.c_str();
+    op.ltoir_size = code.size();
+    return op;
+  }
+};
+
+template <class OpT>
+struct stateful_operation_t
+{
+  OpT op_state;
+  std::string name;
+  std::string code;
+
+  operator cccl_op_t()
+  {
+    cccl_op_t op;
+    op.type      = cccl_op_kind_t::stateful;
+    op.size      = sizeof(OpT);
+    op.alignment = alignof(OpT);
+    op.state     = &op_state;
+    op.name      = name.c_str();
+    op.ltoir     = code.c_str();
+    op.ltoir_size = code.size();
+    return op;
+  }
+};
+
+static operation_t make_operation(std::string name, std::string code)
+{
+  return operation_t{name, compile(code)};
+}
+
+template <class OpT>
+static stateful_operation_t<OpT> make_operation(std::string name, std::string code, OpT op)
+{
+  return {op, name, compile(code)};
+}
+
+template <class ValueT, class StateT>
+struct iterator_t
+{
+  StateT state;
+  operation_t advance;
+  operation_t dereference;
+
+  operator cccl_iterator_t()
+  {
+    cccl_iterator_t it;
+    it.size        = sizeof(StateT);
+    it.alignment   = alignof(StateT);
+    it.type        = cccl_iterator_kind_t::iterator;
+    it.advance     = advance;
+    it.dereference = dereference;
+    it.value_type  = get_type_info<ValueT>();
+    it.state       = &state;
+    return it;
+  }
+};
+
+template <class ValueT, class StateT>
+iterator_t<ValueT, StateT> make_iterator(std::string state, operation_t advance, operation_t dereference)
+{
+  iterator_t<ValueT, StateT> it;
+  it.advance     = make_operation(advance.name, state + advance.code);
+  it.dereference = make_operation(dereference.name, state + dereference.code);
+  return it;
+}
+
+template <class T>
+struct value_t
+{
+  T value;
+
+  value_t(T value)
+      : value(value)
+  {}
+
+  operator cccl_value_t()
+  {
+    cccl_value_t v;
+    v.type  = get_type_info<T>();
+    v.state = &value;
+    return v;
+  }
+};
diff --git a/c/test/test_main.cpp b/c/test/test_main.cpp
new file mode 100644
index 0000000000..3e3b4900a5
--- /dev/null
+++ b/c/test/test_main.cpp
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda_runtime.h>
+
+#include <iostream>
+
+#define CATCH_CONFIG_RUNNER
+#include <catch2/catch.hpp>
+
+int device_guard(int device_id)
+{
+  int device_count{};
+  if (cudaGetDeviceCount(&device_count) != cudaSuccess)
+  {
+    std::cerr << "Can't query device count." << std::endl;
+    std::exit(-1);
+  }
+
+  if (device_id >= device_count || device_id < 0)
+  {
+    std::cerr << "Invalid device ID: " << device_id << std::endl;
+    std::exit(-1);
+  }
+
+  return device_id;
+}
+
+int main(int argc, char* argv[])
+{
+  Catch::Session session;
+
+  int device_id{};
+
+  // Build a new parser on top of Catch's
+  using namespace Catch::clara;
+  auto cli = session.cli() | Opt(device_id, "device")["-d"]["--device"]("device id to use");
+  session.cli(cli);
+
+  int returnCode = session.applyCommandLine(argc, argv);
+  if (returnCode != 0)
+  {
+    return returnCode;
+  }
+
+  cudaSetDevice(device_guard(device_id));
+  return session.run(argc, argv);
+}
diff --git a/c/test/test_reduce.cpp b/c/test/test_reduce.cpp
new file mode 100644
index 0000000000..1a4607702a
--- /dev/null
+++ b/c/test/test_reduce.cpp
@@ -0,0 +1,285 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <numeric>
+
+#include "c2h.h"
+
+void reduce(cccl_iterator_t input, cccl_iterator_t output, unsigned long long num_items, cccl_op_t op, cccl_value_t init)
+{
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, 0);
+
+  const int cc_major = deviceProp.major;
+  const int cc_minor = deviceProp.minor;
+
+  const char* cub_path        = TEST_CUB_PATH;
+  const char* thrust_path     = TEST_THRUST_PATH;
+  const char* libcudacxx_path = TEST_LIBCUDACXX_PATH;
+  const char* ctk_path        = TEST_CTK_PATH;
+
+  cccl_device_reduce_build_result_t build;
+  REQUIRE(CUDA_SUCCESS
+          == cccl_device_reduce_build(
+            &build, input, output, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path));
+
+  const std::string sass = inspect_sass(build.cubin, build.cubin_size);
+  REQUIRE(sass.find("LDL") == std::string::npos);
+  REQUIRE(sass.find("STL") == std::string::npos);
+
+  size_t temp_storage_bytes = 0;
+  REQUIRE(
+    CUDA_SUCCESS == cccl_device_reduce(build, nullptr, &temp_storage_bytes, input, output, num_items, op, init, 0));
+
+  pointer_t<char> temp_storage(temp_storage_bytes);
+
+  REQUIRE(CUDA_SUCCESS
+          == cccl_device_reduce(build, temp_storage.ptr, &temp_storage_bytes, input, output, num_items, op, init, 0));
+  REQUIRE(CUDA_SUCCESS == cccl_device_reduce_cleanup(&build));
+}
+
+using integral_types = std::tuple<std::int32_t, std::uint32_t, std::int64_t, std::uint64_t>;
+TEMPLATE_LIST_TEST_CASE("Reduce works with integral types", "[reduce]", integral_types)
+{
+  const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24)));
+  operation_t op      = make_operation("op", get_op(get_type_info<TestType>().type));
+  const std::vector<TestType> input = generate<TestType>(num_items);
+  pointer_t<TestType> input_ptr(input);
+  pointer_t<TestType> output_ptr(1);
+  value_t<TestType> init{TestType{42}};
+
+  reduce(input_ptr, output_ptr, num_items, op, init);
+
+  const TestType output   = output_ptr[0];
+  const TestType expected = std::accumulate(input.begin(), input.end(), init.value);
+  REQUIRE(output == expected);
+}
+
+struct pair
+{
+  short a;
+  size_t b;
+};
+
+TEST_CASE("Reduce works with custom types", "[reduce]")
+{
+  const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24)));
+
+  operation_t op = make_operation(
+    "op",
+    "struct pair { short a; size_t b; };\n"
+    "extern \"C\" __device__ pair op(pair lhs, pair rhs) {\n"
+    "  return pair{ lhs.a + rhs.a, lhs.b + rhs.b };\n"
+    "}");
+  const std::vector<short> a  = generate<short>(num_items);
+  const std::vector<size_t> b = generate<size_t>(num_items);
+  std::vector<pair> input(num_items);
+  for (std::size_t i = 0; i < num_items; ++i)
+  {
+    input[i] = pair{a[i], b[i]};
+  }
+  pointer_t<pair> input_ptr(input);
+  pointer_t<pair> output_ptr(1);
+  value_t<pair> init{pair{4, 2}};
+
+  reduce(input_ptr, output_ptr, num_items, op, init);
+
+  const pair output   = output_ptr[0];
+  const pair expected = std::accumulate(input.begin(), input.end(), init.value, [](const pair& lhs, const pair& rhs) {
+    return pair{short(lhs.a + rhs.a), lhs.b + rhs.b};
+  });
+  REQUIRE(output.a == expected.a);
+  REQUIRE(output.b == expected.b);
+}
+
+struct counting_iterator_state_t
+{
+  int value;
+};
+
+TEST_CASE("Reduce works with input iterators", "[reduce]")
+{
+  const std::size_t num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
+  operation_t op              = make_operation("op", get_op(get_type_info<int>().type));
+  iterator_t<int, counting_iterator_state_t> input_it = make_iterator<int, counting_iterator_state_t>(
+    "struct counting_iterator_state_t { int value; };\n",
+    {"advance",
+     "extern \"C\" __device__ void advance(counting_iterator_state_t* state, unsigned long long offset) {\n"
+     "  state->value += offset;\n"
+     "}"},
+    {"dereference",
+     "extern \"C\" __device__ int dereference(counting_iterator_state_t* state) { \n"
+     "  return state->value;\n"
+     "}"});
+  input_it.state.value = 0;
+  pointer_t<int> output_it(1);
+  value_t<int> init{42};
+
+  reduce(input_it, output_it, num_items, op, init);
+
+  const int output   = output_it[0];
+  const int expected = init.value + num_items * (num_items - 1) / 2;
+  REQUIRE(output == expected);
+}
+
+struct transform_output_iterator_state_t
+{
+  int* d_output;
+};
+
+TEST_CASE("Reduce works with output iterators", "[reduce]")
+{
+  const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
+  operation_t op      = make_operation("op", get_op(get_type_info<int>().type));
+  iterator_t<int, transform_output_iterator_state_t> output_it = make_iterator<int, transform_output_iterator_state_t>(
+    "struct transform_output_iterator_state_t { int* d_output; };\n",
+    {"advance",
+     "extern \"C\" __device__ void advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n"
+     "  state->d_output += offset;\n"
+     "}"},
+    {"dereference",
+     "extern \"C\" __device__ void dereference(transform_output_iterator_state_t* state, int x) { \n"
+     "  *state->d_output = 2 * x;\n"
+     "}"});
+  const std::vector<int> input = generate<int>(num_items);
+  pointer_t<int> input_it(input);
+  pointer_t<int> inner_output_it(1);
+  output_it.state.d_output = inner_output_it.ptr;
+  value_t<int> init{42};
+
+  reduce(input_it, output_it, num_items, op, init);
+
+  const int output   = inner_output_it[0];
+  const int expected = std::accumulate(input.begin(), input.end(), init.value);
+  REQUIRE(output == expected * 2);
+}
+
+template <class T>
+struct constant_iterator_state_t
+{
+  T value;
+};
+
+TEST_CASE("Reduce works with input and output iterators", "[reduce]")
+{
+  const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
+  operation_t op      = make_operation("op", get_op(get_type_info<int>().type));
+  iterator_t<int, constant_iterator_state_t<int>> input_it = make_iterator<int, constant_iterator_state_t<int>>(
+    "struct constant_iterator_state_t { int value; };\n",
+    {"in_advance",
+     "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n"
+     "}"},
+    {"in_dereference",
+     "extern \"C\" __device__ int in_dereference(constant_iterator_state_t* state) { \n"
+     "  return state->value;\n"
+     "}"});
+  input_it.state.value = 1;
+  iterator_t<int, transform_output_iterator_state_t> output_it = make_iterator<int, transform_output_iterator_state_t>(
+    "struct transform_output_iterator_state_t { int* d_output; };\n",
+    {"out_advance",
+     "extern \"C\" __device__ void out_advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n"
+     "  state->d_output += offset;\n"
+     "}"},
+    {"out_dereference",
+     "extern \"C\" __device__ void out_dereference(transform_output_iterator_state_t* state, int x) { \n"
+     "  *state->d_output = 2 * x;\n"
+     "}"});
+  pointer_t<int> inner_output_it(1);
+  output_it.state.d_output = inner_output_it.ptr;
+  value_t<int> init{42};
+
+  reduce(input_it, output_it, num_items, op, init);
+
+  const int output   = inner_output_it[0];
+  const int expected = 2 * (init.value + num_items);
+  REQUIRE(output == expected);
+}
+
+TEST_CASE("Reduce accumulator type is influenced by initial value", "[reduce]")
+{
+  const int num_items = 1 << 14; // 16384 > 128
+  operation_t op      = make_operation("op", get_op(get_type_info<size_t>().type));
+  iterator_t<char, constant_iterator_state_t<char>> input_it = make_iterator<char, constant_iterator_state_t<char>>(
+    "struct constant_iterator_state_t { char value; };\n",
+    {"in_advance",
+     "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n"
+     "}"},
+    {"in_dereference",
+     "extern \"C\" __device__ char in_dereference(constant_iterator_state_t* state) { \n"
+     "  return state->value;\n"
+     "}"});
+  input_it.state.value = 1;
+  pointer_t<size_t> output_it(1);
+  value_t<size_t> init{42};
+
+  reduce(input_it, output_it, num_items, op, init);
+
+  const size_t output = output_it[0];
+  const int expected  = init.value + num_items;
+  REQUIRE(output == expected);
+}
+
+TEST_CASE("Reduce works with large inputs", "[reduce]")
+{
+  const size_t num_items = 1ull << 33;
+  operation_t op         = make_operation("op", get_op(get_type_info<size_t>().type));
+  iterator_t<char, constant_iterator_state_t<char>> input_it = make_iterator<char, constant_iterator_state_t<char>>(
+    "struct constant_iterator_state_t { char value; };\n",
+    {"in_advance",
+     "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n"
+     "}"},
+    {"in_dereference",
+     "extern \"C\" __device__ char in_dereference(constant_iterator_state_t* state) { \n"
+     "  return state->value;\n"
+     "}"});
+  input_it.state.value = 1;
+  pointer_t<size_t> output_it(1);
+  value_t<size_t> init{42};
+
+  reduce(input_it, output_it, num_items, op, init);
+
+  const size_t output   = output_it[0];
+  const size_t expected = init.value + num_items;
+  REQUIRE(output == expected);
+}
+
+struct invocation_counter_state_t
+{
+  int* d_counter;
+};
+
+TEST_CASE("Reduce works with stateful operators", "[reduce]")
+{
+  const int num_items = 1 << 12;
+  pointer_t<int> counter(1);
+  REQUIRE(cudaSuccess == cudaMemset(counter.ptr, 0, sizeof(int))); // zero the invocation counter before use
+  stateful_operation_t<invocation_counter_state_t> op = make_operation(
+    "op",
+    "struct invocation_counter_state_t { int* d_counter; };\n"
+    "extern \"C\" __device__ int op(invocation_counter_state_t *state, int a, int b) {\n"
+    "  atomicAdd(state->d_counter, 1);\n"
+    "  return a + b;\n"
+    "}",
+    invocation_counter_state_t{counter.ptr});
+
+  const std::vector<int> input = generate<int>(num_items);
+  pointer_t<int> input_ptr(input);
+  pointer_t<int> output_ptr(1);
+  value_t<int> init{42};
+
+  reduce(input_ptr, output_ptr, num_items, op, init);
+
+  const int invocation_count          = counter[0];
+  const int expected_invocation_count = num_items - 1;
+  REQUIRE(invocation_count > expected_invocation_count);
+
+  const int output   = output_ptr[0];
+  const int expected = std::accumulate(input.begin(), input.end(), init.value);
+  REQUIRE(output == expected);
+}
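
For orientation, here is a minimal host-side sketch of how the three entry points chain together. It is not part of the patch: the `op_ltoir` buffer is assumed to already hold the reduction operator compiled to LTO-IR (the tests produce it with NVRTC and `-rdc=true -dlto`), and the compute capability and include paths below are placeholders that must match the target machine.

#include <cccl/reduce.h> // compiled with -DCCCL_C_EXPERIMENTAL

#include <cuda_runtime.h>

// Sum num_items floats from d_in into d_out[0].
// Assumes op_ltoir contains: extern "C" __device__ float op(float, float);
CUresult sum_floats(float* d_in, float* d_out, unsigned long long num_items,
                    const char* op_ltoir, int op_ltoir_size)
{
  cccl_op_t op{};
  op.type       = cccl_op_kind_t::stateless;
  op.name       = "op";
  op.ltoir      = op_ltoir;
  op.ltoir_size = op_ltoir_size;

  cccl_type_info f32{sizeof(float), alignof(float), cccl_type_enum::FLOAT32};

  cccl_iterator_t in{}; // pointer-style iterator: state holds the device pointer itself
  in.size       = sizeof(float);
  in.alignment  = alignof(float);
  in.type       = cccl_iterator_kind_t::pointer;
  in.value_type = f32;
  in.state      = d_in;

  cccl_iterator_t out = in;
  out.state           = d_out;

  float zero = 0.0f;
  cccl_value_t init{f32, &zero};

  // 1. Build once per op/iterator/type combination (placeholder cc and paths).
  cccl_device_reduce_build_result_t build{};
  CUresult status = cccl_device_reduce_build(
    &build, in, out, op, init, /*cc_major=*/8, /*cc_minor=*/0,
    "-I/path/to/cccl/cub", "-I/path/to/cccl/thrust",
    "-I/path/to/cccl/libcudacxx/include", "-I/usr/local/cuda/include");
  if (status != CUDA_SUCCESS)
  {
    return status;
  }

  // 2. A null temp-storage pointer turns the call into a size query...
  size_t temp_bytes = 0;
  status = cccl_device_reduce(build, nullptr, &temp_bytes, in, out, num_items, op, init, nullptr);
  if (status == CUDA_SUCCESS)
  {
    // 3. ...and the second call with real storage launches the kernels.
    void* d_temp = nullptr;
    cudaMalloc(&d_temp, temp_bytes);
    status = cccl_device_reduce(build, d_temp, &temp_bytes, in, out, num_items, op, init, nullptr);
    cudaFree(d_temp);
  }

  cccl_device_reduce_cleanup(&build);
  return status;
}

The two-phase temp-storage convention mirrors cub::DeviceReduce::Reduce; the build step is split out so the NVRTC/nvJitLink cost can be paid once and amortized across many launches.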