From 5a4881b5001f8103b6e20bfd200a8083f13e07f2 Mon Sep 17 00:00:00 2001
From: Georgii Evtushenko
Date: Wed, 21 Aug 2024 07:45:55 -0700
Subject: [PATCH] Implement C API for device reduction (#2256)

* Implement C device reduce
* Format
* Fix device-specific module loading
* Don't need a context at build step now
* Address review feedback
---
 CMakeLists.txt          |   5 +
 CMakePresets.json       |   1 +
 c/CMakeLists.txt        |  20 +
 c/include/cccl/reduce.h |  59 +++
 c/include/cccl/types.h  |  85 ++++
 c/src/reduce.cu         | 857 ++++++++++++++++++++++++++++++++++++++++
 c/test/CMakeLists.txt   |   9 +
 c/test/c2h.h            | 310 +++++++++++++++
 c/test/test_main.cpp    |  55 +++
 c/test/test_reduce.cpp  | 285 +++++++++++++
 10 files changed, 1686 insertions(+)
 create mode 100644 c/CMakeLists.txt
 create mode 100644 c/include/cccl/reduce.h
 create mode 100644 c/include/cccl/types.h
 create mode 100644 c/src/reduce.cu
 create mode 100644 c/test/CMakeLists.txt
 create mode 100644 c/test/c2h.h
 create mode 100644 c/test/test_main.cpp
 create mode 100644 c/test/test_reduce.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 198727dc5d..015b216a8d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,6 +38,7 @@ option(CCCL_ENABLE_THRUST "Enable the Thrust developer build." ${CCCL_TOPLEVEL_P
 option(CCCL_ENABLE_TESTING "Enable CUDA C++ Core Library tests." ${CCCL_TOPLEVEL_PROJECT})
 option(CCCL_ENABLE_EXAMPLES "Enable CUDA C++ Core Library examples." ${CCCL_TOPLEVEL_PROJECT})
 option(CCCL_ENABLE_BENCHMARKS "Enable CUDA C++ Core Library benchmarks." OFF)
+option(CCCL_ENABLE_C "Enable CUDA C Core Library." OFF)
 
 option(CCCL_ENABLE_UNSTABLE "Enable targets and developer build options for unstable projects." OFF)
 
@@ -77,6 +78,10 @@ if (CCCL_ENABLE_UNSTABLE)
   add_subdirectory(cudax)
 endif()
 
+if (CCCL_ENABLE_C)
+  add_subdirectory(c)
+endif()
+
 if (CCCL_ENABLE_TESTING)
   add_subdirectory(test)
 endif()
diff --git a/CMakePresets.json b/CMakePresets.json
index 61cb88eca8..bd9374778b 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -38,6 +38,7 @@
       "CCCL_ENABLE_TESTING": true,
       "CCCL_ENABLE_EXAMPLES": true,
       "CCCL_ENABLE_BENCHMARKS": true,
+      "CCCL_ENABLE_C": true,
       "LIBCUDACXX_ENABLE_LIBCUDACXX_TESTS": true,
       "CUB_ENABLE_TESTING": true,
       "CUB_ENABLE_EXAMPLES": true,
diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt
new file mode 100644
index 0000000000..3e3783903b
--- /dev/null
+++ b/c/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.30)
+
+project(cccl.c LANGUAGES CUDA CXX)
+
+add_library(cccl.c SHARED src/reduce.cu)
+set_property(TARGET cccl.c PROPERTY POSITION_INDEPENDENT_CODE ON)
+set_property(TARGET cccl.c PROPERTY CXX_STANDARD 20)
+set_property(TARGET cccl.c PROPERTY CUDA_STANDARD 20)
+
+find_package(CUDAToolkit REQUIRED)
+
+# TODO Use static versions of cudart, nvrtc, and nvJitLink
+target_link_libraries(cccl.c PRIVATE CUDA::cudart
+                                     CUDA::nvrtc
+                                     CUDA::nvJitLink
+                                     CUDA::cuda_driver)
+target_compile_definitions(cccl.c PRIVATE NVRTC_GET_TYPE_NAME=1 CCCL_C_EXPERIMENTAL=1)
+target_include_directories(cccl.c PUBLIC "include")
+
+add_subdirectory(test)
diff --git a/c/include/cccl/reduce.h b/c/include/cccl/reduce.h
new file mode 100644
index 0000000000..5047625a85
--- /dev/null
+++ b/c/include/cccl/reduce.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#ifndef CCCL_C_EXPERIMENTAL
+#  warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning."
+#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv
+
+#  include <cuda.h>
+
+#  include <cccl/types.h>
+
+struct cccl_device_reduce_build_result_t
+{
+  int cc;
+  void* cubin;
+  size_t cubin_size;
+  CUlibrary library;
+  CUkernel single_tile_kernel;
+  CUkernel single_tile_second_kernel;
+  CUkernel reduction_kernel;
+};
+
+// TODO return a union of nvtx/cuda/nvrtc errors or a string?
+extern "C" CCCL_C_API CUresult cccl_device_reduce_build(
+  cccl_device_reduce_build_result_t* build,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc_major,
+  int cc_minor,
+  const char* cub_path,
+  const char* thrust_path,
+  const char* libcudacxx_path,
+  const char* ctk_path) noexcept;
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce(
+  cccl_device_reduce_build_result_t build,
+  void* d_temp_storage,
+  size_t* temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  CUstream stream) noexcept;
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr);
+
+#endif // CCCL_C_EXPERIMENTAL
diff --git a/c/include/cccl/types.h b/c/include/cccl/types.h
new file mode 100644
index 0000000000..6b19848de4
--- /dev/null
+++ b/c/include/cccl/types.h
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#ifndef CCCL_C_EXPERIMENTAL
+#  warning "C exposure is experimental and subject to change. Define CCCL_C_EXPERIMENTAL to acknowledge this warning."
+#else // ^^^ !CCCL_C_EXPERIMENTAL ^^^ / vvv CCCL_C_EXPERIMENTAL vvv
+
+#  if defined(_WIN32)
+#    define CCCL_C_API __declspec(dllexport)
+#  else
+#    define CCCL_C_API __attribute__((visibility("default")))
+#  endif
+
+enum class cccl_type_enum
+{
+  INT8    = 0,
+  INT16   = 1,
+  INT32   = 2,
+  INT64   = 3,
+  UINT8   = 4,
+  UINT16  = 5,
+  UINT32  = 6,
+  UINT64  = 7,
+  FLOAT32 = 8,
+  FLOAT64 = 9,
+  STORAGE = 10
+};
+
+struct cccl_type_info
+{
+  int size;
+  int alignment;
+  cccl_type_enum type;
+};
+
+enum class cccl_op_kind_t
+{
+  stateless,
+  stateful
+};
+
+struct cccl_op_t
+{
+  cccl_op_kind_t type;
+  const char* name;
+  const char* ltoir;
+  int ltoir_size;
+  int size;
+  int alignment;
+  void* state;
+};
+
+enum class cccl_iterator_kind_t
+{
+  pointer,
+  iterator
+};
+
+struct cccl_value_t
+{
+  cccl_type_info type;
+  void* state;
+};
+
+struct cccl_iterator_t
+{
+  int size;
+  int alignment;
+  cccl_iterator_kind_t type;
+  cccl_op_t advance;
+  cccl_op_t dereference;
+  cccl_type_info value_type;
+  void* state;
+};
+
+#endif // CCCL_C_EXPERIMENTAL
diff --git a/c/src/reduce.cu b/c/src/reduce.cu
new file mode 100644
index 0000000000..4ddcdf2f1a
--- /dev/null
+++ b/c/src/reduce.cu
@@ -0,0 +1,857 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cub/grid/grid_even_share.cuh>
+#include <cub/util_debug.cuh>
+#include <cub/util_device.cuh>
+
+#include <cuda/std/functional>
+#include <type_traits>
+
+#include <cstring>
+#include <format>
+#include <iostream>
+#include <memory>
+
+#include <cccl/reduce.h>
+#include <nvJitLink.h>
+#include <nvrtc.h>
+
+void check(nvrtcResult result)
+{
+  if (result != NVRTC_SUCCESS)
+  {
+    throw std::runtime_error(std::string("NVRTC error: ") + nvrtcGetErrorString(result));
+  }
+}
+
+void check(CUresult result)
+{
+  if (result != CUDA_SUCCESS)
+  {
+    const char* str = nullptr;
+    cuGetErrorString(result, &str);
+    throw std::runtime_error(std::string("CUDA error: ") + str);
+  }
+}
+
+void check(nvJitLinkResult result)
+{
+  if (result != NVJITLINK_SUCCESS)
+  {
+    throw std::runtime_error(std::string("nvJitLink error: ") + std::to_string(result));
+  }
+}
+
+struct op_wrapper;
+struct device_reduce_policy;
+using TransformOpT = ::cuda::std::__identity;
+using OffsetT      = unsigned long long;
+static_assert(std::is_same_v<std::size_t, OffsetT>, "OffsetT must be size_t");
+
+struct nothing_t
+{};
+
+struct runtime_tuning_policy
+{
+  int block_size;
+  int items_per_thread;
+  int vector_load_length;
+};
+
+struct storage_t;
+struct input_iterator_state_t;
+struct output_iterator_t;
+
+char const* cccl_type_enum_to_string(cccl_type_enum type)
+{
+  switch (type)
+  {
+    case cccl_type_enum::INT8:
+      return "::cuda::std::int8_t";
+    case cccl_type_enum::INT16:
+      return "::cuda::std::int16_t";
+    case cccl_type_enum::INT32:
+      return "::cuda::std::int32_t";
+    case cccl_type_enum::INT64:
+      return "::cuda::std::int64_t";
+    case cccl_type_enum::UINT8:
+      return "::cuda::std::uint8_t";
+    case cccl_type_enum::UINT16:
+      return "::cuda::std::uint16_t";
+    case cccl_type_enum::UINT32:
+      return "::cuda::std::uint32_t";
+    case cccl_type_enum::UINT64:
+      return "::cuda::std::uint64_t";
+    case cccl_type_enum::FLOAT32:
+      return "float";
+    case cccl_type_enum::FLOAT64:
+      return "double";
+    case cccl_type_enum::STORAGE:
+      return "storage_t";
+  }
+  return "unknown";
+}
+
+std::string cccl_type_enum_to_name(cccl_type_enum type, bool is_pointer = false)
+{
+  std::string result;
+
+  if (is_pointer)
+  {
+    switch (type)
+    {
+      case cccl_type_enum::INT8:
+        check(nvrtcGetTypeName<::cuda::std::int8_t*>(&result));
+        break;
+      case cccl_type_enum::INT16:
+        check(nvrtcGetTypeName<::cuda::std::int16_t*>(&result));
+        break;
+      case cccl_type_enum::INT32:
+        check(nvrtcGetTypeName<::cuda::std::int32_t*>(&result));
+        break;
+      case cccl_type_enum::INT64:
+        check(nvrtcGetTypeName<::cuda::std::int64_t*>(&result));
+        break;
+      case cccl_type_enum::UINT8:
+        check(nvrtcGetTypeName<::cuda::std::uint8_t*>(&result));
+        break;
+      case cccl_type_enum::UINT16:
+        check(nvrtcGetTypeName<::cuda::std::uint16_t*>(&result));
+        break;
+      case cccl_type_enum::UINT32:
+        check(nvrtcGetTypeName<::cuda::std::uint32_t*>(&result));
+        break;
+      case cccl_type_enum::UINT64:
+        check(nvrtcGetTypeName<::cuda::std::uint64_t*>(&result));
+        break;
+      case cccl_type_enum::FLOAT32:
+        check(nvrtcGetTypeName<float*>(&result));
+        break;
+      case cccl_type_enum::FLOAT64:
+        check(nvrtcGetTypeName<double*>(&result));
+        break;
+      case cccl_type_enum::STORAGE:
+        check(nvrtcGetTypeName<storage_t*>(&result));
+        break;
+    }
+  }
+  else
+  {
+    switch (type)
+    {
+      case cccl_type_enum::INT8:
+        check(nvrtcGetTypeName<::cuda::std::int8_t>(&result));
+        break;
+      case cccl_type_enum::INT16:
+        check(nvrtcGetTypeName<::cuda::std::int16_t>(&result));
+        break;
+      case cccl_type_enum::INT32:
+        check(nvrtcGetTypeName<::cuda::std::int32_t>(&result));
+        break;
+      case cccl_type_enum::INT64:
+        check(nvrtcGetTypeName<::cuda::std::int64_t>(&result));
+        break;
+      case cccl_type_enum::UINT8:
+        check(nvrtcGetTypeName<::cuda::std::uint8_t>(&result));
+        break;
+      case cccl_type_enum::UINT16:
+        check(nvrtcGetTypeName<::cuda::std::uint16_t>(&result));
+        break;
+      case cccl_type_enum::UINT32:
+        check(nvrtcGetTypeName<::cuda::std::uint32_t>(&result));
+        break;
+      case cccl_type_enum::UINT64:
+        check(nvrtcGetTypeName<::cuda::std::uint64_t>(&result));
+        break;
+      case cccl_type_enum::FLOAT32:
+        check(nvrtcGetTypeName<float>(&result));
+        break;
+      case cccl_type_enum::FLOAT64:
+        check(nvrtcGetTypeName<double>(&result));
+        break;
+      case cccl_type_enum::STORAGE:
+        check(nvrtcGetTypeName<storage_t>(&result));
+        break;
+    }
+  }
+
+  return result;
+}
+
+struct reduce_tuning_t
+{
+  int cc;
+  int block_size;
+  int items_per_thread;
+  int vector_load_length;
+};
+
+template <int N>
+reduce_tuning_t find_tuning(int cc, const reduce_tuning_t (&tunings)[N])
+{
+  for (const reduce_tuning_t& tuning : tunings)
+  {
+    if (cc >= tuning.cc)
+    {
+      return tuning;
+    }
+  }
+
+  return tunings[N - 1];
+}
+
+runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type, cccl_type_info input_type)
+{
+  reduce_tuning_t chain[] = {{60, 256, 16, 4}, {35, 256, 20, 4}};
+
+  auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain);
+
+  // Implement part of MemBoundScaling
+  items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2));
+  block_size       = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
+
+  return {block_size, items_per_thread, vector_load_length};
+}
+
+cccl_type_info get_accumulator_type(cccl_op_t op, cccl_iterator_t input_it, cccl_value_t init)
+{
+  // TODO Should be decltype(op(init, *input_it)) but haven't implemented type arithmetic yet
+  // so switching back to the old accumulator type logic for now
+  return init.type;
+}
+
+cudaError_t InvokeSingleTile(
+  void* d_temp_storage,
+  std::size_t& temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc,
+  CUfunction single_tile_kernel,
+  CUstream stream)
+{
+  const runtime_tuning_policy policy = get_policy(cc, d_in.value_type, d_in.value_type);
+
+  cudaError error = cudaSuccess;
+  do
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      break;
+    }
+
+    nothing_t nothing{};
+    TransformOpT transform_op{};
+    void* op_state = op.type == cccl_op_kind_t::stateless ? &nothing : op.state;
+    void* in_ptr   = d_in.type == cccl_iterator_kind_t::pointer ? &d_in.state : d_in.state;
+    void* out_ptr  = d_out.type == cccl_iterator_kind_t::pointer ? &d_out.state : d_out.state;
+    void* args[]   = {in_ptr, out_ptr, &num_items, op_state, init.state, &transform_op};
+
+    check(cuLaunchKernel(single_tile_kernel, 1, 1, 1, policy.block_size, 1, 1, 0, stream, args, 0));
+
+    // Check for failure to launch
+    error = CubDebug(cudaPeekAtLastError());
+    if (cudaSuccess != error)
+    {
+      break;
+    }
+  } while (0);
+
+  return error;
+}
+
+cudaError_t InvokePasses(
+  void* d_temp_storage,
+  std::size_t& temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc,
+  CUfunction reduce_kernel,
+  CUfunction single_tile_kernel,
+  CUdevice device,
+  CUstream stream)
+{
+  const cccl_type_info accum_t       = get_accumulator_type(op, d_in, init);
+  const runtime_tuning_policy policy = get_policy(cc, accum_t, d_in.value_type);
+
+  cudaError error = cudaSuccess;
+  do
+  {
+    void* in_ptr  = d_in.type == cccl_iterator_kind_t::pointer ? &d_in.state : d_in.state;
+    void* out_ptr = d_out.type == cccl_iterator_kind_t::pointer ? &d_out.state : d_out.state;
+
+    // Get SM count
+    int sm_count;
+    check(cuDeviceGetAttribute(&sm_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+
+    // Init regular kernel configuration
+    const auto tile_size = policy.block_size * policy.items_per_thread;
+
+    int sm_occupancy = 1;
+    check(cuOccupancyMaxActiveBlocksPerMultiprocessor(&sm_occupancy, reduce_kernel, policy.block_size, 0));
+
+    int reduce_device_occupancy = sm_occupancy * sm_count;
+
+    // Even-share work distribution
+    int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(0);
+    cub::GridEvenShare<OffsetT> even_share;
+    even_share.DispatchInit(num_items, max_blocks, tile_size);
+
+    // Temporary storage allocation requirements
+    void* allocations[1]       = {};
+    size_t allocation_sizes[1] = {
+      max_blocks * static_cast<size_t>(accum_t.size) // bytes needed for privatized block reductions
+    };
+
+    // Alias the temporary allocations from the single storage blob (or
+    // compute the necessary size of the blob)
+    error = CubDebug(cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
+    if (cudaSuccess != error)
+    {
+      break;
+    }
+
+    if (d_temp_storage == nullptr)
+    {
+      // Return if the caller is simply requesting the size of the storage
+      // allocation
+      return cudaSuccess;
+    }
+
+    // Get grid size for device_reduce_sweep_kernel
+    OffsetT reduce_grid_size = even_share.grid_size;
+
+    // Invoke DeviceReduceKernel
+    // reduce_kernel<<<reduce_grid_size, policy.block_size, 0, stream>>>(
+    //   d_in, d_block_reductions, num_items, even_share, ReductionOpT{}, TransformOpT{});
+
+    nothing_t nothing{};
+    void* op_state = op.type == cccl_op_kind_t::stateless ? &nothing : op.state;
+
+    TransformOpT transform_op{};
+    void* reduce_args[] = {in_ptr, &allocations[0], &num_items, &even_share, op_state, &transform_op};
+
+    check(cuLaunchKernel(reduce_kernel, reduce_grid_size, 1, 1, policy.block_size, 1, 1, 0, stream, reduce_args, 0));
+
+    // Check for failure to launch
+    error = CubDebug(cudaPeekAtLastError());
+    if (cudaSuccess != error)
+    {
+      break;
+    }
+
+    // single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS>>>(
+    //   d_block_reductions, d_out, reduce_grid_size, ReductionOpT{}, 0, TransformOpT{});
+
+    void* single_tile_kernel_args[] = {&allocations[0], out_ptr, &reduce_grid_size, op_state, init.state, &transform_op};
+
+    check(cuLaunchKernel(single_tile_kernel, 1, 1, 1, policy.block_size, 1, 1, 0, stream, single_tile_kernel_args, 0));
+
+    // Check for failure to launch
+    error = CubDebug(cudaPeekAtLastError());
+    if (cudaSuccess != error)
+    {
+      break;
+    }
+  } while (0);
+
+  return error;
+}
+
+cudaError_t Invoke(
+  void* d_temp_storage,
+  std::size_t& temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  OffsetT num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc,
+  CUfunction single_tile_kernel,
+  CUfunction single_tile_second_kernel,
+  CUfunction reduce_kernel,
+  CUdevice device,
+  CUstream stream)
+{
+  const cccl_type_info accum_t = get_accumulator_type(op, d_in, init);
+  runtime_tuning_policy policy = get_policy(cc, accum_t, d_in.value_type);
+
+  // Force kernel code-generation in all compiler passes
+  if (num_items <= (policy.block_size * policy.items_per_thread))
+  {
+    // Small, single tile size
+    return InvokeSingleTile(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, op, init, cc, single_tile_kernel, stream);
+  }
+  else
+  {
+    // Multi-tile pass
+    return InvokePasses(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_in,
+      d_out,
+      num_items,
+      op,
+      init,
+      cc,
+      reduce_kernel,
+      single_tile_second_kernel,
+      device,
+      stream);
+  }
+}
+
+std::string get_input_iterator_name()
+{
+  std::string iterator_t;
+  check(nvrtcGetTypeName<input_iterator_state_t>(&iterator_t));
+  return iterator_t;
+}
+
+std::string get_output_iterator_name()
+{
+  std::string iterator_t;
+  check(nvrtcGetTypeName<output_iterator_t>(&iterator_t));
+  return iterator_t;
+}
+
+std::string get_single_tile_kernel_name(
+  cccl_iterator_t input_it, cccl_iterator_t output_it, cccl_op_t op, cccl_value_t init, bool is_second_kernel)
+{
+  std::string chained_policy_t;
+  check(nvrtcGetTypeName<device_reduce_policy>(&chained_policy_t));
+
+  const cccl_type_info accum_t  = get_accumulator_type(op, input_it, init);
+  const std::string accum_cpp_t = cccl_type_enum_to_name(accum_t.type);
+  const std::string input_iterator_t =
+    is_second_kernel ? cccl_type_enum_to_name(accum_t.type, true)
+    : input_it.type == cccl_iterator_kind_t::pointer //
+      ? cccl_type_enum_to_name(input_it.value_type.type, true) //
+      : get_input_iterator_name();
+  const std::string output_iterator_t =
+    output_it.type == cccl_iterator_kind_t::pointer //
+      ? cccl_type_enum_to_name(output_it.value_type.type, true) //
+      : get_output_iterator_name();
+  const std::string init_t = cccl_type_enum_to_name(init.type.type);
+
+  std::string offset_t;
+  check(nvrtcGetTypeName<OffsetT>(&offset_t));
+
+  std::string reduction_op_t;
+  check(nvrtcGetTypeName<op_wrapper>(&reduction_op_t));
+
+  return std::format(
+    "cub::DeviceReduceSingleTileKernel<{0}, {1}, {2}, {3}, {4}, {5}, {6}>",
+    chained_policy_t,
+    input_iterator_t,
+    output_iterator_t,
+    offset_t,
+    reduction_op_t,
+    init_t,
+    accum_cpp_t);
+}
+
+std::string get_device_reduce_kernel_name(cccl_op_t op, cccl_iterator_t input_it, cccl_value_t init)
+{
+  std::string chained_policy_t;
+  check(nvrtcGetTypeName<device_reduce_policy>(&chained_policy_t));
+
+  const std::string input_iterator_t =
+    input_it.type == cccl_iterator_kind_t::pointer //
+      ? cccl_type_enum_to_name(input_it.value_type.type, true) //
+      : get_input_iterator_name();
+
+  const std::string accum_t = cccl_type_enum_to_name(get_accumulator_type(op, input_it, init).type);
+
+  std::string offset_t;
+  check(nvrtcGetTypeName<OffsetT>(&offset_t));
+
+  std::string reduction_op_t;
+  check(nvrtcGetTypeName<op_wrapper>(&reduction_op_t));
+
+  std::string transform_op_t;
+  check(nvrtcGetTypeName<TransformOpT>(&transform_op_t));
+
+  return std::format(
+    "cub::DeviceReduceKernel<{0}, {1}, {2}, {3}, {4}, {5}>",
+    chained_policy_t,
+    input_iterator_t,
+    offset_t,
+    reduction_op_t,
+    accum_t,
+    transform_op_t);
+}
+
+bool try_push_context()
+{
+  CUcontext context = nullptr;
+
+  check(cuCtxGetCurrent(&context));
+
+  if (context == nullptr)
+  {
+    const int default_device = 0;
+    check(cuDevicePrimaryCtxRetain(&context, default_device));
+    check(cuCtxPushCurrent(context));
+
+    return true;
+  }
+
+  return false;
+}
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce_build(
+  cccl_device_reduce_build_result_t* build,
+  cccl_iterator_t input_it,
+  cccl_iterator_t output_it,
+  cccl_op_t op,
+  cccl_value_t init,
+  int cc_major,
+  int cc_minor,
+  const char* cub_path,
+  const char* thrust_path,
+  const char* libcudacxx_path,
+  const char* ctk_path) noexcept
+{
+  CUresult error = CUDA_SUCCESS;
+
+  try
+  {
+    nvrtcProgram prog{};
+    const char* name = "test";
+
+    const int cc                       = cc_major * 10 + cc_minor;
+    const cccl_type_info accum_t       = get_accumulator_type(op, input_it, init);
+    const std::string accum_cpp        = cccl_type_enum_to_string(accum_t.type);
+    const runtime_tuning_policy policy = get_policy(cc, accum_t, input_it.value_type);
+    const std::string input_it_value_t = cccl_type_enum_to_string(input_it.value_type.type);
+    const std::string offset_t        = cccl_type_enum_to_string(cccl_type_enum::UINT64);
+
+    const std::string input_iterator_src =
+      input_it.type == cccl_iterator_kind_t::pointer
+        ? std::string{}
+        : std::format(
+            "extern \"C\" __device__ {3} {4}(const void *self_ptr);\n"
+            "extern \"C\" __device__ void {5}(void *self_ptr, {0} offset);\n"
+            "struct __align__({2}) input_iterator_state_t {{\n"
+            "  using iterator_category = cuda::std::random_access_iterator_tag;\n"
+            "  using value_type = {3};\n"
+            "  using difference_type = {0};\n"
+            "  using pointer = {3}*;\n"
+            "  using reference = {3}&;\n"
+            "  __device__ value_type operator*() const {{ return {4}(this); }}\n"
+            "  __device__ input_iterator_state_t& operator+=(difference_type diff) {{\n"
+            "    {5}(this, diff);\n"
+            "    return *this;\n"
+            "  }}\n"
+            "  __device__ value_type operator[](difference_type diff) const {{\n"
+            "    return *(*this + diff);\n"
+            "  }}\n"
+            "  __device__ input_iterator_state_t operator+(difference_type diff) const {{\n"
+            "    input_iterator_state_t result = *this;\n"
+            "    result += diff;\n"
+            "    return result;\n"
+            "  }}\n"
+            "  char data[{1}];\n"
+            "}};\n",
+            offset_t, // 0
+            input_it.size, // 1
+            input_it.alignment, // 2
+            input_it_value_t, // 3
+            input_it.dereference.name, // 4
+            input_it.advance.name); // 5
+
+    const std::string output_iterator_src =
+      output_it.type == cccl_iterator_kind_t::pointer
+        ? std::string{}
+        : std::format(
+            "extern \"C\" __device__ void {2}(const void *self_ptr, {1} x);\n"
+            "extern \"C\" __device__ void {3}(void *self_ptr, {0} offset);\n"
+            "struct __align__({5}) output_iterator_state_t{{\n"
+            "  char data[{4}];\n"
+            "}};\n"
+            "struct output_iterator_proxy_t {{\n"
+            "  __device__ output_iterator_proxy_t operator=({1} x) {{\n"
+            "    {2}(&state, x);\n"
+            "    return *this;\n"
+            "  }}\n"
+            "  output_iterator_state_t state;\n"
+            "}};\n"
+            "struct output_iterator_t {{\n"
+            "  using iterator_category = cuda::std::random_access_iterator_tag;\n"
+            "  using difference_type = {0};\n"
+            "  using value_type = void;\n"
+            "  using pointer = output_iterator_proxy_t*;\n"
+            "  using reference = output_iterator_proxy_t;\n"
+            "  __device__ output_iterator_proxy_t operator*() const {{ return {{state}}; }}\n"
+            "  __device__ output_iterator_t& operator+=(difference_type diff) {{\n"
+            "    {3}(&state, diff);\n"
+            "    return *this;\n"
+            "  }}\n"
+            "  __device__ output_iterator_proxy_t operator[](difference_type diff) const {{\n"
+            "    output_iterator_t result = *this;\n"
+            "    result += diff;\n"
+            "    return {{ result.state }};\n"
+            "  }}\n"
+            "  __device__ output_iterator_t operator+(difference_type diff) const {{\n"
+            "    output_iterator_t result = *this;\n"
+            "    result += diff;\n"
+            "    return result;\n"
+            "  }}\n"
+            "  output_iterator_state_t state;\n"
+            "}};",
+            offset_t, // 0
+            accum_cpp, // 1
+            output_it.dereference.name, // 2
+            output_it.advance.name, // 3
+            output_it.size, // 4
+            output_it.alignment); // 5
+
+    const std::string op_src =
+      op.type == cccl_op_kind_t::stateless
+        ? std::format(
+            "extern \"C\" __device__ {0} {1}({0} lhs, {0} rhs);\n"
+            "struct op_wrapper {{\n"
+            "  __device__ {0} operator()({0} lhs, {0} rhs) const {{\n"
+            "    return {1}(lhs, rhs);\n"
+            "  }}\n"
+            "}};\n",
+            accum_cpp,
+            op.name)
+        : std::format(
+            "struct __align__({2}) op_state {{\n"
+            "  char data[{3}];\n"
+            "}};"
+            "extern \"C\" __device__ {0} {1}(op_state *state, {0} lhs, {0} rhs);\n"
+            "struct op_wrapper {{\n"
+            "  op_state state;\n"
+            "  __device__ {0} operator()({0} lhs, {0} rhs) {{\n"
+            "    return {1}(&state, lhs, rhs);\n"
+            "  }}\n"
+            "}};\n",
+            accum_cpp,
+            op.name,
+            op.alignment,
+            op.size);
+
+    const std::string src = std::format(
+      "#include <cub/device/dispatch/dispatch_reduce.cuh>\n"
+      "#include <cuda/std/iterator>\n"
+      "struct __align__({1}) storage_t {{\n"
+      "  char data[{0}];\n"
+      "}};\n"
+      "{4}\n"
+      "{5}\n"
+      "struct agent_policy_t {{\n"
+      "  static constexpr int ITEMS_PER_THREAD = {2};\n"
+      "  static constexpr int BLOCK_THREADS = {3};\n"
+      "  static constexpr int VECTOR_LOAD_LENGTH = {7};\n"
+      "  static constexpr cub::BlockReduceAlgorithm BLOCK_ALGORITHM = cub::BLOCK_REDUCE_WARP_REDUCTIONS;\n"
+      "  static constexpr cub::CacheLoadModifier LOAD_MODIFIER = cub::LOAD_LDG;\n"
+      "}};\n"
+      "struct device_reduce_policy {{\n"
+      "  struct ActivePolicy {{\n"
+      "    using ReducePolicy = agent_policy_t;\n"
+      "    using SingleTilePolicy = agent_policy_t;\n"
+      "  }};\n"
+      "}};\n"
+      "{6};\n",
+      input_it.value_type.size, // 0
+      input_it.value_type.alignment, // 1
+      policy.items_per_thread, // 2
+      policy.block_size, // 3
+      input_iterator_src, // 4
+      output_iterator_src, // 5
+      op_src, // 6
+      policy.vector_load_length); // 7
+
+    check(nvrtcCreateProgram(&prog, src.c_str(), name, 0, nullptr, nullptr));
+
+    std::string single_tile_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, false);
+    check(nvrtcAddNameExpression(prog, single_tile_kernel_name.c_str()));
+
+    std::string single_tile_second_kernel_name = get_single_tile_kernel_name(input_it, output_it, op, init, true);
+    check(nvrtcAddNameExpression(prog, single_tile_second_kernel_name.c_str()));
+
+    std::string reduction_kernel_name = get_device_reduce_kernel_name(op, input_it, init);
+    check(nvrtcAddNameExpression(prog, reduction_kernel_name.c_str()));
+
+    const std::string arch = std::format("-arch=sm_{0}{1}", cc_major, cc_minor);
+
+    constexpr int num_args = 7;
+    const char* args[num_args] = {arch.c_str(), cub_path, thrust_path, libcudacxx_path, ctk_path, "-rdc=true", "-dlto"};
+
+    std::size_t log_size{};
+    nvrtcResult compile_result = nvrtcCompileProgram(prog, num_args, args);
+
+    check(nvrtcGetProgramLogSize(prog, &log_size));
+
+    std::unique_ptr<char[]> log{new char[log_size]};
+    check(nvrtcGetProgramLog(prog, log.get()));
+
+    if (log_size > 1)
+    {
+      std::cerr << log.get() << std::endl;
+    }
+
+    const char* single_tile_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, single_tile_kernel_name.c_str(), &single_tile_kernel_lowered_name));
+
+    const char* single_tile_second_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, single_tile_second_kernel_name.c_str(), &single_tile_second_kernel_lowered_name));
+
+    const char* reduction_kernel_lowered_name;
+    check(nvrtcGetLoweredName(prog, reduction_kernel_name.c_str(), &reduction_kernel_lowered_name));
+
+    // Copy lowered names to a std::unique_ptr to ensure they can be used after
+    // the program is destroyed
+
+    std::unique_ptr<char[]> single_tile_kernel_lowered_name_ptr{new char[strlen(single_tile_kernel_lowered_name) + 1]};
+    strcpy(single_tile_kernel_lowered_name_ptr.get(), single_tile_kernel_lowered_name);
+
+    std::unique_ptr<char[]> single_tile_second_kernel_lowered_name_ptr{
+      new char[strlen(single_tile_second_kernel_lowered_name) + 1]};
+    strcpy(single_tile_second_kernel_lowered_name_ptr.get(), single_tile_second_kernel_lowered_name);
+
+    std::unique_ptr<char[]> reduction_kernel_lowered_name_ptr{new char[strlen(reduction_kernel_lowered_name) + 1]};
+    strcpy(reduction_kernel_lowered_name_ptr.get(), reduction_kernel_lowered_name);
+
+    check(compile_result);
+
+    std::size_t ltoir_size{};
+    check(nvrtcGetLTOIRSize(prog, &ltoir_size));
+    std::unique_ptr<char[]> ltoir{new char[ltoir_size]};
+    check(nvrtcGetLTOIR(prog, ltoir.get()));
+    check(nvrtcDestroyProgram(&prog));
+
+    nvJitLinkHandle handle;
+    const char* lopts[] = {"-lto", arch.c_str()};
+    check(nvJitLinkCreate(&handle, 2, lopts));
+
+    check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, ltoir.get(), ltoir_size, name));
+    check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, op.ltoir, op.ltoir_size, name));
+
+    if (input_it.type == cccl_iterator_kind_t::iterator)
+    {
+      check(nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, input_it.advance.ltoir, input_it.advance.ltoir_size, name));
+      check(nvJitLinkAddData(
+        handle, NVJITLINK_INPUT_LTOIR, input_it.dereference.ltoir, input_it.dereference.ltoir_size, name));
+    }
+
+    if (output_it.type == cccl_iterator_kind_t::iterator)
+    {
+      check(
+        nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, output_it.advance.ltoir, output_it.advance.ltoir_size, name));
+      check(nvJitLinkAddData(
+        handle, NVJITLINK_INPUT_LTOIR, output_it.dereference.ltoir, output_it.dereference.ltoir_size, name));
+    }
+
+    check(nvJitLinkComplete(handle));
+
+    std::size_t cubin_size{};
+    check(nvJitLinkGetLinkedCubinSize(handle, &cubin_size));
+    std::unique_ptr<char[]> cubin{new char[cubin_size]};
+    check(nvJitLinkGetLinkedCubin(handle, cubin.get()));
+    check(nvJitLinkDestroy(&handle));
+
+    check(cuLibraryLoadData(&build->library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0));
+    check(cuLibraryGetKernel(&build->single_tile_kernel, build->library, single_tile_kernel_lowered_name_ptr.get()));
+    check(cuLibraryGetKernel(
+      &build->single_tile_second_kernel, build->library, single_tile_second_kernel_lowered_name_ptr.get()));
+    check(cuLibraryGetKernel(&build->reduction_kernel, build->library, reduction_kernel_lowered_name_ptr.get()));
+
+    build->cc         = cc;
+    build->cubin      = cubin.release();
+    build->cubin_size = cubin_size;
+  }
+  catch (...)
+  {
+    error = CUDA_ERROR_UNKNOWN;
+  }
+
+  return error;
+}
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce(
+  cccl_device_reduce_build_result_t build,
+  void* d_temp_storage,
+  size_t* temp_storage_bytes,
+  cccl_iterator_t d_in,
+  cccl_iterator_t d_out,
+  unsigned long long num_items,
+  cccl_op_t op,
+  cccl_value_t init,
+  CUstream stream) noexcept
+{
+  bool pushed    = false;
+  CUresult error = CUDA_SUCCESS;
+  try
+  {
+    pushed = try_push_context();
+
+    CUdevice cu_device;
+    check(cuCtxGetDevice(&cu_device));
+
+    Invoke(
+      d_temp_storage,
+      *temp_storage_bytes,
+      d_in,
+      d_out,
+      num_items,
+      op,
+      init,
+      build.cc,
+      (CUfunction) build.single_tile_kernel,
+      (CUfunction) build.single_tile_second_kernel,
+      (CUfunction) build.reduction_kernel,
+      cu_device,
+      stream);
+  }
+  catch (...)
+  {
+    error = CUDA_ERROR_UNKNOWN;
+  }
+
+  if (pushed)
+  {
+    CUcontext dummy;
+    cuCtxPopCurrent(&dummy);
+  }
+
+  return error;
+}
+
+extern "C" CCCL_C_API CUresult cccl_device_reduce_cleanup(cccl_device_reduce_build_result_t* bld_ptr)
+{
+  try
+  {
+    if (bld_ptr == nullptr)
+    {
+      return CUDA_ERROR_INVALID_VALUE;
+    }
+
+    std::unique_ptr<char[]> cubin(reinterpret_cast<char*>(bld_ptr->cubin));
+    check(cuLibraryUnload(bld_ptr->library));
+  }
+  catch (...)
+  {
+    return CUDA_ERROR_UNKNOWN;
+  }
+
+  return CUDA_SUCCESS;
+}
diff --git a/c/test/CMakeLists.txt b/c/test/CMakeLists.txt
new file mode 100644
index 0000000000..6a8599500e
--- /dev/null
+++ b/c/test/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_executable(cccl.c.test.reduce test_reduce.cpp test_main.cpp)
+
+target_link_libraries(cccl.c.test.reduce PRIVATE cccl.c CUDA::cudart CUDA::nvrtc Catch2::Catch2)
+
+target_compile_definitions(cccl.c.test.reduce PRIVATE CCCL_C_EXPERIMENTAL
+                           TEST_CUB_PATH="-I${CCCL_SOURCE_DIR}/cub"
+                           TEST_THRUST_PATH="-I${CCCL_SOURCE_DIR}/thrust"
+                           TEST_LIBCUDACXX_PATH="-I${CCCL_SOURCE_DIR}/libcudacxx/include"
+                           TEST_CTK_PATH="-I${CUDAToolkit_INCLUDE_DIRS}")
diff --git a/c/test/c2h.h b/c/test/c2h.h
new file mode 100644
index 0000000000..e2b26895a8
--- /dev/null
+++ b/c/test/c2h.h
@@ -0,0 +1,310 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <iterator>
+#include <memory>
+#include <random>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <cuda_runtime.h>
+#include <nvrtc.h>
+
+#include <catch2/catch.hpp>
+
+static std::string inspect_sass(const void* cubin, size_t cubin_size)
+{
+  namespace fs = std::filesystem;
+
+  fs::path temp_dir = fs::temp_directory_path();
+
+  fs::path temp_in_filename  = temp_dir / "temp_in_file.cubin";
+  fs::path temp_out_filename = temp_dir / "temp_out_file.sass";
+
+  std::ofstream temp_in_file(temp_in_filename, std::ios::binary);
+  if (!temp_in_file)
+  {
+    throw std::runtime_error("Failed to create temporary file.");
+  }
+
+  temp_in_file.write(static_cast<const char*>(cubin), cubin_size);
+  temp_in_file.close();
+
+  std::string command = "nvdisasm -gi ";
+  command += temp_in_filename;
+  command += " > ";
+  command += temp_out_filename;
+
+  if (std::system(command.c_str()) != 0)
+  {
+    throw std::runtime_error("Failed to execute command.");
+  }
+
+  if (!fs::remove(temp_in_filename))
+  {
+    throw std::runtime_error("Failed to remove temporary file.");
+  }
+
+  std::ifstream temp_out_file(temp_out_filename, std::ios::binary);
+  if (!temp_out_file)
+  {
+    throw std::runtime_error("Failed to open temporary file.");
+  }
+
+  const std::string sass{std::istreambuf_iterator<char>(temp_out_file), std::istreambuf_iterator<char>()};
+  if (!fs::remove(temp_out_filename))
+  {
+    throw std::runtime_error("Failed to remove temporary file.");
+  }
+
+  return sass;
+}
+
+static std::string compile(const std::string& source)
+{
+  // compile source to LTO-IR using nvrtc
+
+  nvrtcProgram prog;
+  REQUIRE(NVRTC_SUCCESS == nvrtcCreateProgram(&prog, source.c_str(), "op.cu", 0, nullptr, nullptr));
+
+  const char* options[] = {"--std=c++17", "-rdc=true", "-dlto"};
+  REQUIRE(NVRTC_SUCCESS == nvrtcCompileProgram(prog, 3, options));
+
+  std::size_t ltoir_size{};
+  REQUIRE(NVRTC_SUCCESS == nvrtcGetLTOIRSize(prog, &ltoir_size));
+
+  std::unique_ptr<char[]> ltoir(new char[ltoir_size]);
+
+  REQUIRE(NVRTC_SUCCESS == nvrtcGetLTOIR(prog, ltoir.get()));
+  REQUIRE(NVRTC_SUCCESS == nvrtcDestroyProgram(&prog));
+
+  return std::string(ltoir.get(), ltoir_size);
+}
+
+template <class T>
+std::vector<T> generate(std::size_t num_items)
+{
+  std::random_device rnd_device;
+  std::mt19937 mersenne_engine{rnd_device()}; // Generates random integers
+  std::uniform_int_distribution<T> dist{T{1}, T{42}};
+  std::vector<T> vec(num_items);
+  std::generate(vec.begin(), vec.end(), [&]() {
+    return dist(mersenne_engine);
+  });
+  return vec;
+}
+
+template <class T>
+cccl_type_info get_type_info()
+{
+  cccl_type_info info;
+  info.size      = sizeof(T);
+  info.alignment = alignof(T);
+
+  if constexpr (std::is_same_v<T, char>)
+  {
+    info.type = cccl_type_enum::INT8;
+  }
+  else if constexpr (std::is_same_v<T, std::int32_t>)
+  {
+    info.type = cccl_type_enum::INT32;
+  }
+  else if constexpr (std::is_same_v<T, std::uint32_t>)
+  {
+    info.type = cccl_type_enum::UINT32;
+  }
+  else if constexpr (std::is_same_v<T, std::int64_t>)
+  {
+    info.type = cccl_type_enum::INT64;
+  }
+  else if constexpr (std::is_same_v<T, std::uint64_t>)
+  {
+    info.type = cccl_type_enum::UINT64;
+  }
+  else if constexpr (!std::is_integral_v<T>)
+  {
+    info.type = cccl_type_enum::STORAGE;
+  }
+  else
+  {
+    static_assert(sizeof(T) == 0, "Unsupported type");
+  }
+
+  return info;
+}
+
+static std::string get_op(cccl_type_enum t)
+{
+  switch (t)
+  {
+    case cccl_type_enum::INT8:
+      return "extern \"C\" __device__ char op(char a, char b) { return a + b; }";
+    case cccl_type_enum::INT32:
+      return "extern \"C\" __device__ int op(int a, int b) { return a + b; }";
+    case cccl_type_enum::UINT32:
+      return "extern \"C\" __device__ unsigned int op(unsigned int a, unsigned int b) { return a + b; }";
+    case cccl_type_enum::INT64:
+      return "extern \"C\" __device__ long long op(long long a, long long b) { return a + b; }";
+    case cccl_type_enum::UINT64:
+      return "extern \"C\" __device__ unsigned long long op(unsigned long long a, unsigned long long b) { "
+             "  return a + b; "
+             "}";
+    default:
+      throw std::runtime_error("Unsupported type");
+  }
+  return "";
+}
+
+template <class T>
+struct pointer_t
+{
+  T* ptr{};
+
+  pointer_t(int num_items)
+  {
+    REQUIRE(cudaSuccess == cudaMalloc(&ptr, num_items * sizeof(T)));
+  }
+
+  pointer_t(const std::vector<T>& vec)
+  {
+    REQUIRE(cudaSuccess == cudaMalloc(&ptr, vec.size() * sizeof(T)));
+    REQUIRE(cudaSuccess == cudaMemcpy(ptr, vec.data(), vec.size() * sizeof(T), cudaMemcpyHostToDevice));
+  }
+
+  ~pointer_t()
+  {
+    if (ptr)
+    {
+      REQUIRE(cudaSuccess == cudaFree(ptr));
+      ptr = nullptr;
+    }
+  }
+
+  T operator[](int i) const
+  {
+    T value{};
+    REQUIRE(cudaSuccess == cudaMemcpy(&value, ptr + i, sizeof(T), cudaMemcpyDeviceToHost));
+    return value;
+  }
+
+  operator cccl_iterator_t()
+  {
+    cccl_iterator_t it;
+    it.size       = sizeof(T);
+    it.alignment  = alignof(T);
+    it.type       = cccl_iterator_kind_t::pointer;
+    it.state      = ptr;
+    it.value_type = get_type_info<T>();
+    return it;
+  }
+};
+
+struct operation_t
+{
+  std::string name;
+  std::string code;
+
+  operator cccl_op_t()
+  {
+    cccl_op_t op;
+    op.type       = cccl_op_kind_t::stateless;
+    op.name       = name.c_str();
+    op.ltoir      = code.c_str();
+    op.ltoir_size = code.size();
+    return op;
+  }
+};
+
+template <class OpT>
+struct stateful_operation_t
+{
+  OpT op_state;
+  std::string name;
+  std::string code;
+
+  operator cccl_op_t()
+  {
+    cccl_op_t op;
+    op.type      = cccl_op_kind_t::stateful;
+    op.size      = sizeof(OpT);
+    op.alignment = alignof(OpT);
+    op.state     = &op_state;
+    op.name      = name.c_str();
+    op.ltoir     = code.c_str();
+    op.ltoir_size = code.size();
+    return op;
+  }
+};
+
+static operation_t make_operation(std::string name, std::string code)
+{
+  return operation_t{name, compile(code)};
+}
+
+template <class OpT>
+static stateful_operation_t<OpT> make_operation(std::string name, std::string code, OpT op)
+{
+  return {op, name, compile(code)};
+}
+
+template <class ValueT, class StateT>
+struct iterator_t
+{
+  StateT state;
+  operation_t advance;
+  operation_t dereference;
+
+  operator cccl_iterator_t()
+  {
+    cccl_iterator_t it;
+    it.size        = sizeof(StateT);
+    it.alignment   = alignof(StateT);
+    it.type        = cccl_iterator_kind_t::iterator;
+    it.advance     = advance;
+    it.dereference = dereference;
+    it.value_type  = get_type_info<ValueT>();
+    it.state       = &state;
+    return it;
+  }
+};
+
+template <class ValueT, class StateT>
+iterator_t<ValueT, StateT> make_iterator(std::string state, operation_t advance, operation_t dereference)
+{
+  iterator_t<ValueT, StateT> it;
+  it.advance     = make_operation(advance.name, state + advance.code);
+  it.dereference = make_operation(dereference.name, state + dereference.code);
+  return it;
+}
+
+template <class T>
+struct value_t
+{
+  T value;
+
+  value_t(T value)
+      : value(value)
+  {}
+
+  operator cccl_value_t()
+  {
+    cccl_value_t v;
+    v.type  = get_type_info<T>();
+    v.state = &value;
+    return v;
+  }
+};
diff --git a/c/test/test_main.cpp b/c/test/test_main.cpp
new file mode 100644
index 0000000000..3e3b4900a5
--- /dev/null
+++ b/c/test/test_main.cpp
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda_runtime.h>
+
+#include <iostream>
+
+#define CATCH_CONFIG_RUNNER
+#include <catch2/catch.hpp>
+
+int device_guard(int device_id)
+{
+  int device_count{};
+  if (cudaGetDeviceCount(&device_count) != cudaSuccess)
+  {
+    std::cerr << "Can't query device count." << std::endl;
+    std::exit(-1);
+  }
+
+  if (device_id >= device_count || device_id < 0)
+  {
+    std::cerr << "Invalid device ID: " << device_id << std::endl;
+    std::exit(-1);
+  }
+
+  return device_id;
+}
+
+int main(int argc, char* argv[])
+{
+  Catch::Session session;
+
+  int device_id{};
+
+  // Build a new parser on top of Catch's
+  using namespace Catch::clara;
+  auto cli = session.cli() | Opt(device_id, "device")["-d"]["--device"]("device id to use");
+  session.cli(cli);
+
+  int returnCode = session.applyCommandLine(argc, argv);
+  if (returnCode != 0)
+  {
+    return returnCode;
+  }
+
+  cudaSetDevice(device_guard(device_id));
+  return session.run(argc, argv);
+}
diff --git a/c/test/test_reduce.cpp b/c/test/test_reduce.cpp
new file mode 100644
index 0000000000..1a4607702a
--- /dev/null
+++ b/c/test/test_reduce.cpp
@@ -0,0 +1,285 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <numeric>
+
+#include "c2h.h"
+
+void reduce(cccl_iterator_t input, cccl_iterator_t output, unsigned long long num_items, cccl_op_t op, cccl_value_t init)
+{
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, 0);
+
+  const int cc_major = deviceProp.major;
+  const int cc_minor = deviceProp.minor;
+
+  const char* cub_path        = TEST_CUB_PATH;
+  const char* thrust_path     = TEST_THRUST_PATH;
+  const char* libcudacxx_path = TEST_LIBCUDACXX_PATH;
+  const char* ctk_path        = TEST_CTK_PATH;
+
+  cccl_device_reduce_build_result_t build;
+  REQUIRE(CUDA_SUCCESS
+          == cccl_device_reduce_build(
+            &build, input, output, op, init, cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path));
+
+  const std::string sass = inspect_sass(build.cubin, build.cubin_size);
+  REQUIRE(sass.find("LDL") == std::string::npos);
+  REQUIRE(sass.find("STL") == std::string::npos);
+
+  size_t temp_storage_bytes = 0;
+  REQUIRE(
+    CUDA_SUCCESS == cccl_device_reduce(build, nullptr, &temp_storage_bytes, input, output, num_items, op, init, 0));
+
+  pointer_t<char> temp_storage(temp_storage_bytes);
+
+  REQUIRE(CUDA_SUCCESS
+          == cccl_device_reduce(build, temp_storage.ptr, &temp_storage_bytes, input, output, num_items, op, init, 0));
+  REQUIRE(CUDA_SUCCESS == cccl_device_reduce_cleanup(&build));
+}
+
+using integral_types = std::tuple<std::int32_t, std::uint32_t, std::int64_t, std::uint64_t>;
+TEMPLATE_LIST_TEST_CASE("Reduce works with integral types", "[reduce]", integral_types)
+{
+  const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24)));
+  operation_t op      = make_operation("op", get_op(get_type_info<TestType>().type));
+  const std::vector<TestType> input = generate<TestType>(num_items);
+  pointer_t<TestType> input_ptr(input);
+  pointer_t<TestType> output_ptr(1);
+  value_t<TestType> init{TestType{42}};
+
+  reduce(input_ptr, output_ptr, num_items, op, init);
+
+  const TestType output   = output_ptr[0];
+  const TestType expected = std::accumulate(input.begin(), input.end(), init.value);
+  REQUIRE(output == expected);
+}
+
+struct pair
+{
+  short a;
+  size_t b;
+};
+
+TEST_CASE("Reduce works with custom types", "[reduce]")
+{
+  const int num_items = GENERATE(0, 42, take(4, random(1 << 12, 1 << 24)));
+
+  operation_t op = make_operation(
+    "op",
+    "struct pair { short a; size_t b; };\n"
+    "extern \"C\" __device__ pair op(pair lhs, pair rhs) {\n"
+    "  return pair{ lhs.a + rhs.a, lhs.b + rhs.b };\n"
+    "}");
+  const std::vector<short> a  = generate<short>(num_items);
+  const std::vector<size_t> b = generate<size_t>(num_items);
+  std::vector<pair> input(num_items);
+  for (std::size_t i = 0; i < num_items; ++i)
+  {
+    input[i] = pair{a[i], b[i]};
+  }
+  pointer_t<pair> input_ptr(input);
+  pointer_t<pair> output_ptr(1);
+  value_t<pair> init{pair{4, 2}};
+
+  reduce(input_ptr, output_ptr, num_items, op, init);
+
+  const pair output   = output_ptr[0];
+  const pair expected = std::accumulate(input.begin(), input.end(), init.value, [](const pair& lhs, const pair& rhs) {
+    return pair{short(lhs.a + rhs.a), lhs.b + rhs.b};
+  });
+  REQUIRE(output.a == expected.a);
+  REQUIRE(output.b == expected.b);
+}
+
+struct counting_iterator_state_t
+{
+  int value;
+};
+
+TEST_CASE("Reduce works with input iterators", "[reduce]")
+{
+  const std::size_t num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
+  operation_t op              = make_operation("op", get_op(get_type_info<int>().type));
+  iterator_t<int, counting_iterator_state_t> input_it = make_iterator<int, counting_iterator_state_t>(
+    "struct counting_iterator_state_t { int value; };\n",
+    {"advance",
+     "extern \"C\" __device__ void advance(counting_iterator_state_t* state, unsigned long long offset) {\n"
+     "  state->value += offset;\n"
+     "}"},
+    {"dereference",
+     "extern \"C\" __device__ int dereference(counting_iterator_state_t* state) { \n"
+     "  return state->value;\n"
+     "}"});
+  input_it.state.value = 0;
+  pointer_t<int> output_it(1);
+  value_t<int> init{42};
+
+  reduce(input_it, output_it, num_items, op, init);
+
+  const int output   = output_it[0];
+  const int expected = init.value + num_items * (num_items - 1) / 2;
+  REQUIRE(output == expected);
+}
+
+struct transform_output_iterator_state_t
+{
+  int* d_output;
+};
+
+TEST_CASE("Reduce works with output iterators", "[reduce]")
+{
+  const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
+  operation_t op      = make_operation("op", get_op(get_type_info<int>().type));
+  iterator_t<int, transform_output_iterator_state_t> output_it = make_iterator<int, transform_output_iterator_state_t>(
+    "struct transform_output_iterator_state_t { int* d_output; };\n",
+    {"advance",
+     "extern \"C\" __device__ void advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n"
+     "  state->d_output += offset;\n"
+     "}"},
+    {"dereference",
+     "extern \"C\" __device__ void dereference(transform_output_iterator_state_t* state, int x) { \n"
+     "  *state->d_output = 2 * x;\n"
+     "}"});
+  const std::vector<int> input = generate<int>(num_items);
+  pointer_t<int> input_it(input);
+  pointer_t<int> inner_output_it(1);
+  output_it.state.d_output = inner_output_it.ptr;
+  value_t<int> init{42};
+
+  reduce(input_it, output_it, num_items, op, init);
+
+  const int output   = inner_output_it[0];
+  const int expected = std::accumulate(input.begin(), input.end(), init.value);
+  REQUIRE(output == expected * 2);
+}
+
+template <class T>
+struct constant_iterator_state_t
+{
+  T value;
+};
+
+TEST_CASE("Reduce works with input and output iterators", "[reduce]")
+{
+  const int num_items = GENERATE(1, 42, take(4, random(1 << 12, 1 << 16)));
+  operation_t op      = make_operation("op", get_op(get_type_info<int>().type));
+  iterator_t<int, constant_iterator_state_t<int>> input_it = make_iterator<int, constant_iterator_state_t<int>>(
+    "struct constant_iterator_state_t { int value; };\n",
+    {"in_advance",
+     "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n"
+     "}"},
+    {"in_dereference",
+     "extern \"C\" __device__ int in_dereference(constant_iterator_state_t* state) { \n"
+     "  return state->value;\n"
+     "}"});
+  input_it.state.value = 1;
+  iterator_t<int, transform_output_iterator_state_t> output_it = make_iterator<int, transform_output_iterator_state_t>(
+    "struct transform_output_iterator_state_t { int* d_output; };\n",
+    {"out_advance",
+     "extern \"C\" __device__ void out_advance(transform_output_iterator_state_t* state, unsigned long long offset) {\n"
+     "  state->d_output += offset;\n"
+     "}"},
+    {"out_dereference",
+     "extern \"C\" __device__ void out_dereference(transform_output_iterator_state_t* state, int x) { \n"
+     "  *state->d_output = 2 * x;\n"
+     "}"});
+  pointer_t<int> inner_output_it(1);
+  output_it.state.d_output = inner_output_it.ptr;
+  value_t<int> init{42};
+
+  reduce(input_it, output_it, num_items, op, init);
+
+  const int output   = inner_output_it[0];
+  const int expected = 2 * (init.value + num_items);
+  REQUIRE(output == expected);
+}
+
+TEST_CASE("Reduce accumulator type is influenced by initial value", "[reduce]")
+{
+  const int num_items = 1 << 14; // 16384 > 128
+  operation_t op      = make_operation("op", get_op(get_type_info<size_t>().type));
+  iterator_t<char, constant_iterator_state_t<char>> input_it = make_iterator<char, constant_iterator_state_t<char>>(
+    "struct constant_iterator_state_t { char value; };\n",
+    {"in_advance",
+     "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n"
+     "}"},
+    {"in_dereference",
+     "extern \"C\" __device__ char in_dereference(constant_iterator_state_t* state) { \n"
+     "  return state->value;\n"
+     "}"});
+  input_it.state.value = 1;
+  pointer_t<size_t> output_it(1);
+  value_t<size_t> init{42};
+
+  reduce(input_it, output_it, num_items, op, init);
+
+  const size_t output = output_it[0];
+  const int expected  = init.value + num_items;
+  REQUIRE(output == expected);
+}
+
+TEST_CASE("Reduce works with large inputs", "[reduce]")
+{
+  const size_t num_items = 1ull << 33;
+  operation_t op         = make_operation("op", get_op(get_type_info<size_t>().type));
+  iterator_t<char, constant_iterator_state_t<char>> input_it = make_iterator<char, constant_iterator_state_t<char>>(
+    "struct constant_iterator_state_t { char value; };\n",
+    {"in_advance",
+     "extern \"C\" __device__ void in_advance(constant_iterator_state_t*, unsigned long long) {\n"
+     "}"},
+    {"in_dereference",
+     "extern \"C\" __device__ char in_dereference(constant_iterator_state_t* state) { \n"
+     "  return state->value;\n"
+     "}"});
+  input_it.state.value = 1;
+  pointer_t<size_t> output_it(1);
+  value_t<size_t> init{42};
+
+  reduce(input_it, output_it, num_items, op, init);
+
+  const size_t output   = output_it[0];
+  const size_t expected = init.value + num_items;
+  REQUIRE(output == expected);
+}
+
+struct invocation_counter_state_t
+{
+  int* d_counter;
+};
+
+TEST_CASE("Reduce works with stateful operators", "[reduce]")
+{
+  const int num_items = 1 << 12;
+  pointer_t<int> counter(1);
+  REQUIRE(cudaSuccess == cudaMemset(counter.ptr, 0, sizeof(int))); // zero the invocation counter before use
+  stateful_operation_t<invocation_counter_state_t> op = make_operation(
+    "op",
+    "struct invocation_counter_state_t { int* d_counter; };\n"
+    "extern \"C\" __device__ int op(invocation_counter_state_t *state, int a, int b) {\n"
+    "  atomicAdd(state->d_counter, 1);\n"
+    "  return a + b;\n"
+    "}",
+    invocation_counter_state_t{counter.ptr});
+
+  const std::vector<int> input = generate<int>(num_items);
+  pointer_t<int> input_ptr(input);
+  pointer_t<int> output_ptr(1);
+  value_t<int> init{42};
+
+  reduce(input_ptr, output_ptr, num_items, op, init);
+
+  const int invocation_count          = counter[0];
+  const int expected_invocation_count = num_items - 1;
+  REQUIRE(invocation_count > expected_invocation_count);
+
+  const int output   = output_ptr[0];
+  const int expected = std::accumulate(input.begin(), input.end(), init.value);
+  REQUIRE(output == expected);
+}
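
For orientation, here is a minimal host-side sketch of how the three entry points chain together. It is not part of the patch: the `op_ltoir` buffer is assumed to already hold the reduction operator compiled to LTO-IR (the tests produce it with NVRTC and `-rdc=true -dlto`), and the compute capability and include paths below are placeholders that must match the target machine.

#include <cccl/reduce.h> // compiled with -DCCCL_C_EXPERIMENTAL

#include <cuda_runtime.h>

// Sum num_items floats from d_in into d_out[0].
// Assumes op_ltoir contains: extern "C" __device__ float op(float, float);
CUresult sum_floats(float* d_in, float* d_out, unsigned long long num_items,
                    const char* op_ltoir, int op_ltoir_size)
{
  cccl_op_t op{};
  op.type       = cccl_op_kind_t::stateless;
  op.name       = "op";
  op.ltoir      = op_ltoir;
  op.ltoir_size = op_ltoir_size;

  cccl_type_info f32{sizeof(float), alignof(float), cccl_type_enum::FLOAT32};

  cccl_iterator_t in{}; // pointer-style iterator: state holds the device pointer itself
  in.size       = sizeof(float);
  in.alignment  = alignof(float);
  in.type       = cccl_iterator_kind_t::pointer;
  in.value_type = f32;
  in.state      = d_in;

  cccl_iterator_t out = in;
  out.state           = d_out;

  float zero = 0.0f;
  cccl_value_t init{f32, &zero};

  // 1. Build once per op/iterator/type combination (placeholder cc and paths).
  cccl_device_reduce_build_result_t build{};
  CUresult status = cccl_device_reduce_build(
    &build, in, out, op, init, /*cc_major=*/8, /*cc_minor=*/0,
    "-I/path/to/cccl/cub", "-I/path/to/cccl/thrust",
    "-I/path/to/cccl/libcudacxx/include", "-I/usr/local/cuda/include");
  if (status != CUDA_SUCCESS)
  {
    return status;
  }

  // 2. A null temp-storage pointer turns the call into a size query...
  size_t temp_bytes = 0;
  status = cccl_device_reduce(build, nullptr, &temp_bytes, in, out, num_items, op, init, nullptr);
  if (status == CUDA_SUCCESS)
  {
    // 3. ...and the second call with real storage launches the kernels.
    void* d_temp = nullptr;
    cudaMalloc(&d_temp, temp_bytes);
    status = cccl_device_reduce(build, d_temp, &temp_bytes, in, out, num_items, op, init, nullptr);
    cudaFree(d_temp);
  }

  cccl_device_reduce_cleanup(&build);
  return status;
}

The two-phase temp-storage convention mirrors cub::DeviceReduce::Reduce; the build step is split out so the NVRTC/nvJitLink cost can be paid once and amortized across many launches.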