Integrate CUDASTF -> CudaX (NVIDIA#2572)

CUDASTF is an implementation of the Sequential Task Flow model for CUDA. The availability of parallelism within modern hardware has dramatically increased, with large nodes now featuring multiple accelerators. As a result, maximizing concurrency at the application level in a scalable manner has become a crucial priority. To effectively hide latencies, it is essential to achieve the highest level of asynchrony possible. CUDASTF introduces a tasking model that automates data transfers while enforcing implicit data-driven dependencies. Implemented as a header-only C++ library, CUDASTF builds on top of CUDA APIs to simplify the development of multi-GPU applications. CUDASTF is currently capable of generating parallel applications using either the CUDA stream API or the CUDA graph API. --------- Co-authored-by: Cédric Augonnet <[email protected]> Co-authored-by: Andrei Alexandrescu <[email protected]>
fbusato · Nov 5, 2024 · 7c50458 · 7c50458
1 parent ca1e682
commit 7c50458
Show file tree

Hide file tree

Showing 320 changed files with 66,556 additions and 13 deletions.
diff --git a/CMakePresets.json b/CMakePresets.json
@@ -62,6 +62,10 @@
         "THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB": true,
         "cudax_ENABLE_HEADER_TESTING": true,
         "cudax_ENABLE_TESTING": true,
+        "cudax_ENABLE_EXAMPLES": true,
+        "cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
+        "cudax_ENABLE_CUDASTF_DEBUG": false,
+        "cudax_ENABLE_CUDASTF_MATHLIBS": false,
         "cudax_ENABLE_DIALECT_CPP17": true,
         "cudax_ENABLE_DIALECT_CPP20": true,
         "CCCL_C_Parallel_ENABLE_TESTING": true,
@@ -73,9 +77,11 @@
       "displayName": "all-dev debug",
       "inherits": "all-dev",
       "cacheVariables": {
-        "CCCL_ENABLE_BENCHMARKS": false,
         "CMAKE_BUILD_TYPE": "Debug",
-        "CMAKE_CUDA_FLAGS": "-G"
+        "CMAKE_CUDA_FLAGS": "-G",
+        "CCCL_ENABLE_BENCHMARKS": false,
+        "cudax_ENABLE_CUDASTF_BOUNDSCHECK": true,
+        "cudax_ENABLE_CUDASTF_DEBUG": true
       }
     },
     {
@@ -297,6 +303,9 @@
         "CCCL_ENABLE_CUDAX": true,
         "cudax_ENABLE_HEADER_TESTING": true,
         "cudax_ENABLE_TESTING": true,
+        "cudax_ENABLE_EXAMPLES": true,
+        "cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
+        "cudax_ENABLE_CUDASTF_MATHLIBS": false,
         "cudax_ENABLE_DIALECT_CPP17": false,
         "cudax_ENABLE_DIALECT_CPP20": false
       }

diff --git a/ci/windows/build_common.psm1 b/ci/windows/build_common.psm1
@@ -112,7 +112,7 @@ function build_preset {
 
     sccache_stats('Start')
 
-    cmake --build --preset $PRESET -v
+    cmake --build --preset $PRESET -v -- -k 0
     $test_result = $LastExitCode
 
     $preset_dir = "${BUILD_DIR}/${PRESET}"

diff --git a/cudax/CMakeLists.txt b/cudax/CMakeLists.txt
@@ -19,9 +19,19 @@ enable_language(CUDA)
 
 option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers compile." ON)
 option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON)
+option(cudax_ENABLE_EXAMPLES "Build CUDA Experimental's examples." ON)
+option(cudax_ENABLE_CUDASTF_BOUNDSCHECK "Enable bounds checks for STF targets. Requires debug build." OFF)
+option(cudax_ENABLE_CUDASTF_DEBUG "Enable additional debugging for STF targets. Requires debug build." OFF)
+option(cudax_ENABLE_CUDASTF_MATHLIBS "Enable STF tests/examples that use cublas/cusolver." OFF)
+
+if ((cudax_ENABLE_CUDASTF_BOUNDSCHECK OR cudax_ENABLE_CUDASTF_DEBUG) AND
+    NOT CMAKE_BUILD_TYPE MATCHES "Debug" AND NOT CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo")
+  message(FATAL_ERROR "cudax_ENABLE_CUDASTF_BOUNDSCHECK and cudax_ENABLE_CUDASTF_DEBUG require a Debug build.")
+endif()
 
 include(cmake/cudaxBuildCompilerTargets.cmake)
 include(cmake/cudaxBuildTargetList.cmake)
+include(cmake/cudaxSTFConfigureTarget.cmake)
 
 cudax_build_compiler_targets()
 cudax_build_target_list()
@@ -31,7 +41,9 @@ if (cudax_ENABLE_HEADER_TESTING)
 endif()
 
 if (cudax_ENABLE_TESTING)
-  include(CTest)
-  enable_testing() # Must be in root directory
   add_subdirectory(test)
 endif()
+
+if (cudax_ENABLE_EXAMPLES)
+  add_subdirectory(examples)
+endif()
diff --git a/cudax/cmake/cudaxBuildCompilerTargets.cmake b/cudax/cmake/cudaxBuildCompilerTargets.cmake
@@ -23,6 +23,26 @@ function(cudax_build_compiler_targets)
     # cudax requires dim3 to be usable from a constexpr context, and the CUDART headers require
     # __cplusplus to be defined for this to work:
     append_option_if_available("/Zc:__cplusplus" cxx_compile_options)
+
+    # cudax requires __VA_OPT__ for its unit tests
+    append_option_if_available("/Zc:preprocessor" cxx_compile_options)
+
+    # XXX Temporary hack for STF !
+    # C4267: conversion from 'meow' to 'purr', possible loss of data
+    append_option_if_available("/wd4267" cxx_compile_options)
+
+    # C4459 : declaration of 'identifier' hides global declaration
+    # We work around std::chrono::last which hides some internal "last" variable
+    append_option_if_available("/wd4459" cxx_compile_options)
+
+    # stf used getenv which is potentially unsafe but not in our context
+    list(APPEND cxx_compile_definitions "_CRT_SECURE_NO_WARNINGS")
+  endif()
+
+  if("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    # stf heavily uses host device lambdas which break on clang due to a warning about the implicitly
+    # deleted copy constructor
+    append_option_if_available("-Wno-deprecated-copy" cxx_compile_options)
   endif()
 
   cccl_build_compiler_interface(cudax.compiler_interface

diff --git a/cudax/cmake/cudaxHeaderTesting.cmake b/cudax/cmake/cudaxHeaderTesting.cmake
@@ -12,7 +12,9 @@ function(cudax_add_header_test label definitions)
     cudax_get_target_property(config_dialect ${cn_target} DIALECT)
     cudax_get_target_property(config_prefix ${cn_target} PREFIX)
 
-    set(headertest_target ${config_prefix}.headers.${label})
+    ###################
+    # Non-STF headers #
+    set(headertest_target ${config_prefix}.headers.${label}.no_stf)
     cccl_generate_header_tests(${headertest_target} cudax/include
       DIALECT ${config_dialect}
       # The cudax header template removes the check for the `small` macro.
@@ -22,6 +24,9 @@ function(cudax_add_header_test label definitions)
         # The following internal headers are not required to compile independently:
         "cuda/experimental/__async/prologue.cuh"
         "cuda/experimental/__async/epilogue.cuh"
+        # STF headers are compiled separately:
+        "cuda/experimental/stf.cuh"
+        "cuda/experimental/__stf/*"
     )
     target_link_libraries(${headertest_target} PUBLIC ${cn_target})
     target_compile_definitions(${headertest_target} PRIVATE
@@ -32,6 +37,38 @@ function(cudax_add_header_test label definitions)
 
     add_dependencies(cudax.all.headers ${headertest_target})
     add_dependencies(${config_prefix}.all ${headertest_target})
+
+    # FIXME: Enable MSVC
+    if (NOT "MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+      ###############
+      # STF headers #
+      set(headertest_target ${config_prefix}.headers.${label}.stf)
+      cccl_generate_header_tests(${headertest_target} cudax/include
+        DIALECT ${config_dialect}
+        GLOBS
+          "cuda/experimental/stf.cuh"
+          "cuda/experimental/__stf/*.cuh"
+
+        # FIXME: The cudax header template removes the check for the `small` macro.
+        # cuda/experimental/__stf/utility/memory.cuh defines functions named `small`.
+        # These should be renamed to avoid conflicts with windows system headers, and
+        # the following line removed:
+        HEADER_TEMPLATE "${cudax_SOURCE_DIR}/cmake/header_test.in.cu"
+      )
+      target_link_libraries(${headertest_target} PUBLIC ${cn_target})
+      target_compile_options(${headertest_target} PRIVATE
+        # Required by stf headers:
+        $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
+        # FIXME: We should be able to refactor away from needing this by
+        # using _CCCL_HOST_DEVICE and friends + `::cuda::std` utilities where
+        # necessary.
+        $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>
+      )
+      cudax_clone_target_properties(${headertest_target} ${cn_target})
+    endif()
+
+    add_dependencies(cudax.all.headers ${headertest_target})
+    add_dependencies(${config_prefix}.all ${headertest_target})
   endforeach()
 endfunction()
 

diff --git a/cudax/cmake/cudaxSTFConfigureTarget.cmake b/cudax/cmake/cudaxSTFConfigureTarget.cmake
@@ -0,0 +1,41 @@
+# Configures a target for the STF framework.
+function(cudax_stf_configure_target target_name)
+  set(options LINK_MATHLIBS)
+  set(oneValueArgs)
+  set(multiValueArgs)
+  cmake_parse_arguments(CSCT "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  target_link_libraries(${target_name} PRIVATE
+    ${cn_target}
+    CUDA::cudart
+    CUDA::curand
+    CUDA::cuda_driver
+  )
+  target_compile_options(${target_name} PRIVATE
+    $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
+    $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>
+  )
+  set_target_properties(${target_name} PROPERTIES
+    CUDA_RUNTIME_LIBRARY Static
+    CUDA_SEPARABLE_COMPILATION ON
+  )
+
+  if (CSCT_LINK_MATHLIBS)
+    target_link_libraries(${target_name} PRIVATE
+      CUDA::cublas
+      CUDA::cusolver
+    )
+  endif()
+
+  if (cudax_ENABLE_CUDASTF_BOUNDSCHECK)
+    target_compile_definitions(${target_name} PRIVATE
+      "CUDASTF_BOUNDSCHECK"
+    )
+  endif()
+
+  if (cudax_ENABLE_CUDASTF_DEBUG)
+    target_compile_definitions(${target_name} PRIVATE
+      "CUDASTF_DEBUG"
+    )
+  endif()
+endfunction()
diff --git a/cudax/cmake/stf_header_unittest.in.cu b/cudax/cmake/stf_header_unittest.in.cu
@@ -0,0 +1,9 @@
+// This file is autogenerated by configuring stf_header_unittest.in.cu.
+
+// clang-format off
+#define UNITTESTED_FILE "@source@"
+
+#include <cuda/experimental/__stf/utility/unittest.cuh>
+
+#include <@source@>
+//clang-format on
diff --git a/cudax/examples/CMakeLists.txt b/cudax/examples/CMakeLists.txt
@@ -0,0 +1,14 @@
+foreach(cn_target IN LISTS cudax_TARGETS)
+  cudax_get_target_property(config_prefix ${cn_target} PREFIX)
+
+  # Metatarget for the current configuration's tests:
+  set(config_meta_target ${config_prefix}.examples)
+  add_custom_target(${config_meta_target})
+  add_dependencies(${config_prefix}.all ${config_meta_target})
+endforeach()
+
+# FIXME: Enable MSVC
+if (NOT "MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+  # STF tests are handled separately:
+  add_subdirectory(stf)
+endif()
diff --git a/cudax/examples/stf/01-axpy-cuda_kernel.cu b/cudax/examples/stf/01-axpy-cuda_kernel.cu
@@ -0,0 +1,73 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDASTF in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * @file
+ *
+ * @brief An AXPY kernel described using a cuda_kernel construct
+ *
+ */
+
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+__global__ void axpy(double a, slice<const double> x, slice<double> y)
+{
+  int tid      = blockIdx.x * blockDim.x + threadIdx.x;
+  int nthreads = gridDim.x * blockDim.x;
+
+  for (int i = tid; i < x.size(); i += nthreads)
+  {
+    y(i) += a * x(i);
+  }
+}
+
+double X0(int i)
+{
+  return sin((double) i);
+}
+
+double Y0(int i)
+{
+  return cos((double) i);
+}
+
+int main()
+{
+  context ctx    = graph_ctx();
+  const size_t N = 16;
+  double X[N], Y[N];
+
+  for (size_t i = 0; i < N; i++)
+  {
+    X[i] = X0(i);
+    Y[i] = Y0(i);
+  }
+
+  double alpha = 3.14;
+
+  auto lX = ctx.logical_data(X);
+  auto lY = ctx.logical_data(Y);
+
+  /* Compute Y = Y + alpha X */
+  ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
+    // axpy<<<16, 128, 0, ...>>>(alpha, dX, dY)
+    return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY};
+  };
+
+  ctx.finalize();
+
+  for (size_t i = 0; i < N; i++)
+  {
+    assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001);
+    assert(fabs(X[i] - X0(i)) < 0.0001);
+  }
+}
diff --git a/cudax/examples/stf/01-axpy-cuda_kernel_chain.cu b/cudax/examples/stf/01-axpy-cuda_kernel_chain.cu
@@ -0,0 +1,80 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDASTF in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * @file
+ *
+ * @brief Example of task implementing a chain of CUDA kernels
+ *
+ */
+
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+__global__ void axpy(double a, slice<const double> x, slice<double> y)
+{
+  int tid      = blockIdx.x * blockDim.x + threadIdx.x;
+  int nthreads = gridDim.x * blockDim.x;
+
+  for (int i = tid; i < x.size(); i += nthreads)
+  {
+    y(i) += a * x(i);
+  }
+}
+
+double X0(int i)
+{
+  return sin((double) i);
+}
+
+double Y0(int i)
+{
+  return cos((double) i);
+}
+
+int main()
+{
+  context ctx    = graph_ctx();
+  const size_t N = 16;
+  double X[N], Y[N];
+
+  for (size_t i = 0; i < N; i++)
+  {
+    X[i] = X0(i);
+    Y[i] = Y0(i);
+  }
+
+  double alpha = 3.14;
+  double beta  = 4.5;
+  double gamma = -4.1;
+
+  auto lX = ctx.logical_data(X);
+  auto lY = ctx.logical_data(Y);
+
+  /* Compute Y = Y + alpha X, Y = Y + beta X and then  Y = Y + gamma X */
+  ctx.cuda_kernel_chain(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
+    // clang-format off
+        return std::vector<cuda_kernel_desc> {
+            { axpy, 16, 128, 0, alpha, dX, dY },
+            { axpy, 16, 128, 0, beta, dX, dY },
+            { axpy, 16, 128, 0, gamma, dX, dY }
+        };
+    // clang-format on
+  };
+
+  ctx.finalize();
+
+  for (size_t i = 0; i < N; i++)
+  {
+    assert(fabs(Y[i] - (Y0(i) + (alpha + beta + gamma) * X0(i))) < 0.0001);
+    assert(fabs(X[i] - X0(i)) < 0.0001);
+  }
+}