Skip to content

Commit

Permalink
Integrate CUDASTF -> CudaX (NVIDIA#2572)
Browse files Browse the repository at this point in the history
CUDASTF is an implementation of the Sequential Task Flow model for CUDA.

The availability of parallelism within modern hardware has dramatically
increased, with large nodes now featuring multiple accelerators. As a
result, maximizing concurrency at the application level in a scalable
manner has become a crucial priority. To effectively hide latencies, it
is essential to achieve the highest level of asynchrony possible.

CUDASTF introduces a tasking model that automates data transfers while
enforcing implicit data-driven dependencies.

Implemented as a header-only C++ library, CUDASTF builds on top of CUDA
APIs to simplify the development of multi-GPU applications.

CUDASTF is currently capable of generating parallel applications using
either the CUDA stream API or the CUDA graph API.

---------

Co-authored-by: Cédric Augonnet <[email protected]>
Co-authored-by: Andrei Alexandrescu <[email protected]>
  • Loading branch information
3 people authored and fbusato committed Nov 5, 2024
1 parent ca1e682 commit 7c50458
Show file tree
Hide file tree
Showing 320 changed files with 66,556 additions and 13 deletions.
13 changes: 11 additions & 2 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@
"THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB": true,
"cudax_ENABLE_HEADER_TESTING": true,
"cudax_ENABLE_TESTING": true,
"cudax_ENABLE_EXAMPLES": true,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
"cudax_ENABLE_CUDASTF_DEBUG": false,
"cudax_ENABLE_CUDASTF_MATHLIBS": false,
"cudax_ENABLE_DIALECT_CPP17": true,
"cudax_ENABLE_DIALECT_CPP20": true,
"CCCL_C_Parallel_ENABLE_TESTING": true,
Expand All @@ -73,9 +77,11 @@
"displayName": "all-dev debug",
"inherits": "all-dev",
"cacheVariables": {
"CCCL_ENABLE_BENCHMARKS": false,
"CMAKE_BUILD_TYPE": "Debug",
"CMAKE_CUDA_FLAGS": "-G"
"CMAKE_CUDA_FLAGS": "-G",
"CCCL_ENABLE_BENCHMARKS": false,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": true,
"cudax_ENABLE_CUDASTF_DEBUG": true
}
},
{
Expand Down Expand Up @@ -297,6 +303,9 @@
"CCCL_ENABLE_CUDAX": true,
"cudax_ENABLE_HEADER_TESTING": true,
"cudax_ENABLE_TESTING": true,
"cudax_ENABLE_EXAMPLES": true,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
"cudax_ENABLE_CUDASTF_MATHLIBS": false,
"cudax_ENABLE_DIALECT_CPP17": false,
"cudax_ENABLE_DIALECT_CPP20": false
}
Expand Down
2 changes: 1 addition & 1 deletion ci/windows/build_common.psm1
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ function build_preset {

sccache_stats('Start')

cmake --build --preset $PRESET -v
cmake --build --preset $PRESET -v -- -k 0
$test_result = $LastExitCode

$preset_dir = "${BUILD_DIR}/${PRESET}"
Expand Down
16 changes: 14 additions & 2 deletions cudax/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,19 @@ enable_language(CUDA)

option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers compile." ON)
option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON)
option(cudax_ENABLE_EXAMPLES "Build CUDA Experimental's examples." ON)
option(cudax_ENABLE_CUDASTF_BOUNDSCHECK "Enable bounds checks for STF targets. Requires debug build." OFF)
option(cudax_ENABLE_CUDASTF_DEBUG "Enable additional debugging for STF targets. Requires debug build." OFF)
option(cudax_ENABLE_CUDASTF_MATHLIBS "Enable STF tests/examples that use cublas/cusolver." OFF)

if ((cudax_ENABLE_CUDASTF_BOUNDSCHECK OR cudax_ENABLE_CUDASTF_DEBUG) AND
NOT CMAKE_BUILD_TYPE MATCHES "Debug" AND NOT CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo")
message(FATAL_ERROR "cudax_ENABLE_CUDASTF_BOUNDSCHECK and cudax_ENABLE_CUDASTF_DEBUG require a Debug build.")
endif()

include(cmake/cudaxBuildCompilerTargets.cmake)
include(cmake/cudaxBuildTargetList.cmake)
include(cmake/cudaxSTFConfigureTarget.cmake)

cudax_build_compiler_targets()
cudax_build_target_list()
Expand All @@ -31,7 +41,9 @@ if (cudax_ENABLE_HEADER_TESTING)
endif()

if (cudax_ENABLE_TESTING)
include(CTest)
enable_testing() # Must be in root directory
add_subdirectory(test)
endif()

if (cudax_ENABLE_EXAMPLES)
add_subdirectory(examples)
endif()
20 changes: 20 additions & 0 deletions cudax/cmake/cudaxBuildCompilerTargets.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,26 @@ function(cudax_build_compiler_targets)
# cudax requires dim3 to be usable from a constexpr context, and the CUDART headers require
# __cplusplus to be defined for this to work:
append_option_if_available("/Zc:__cplusplus" cxx_compile_options)

# cudax requires __VA_OPT__ for its unit tests
append_option_if_available("/Zc:preprocessor" cxx_compile_options)

# XXX Temporary hack for STF !
# C4267: conversion from 'meow' to 'purr', possible loss of data
append_option_if_available("/wd4267" cxx_compile_options)

# C4459 : declaration of 'identifier' hides global declaration
# We work around std::chrono::last which hides some internal "last" variable
append_option_if_available("/wd4459" cxx_compile_options)

# stf used getenv which is potentially unsafe but not in our context
list(APPEND cxx_compile_definitions "_CRT_SECURE_NO_WARNINGS")
endif()

if("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
# stf heavily uses host device lambdas which break on clang due to a warning about the implicitly
# deleted copy constructor
append_option_if_available("-Wno-deprecated-copy" cxx_compile_options)
endif()

cccl_build_compiler_interface(cudax.compiler_interface
Expand Down
39 changes: 38 additions & 1 deletion cudax/cmake/cudaxHeaderTesting.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ function(cudax_add_header_test label definitions)
cudax_get_target_property(config_dialect ${cn_target} DIALECT)
cudax_get_target_property(config_prefix ${cn_target} PREFIX)

set(headertest_target ${config_prefix}.headers.${label})
###################
# Non-STF headers #
set(headertest_target ${config_prefix}.headers.${label}.no_stf)
cccl_generate_header_tests(${headertest_target} cudax/include
DIALECT ${config_dialect}
# The cudax header template removes the check for the `small` macro.
Expand All @@ -22,6 +24,9 @@ function(cudax_add_header_test label definitions)
# The following internal headers are not required to compile independently:
"cuda/experimental/__async/prologue.cuh"
"cuda/experimental/__async/epilogue.cuh"
# STF headers are compiled separately:
"cuda/experimental/stf.cuh"
"cuda/experimental/__stf/*"
)
target_link_libraries(${headertest_target} PUBLIC ${cn_target})
target_compile_definitions(${headertest_target} PRIVATE
Expand All @@ -32,6 +37,38 @@ function(cudax_add_header_test label definitions)

add_dependencies(cudax.all.headers ${headertest_target})
add_dependencies(${config_prefix}.all ${headertest_target})

# FIXME: Enable MSVC
if (NOT "MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
###############
# STF headers #
set(headertest_target ${config_prefix}.headers.${label}.stf)
cccl_generate_header_tests(${headertest_target} cudax/include
DIALECT ${config_dialect}
GLOBS
"cuda/experimental/stf.cuh"
"cuda/experimental/__stf/*.cuh"

# FIXME: The cudax header template removes the check for the `small` macro.
# cuda/experimental/__stf/utility/memory.cuh defines functions named `small`.
# These should be renamed to avoid conflicts with windows system headers, and
# the following line removed:
HEADER_TEMPLATE "${cudax_SOURCE_DIR}/cmake/header_test.in.cu"
)
target_link_libraries(${headertest_target} PUBLIC ${cn_target})
target_compile_options(${headertest_target} PRIVATE
# Required by stf headers:
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
# FIXME: We should be able to refactor away from needing this by
# using _CCCL_HOST_DEVICE and friends + `::cuda::std` utilities where
# necessary.
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>
)
cudax_clone_target_properties(${headertest_target} ${cn_target})
endif()

add_dependencies(cudax.all.headers ${headertest_target})
add_dependencies(${config_prefix}.all ${headertest_target})
endforeach()
endfunction()

Expand Down
41 changes: 41 additions & 0 deletions cudax/cmake/cudaxSTFConfigureTarget.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Configures a target for the STF framework.
function(cudax_stf_configure_target target_name)
set(options LINK_MATHLIBS)
set(oneValueArgs)
set(multiValueArgs)
cmake_parse_arguments(CSCT "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

target_link_libraries(${target_name} PRIVATE
${cn_target}
CUDA::cudart
CUDA::curand
CUDA::cuda_driver
)
target_compile_options(${target_name} PRIVATE
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>
)
set_target_properties(${target_name} PROPERTIES
CUDA_RUNTIME_LIBRARY Static
CUDA_SEPARABLE_COMPILATION ON
)

if (CSCT_LINK_MATHLIBS)
target_link_libraries(${target_name} PRIVATE
CUDA::cublas
CUDA::cusolver
)
endif()

if (cudax_ENABLE_CUDASTF_BOUNDSCHECK)
target_compile_definitions(${target_name} PRIVATE
"CUDASTF_BOUNDSCHECK"
)
endif()

if (cudax_ENABLE_CUDASTF_DEBUG)
target_compile_definitions(${target_name} PRIVATE
"CUDASTF_DEBUG"
)
endif()
endfunction()
9 changes: 9 additions & 0 deletions cudax/cmake/stf_header_unittest.in.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// This file is autogenerated by configuring stf_header_unittest.in.cu.

// clang-format off
#define UNITTESTED_FILE "@source@"

#include <cuda/experimental/__stf/utility/unittest.cuh>

#include <@source@>
//clang-format on
14 changes: 14 additions & 0 deletions cudax/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
foreach(cn_target IN LISTS cudax_TARGETS)
cudax_get_target_property(config_prefix ${cn_target} PREFIX)

# Metatarget for the current configuration's tests:
set(config_meta_target ${config_prefix}.examples)
add_custom_target(${config_meta_target})
add_dependencies(${config_prefix}.all ${config_meta_target})
endforeach()

# FIXME: Enable MSVC
if (NOT "MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
# STF tests are handled separately:
add_subdirectory(stf)
endif()
73 changes: 73 additions & 0 deletions cudax/examples/stf/01-axpy-cuda_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
*
* @brief An AXPY kernel described using a cuda_kernel construct
*
*/

#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

__global__ void axpy(double a, slice<const double> x, slice<double> y)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;

for (int i = tid; i < x.size(); i += nthreads)
{
y(i) += a * x(i);
}
}

double X0(int i)
{
return sin((double) i);
}

double Y0(int i)
{
return cos((double) i);
}

int main()
{
context ctx = graph_ctx();
const size_t N = 16;
double X[N], Y[N];

for (size_t i = 0; i < N; i++)
{
X[i] = X0(i);
Y[i] = Y0(i);
}

double alpha = 3.14;

auto lX = ctx.logical_data(X);
auto lY = ctx.logical_data(Y);

/* Compute Y = Y + alpha X */
ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
// axpy<<<16, 128, 0, ...>>>(alpha, dX, dY)
return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY};
};

ctx.finalize();

for (size_t i = 0; i < N; i++)
{
assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001);
assert(fabs(X[i] - X0(i)) < 0.0001);
}
}
80 changes: 80 additions & 0 deletions cudax/examples/stf/01-axpy-cuda_kernel_chain.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
*
* @brief Example of task implementing a chain of CUDA kernels
*
*/

#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

__global__ void axpy(double a, slice<const double> x, slice<double> y)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;

for (int i = tid; i < x.size(); i += nthreads)
{
y(i) += a * x(i);
}
}

double X0(int i)
{
return sin((double) i);
}

double Y0(int i)
{
return cos((double) i);
}

int main()
{
context ctx = graph_ctx();
const size_t N = 16;
double X[N], Y[N];

for (size_t i = 0; i < N; i++)
{
X[i] = X0(i);
Y[i] = Y0(i);
}

double alpha = 3.14;
double beta = 4.5;
double gamma = -4.1;

auto lX = ctx.logical_data(X);
auto lY = ctx.logical_data(Y);

/* Compute Y = Y + alpha X, Y = Y + beta X and then Y = Y + gamma X */
ctx.cuda_kernel_chain(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
// clang-format off
return std::vector<cuda_kernel_desc> {
{ axpy, 16, 128, 0, alpha, dX, dY },
{ axpy, 16, 128, 0, beta, dX, dY },
{ axpy, 16, 128, 0, gamma, dX, dY }
};
// clang-format on
};

ctx.finalize();

for (size_t i = 0; i < N; i++)
{
assert(fabs(Y[i] - (Y0(i) + (alpha + beta + gamma) * X0(i))) < 0.0001);
assert(fabs(X[i] - X0(i)) < 0.0001);
}
}
Loading

0 comments on commit 7c50458

Please sign in to comment.