forked from NVIDIA/cccl
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Integrate CUDASTF -> CudaX (NVIDIA#2572)
CUDASTF is an implementation of the Sequential Task Flow model for CUDA. The availability of parallelism within modern hardware has dramatically increased, with large nodes now featuring multiple accelerators. As a result, maximizing concurrency at the application level in a scalable manner has become a crucial priority. To effectively hide latencies, it is essential to achieve the highest level of asynchrony possible. CUDASTF introduces a tasking model that automates data transfers while enforcing implicit data-driven dependencies. Implemented as a header-only C++ library, CUDASTF builds on top of CUDA APIs to simplify the development of multi-GPU applications. CUDASTF is currently capable of generating parallel applications using either the CUDA stream API or the CUDA graph API. --------- Co-authored-by: Cédric Augonnet <[email protected]> Co-authored-by: Andrei Alexandrescu <[email protected]>
- Loading branch information
Showing
320 changed files
with
66,556 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Configures a target for the STF framework.
#
# Usage:
#   cudax_stf_configure_target(<target> [LINK_MATHLIBS])
#
# Links the CUDA runtime/driver/curand libraries, enables the nvcc flags the
# CUDASTF headers need (extended lambdas, relaxed constexpr), and applies the
# CUDASTF_BOUNDSCHECK / CUDASTF_DEBUG defines when the corresponding cudax
# options are enabled. Pass LINK_MATHLIBS to additionally link cuBLAS/cuSOLVER.
function(cudax_stf_configure_target target_name)
  set(options LINK_MATHLIBS)
  set(oneValueArgs)
  set(multiValueArgs)
  cmake_parse_arguments(CSCT "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  # NOTE(review): ${cn_target} is not defined in this function and is not a
  # parameter — it is inherited from the caller's scope (presumably the
  # per-configuration foreach over cudax_TARGETS). Confirm every caller sets
  # it; consider passing it explicitly to make the dependency visible.
  target_link_libraries(${target_name} PRIVATE
    ${cn_target}
    CUDA::cudart
    CUDA::curand
    CUDA::cuda_driver
  )
  # nvcc-only flags (guarded by COMPILE_LANG_AND_ID) required to compile the
  # CUDASTF headers: device-capturing extended lambdas and constexpr functions
  # callable from device code.
  target_compile_options(${target_name} PRIVATE
    $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
    $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>
  )
  set_target_properties(${target_name} PROPERTIES
    CUDA_RUNTIME_LIBRARY Static
    CUDA_SEPARABLE_COMPILATION ON
  )

  # Optional CUDA math libraries, requested via the LINK_MATHLIBS flag.
  if (CSCT_LINK_MATHLIBS)
    target_link_libraries(${target_name} PRIVATE
      CUDA::cublas
      CUDA::cusolver
    )
  endif()

  # Compile-time opt-in: bounds checking of CUDASTF data accesses.
  if (cudax_ENABLE_CUDASTF_BOUNDSCHECK)
    target_compile_definitions(${target_name} PRIVATE
      "CUDASTF_BOUNDSCHECK"
    )
  endif()

  # Compile-time opt-in: extra CUDASTF debugging facilities.
  if (cudax_ENABLE_CUDASTF_DEBUG)
    target_compile_definitions(${target_name} PRIVATE
      "CUDASTF_DEBUG"
    )
  endif()
endfunction()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
// This file is autogenerated by configuring stf_header_unittest.in.cu.

// clang-format off
// Name of the STF header under test, exposed to the unittest framework.
#define UNITTESTED_FILE "@source@"

#include <cuda/experimental/__stf/utility/unittest.cuh>

// Pull in the header being unit tested (@source@ is substituted at configure time).
#include <@source@>
// clang-format on
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Create one ".examples" metatarget per configuration and hook it into that
# configuration's ".all" target so building ".all" builds the examples.
foreach(cn_target IN LISTS cudax_TARGETS)
  cudax_get_target_property(config_prefix ${cn_target} PREFIX)

  # Metatarget for the current configuration's examples:
  set(config_meta_target ${config_prefix}.examples)
  add_custom_target(${config_meta_target})
  add_dependencies(${config_prefix}.all ${config_meta_target})
endforeach()

# FIXME: Enable MSVC
if (NOT "MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
  # STF tests are handled separately:
  add_subdirectory(stf)
endif()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of CUDASTF in CUDA C++ Core Libraries, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
/** | ||
* @file | ||
* | ||
* @brief An AXPY kernel described using a cuda_kernel construct | ||
* | ||
*/ | ||
|
||
#include <cuda/experimental/stf.cuh> | ||
|
||
using namespace cuda::experimental::stf; | ||
|
||
/**
 * @brief AXPY kernel: y(i) += a * x(i) over the whole slice.
 *
 * Uses a grid-stride loop, so any grid/block configuration covers all
 * elements. Indices are size_t to avoid the signed/unsigned comparison with
 * x.size() and index overflow for slices larger than INT_MAX elements; the
 * block index is widened before the multiply so the product cannot wrap in
 * 32-bit arithmetic on very large grids.
 */
__global__ void axpy(double a, slice<const double> x, slice<double> y)
{
  const size_t tid      = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t nthreads = static_cast<size_t>(gridDim.x) * blockDim.x;

  for (size_t i = tid; i < x.size(); i += nthreads)
  {
    y(i) += a * x(i);
  }
}
|
||
// Analytic reference value for element i of X; used both to initialize the
// input and to validate the result after the task graph runs.
double X0(int i)
{
  const double arg = static_cast<double>(i);
  return sin(arg);
}
|
||
// Analytic reference value for element i of Y; used both to initialize the
// input and to validate the result after the task graph runs.
double Y0(int i)
{
  const double arg = static_cast<double>(i);
  return cos(arg);
}
|
||
int main()
{
  // CUDA-graph backed STF context: tasks are recorded and executed via the
  // CUDA graph API rather than launched eagerly on streams.
  context ctx = graph_ctx();
  const size_t N = 16;
  double X[N], Y[N];

  // Initialize host data with known analytic values so the result can be
  // checked exactly after the graph runs.
  for (size_t i = 0; i < N; i++)
  {
    X[i] = X0(i);
    Y[i] = Y0(i);
  }

  double alpha = 3.14;

  // Register the host arrays with the runtime; per the STF model, transfers
  // and dependencies are inferred from each task's access modes.
  auto lX = ctx.logical_data(X);
  auto lY = ctx.logical_data(Y);

  /* Compute Y = Y + alpha X */
  // cuda_kernel describes one kernel launch: the lambda does not launch
  // anything itself, it returns a descriptor (kernel, grid dim, block dim,
  // shared mem bytes, then kernel arguments) that the runtime inserts into
  // the task graph. lX is accessed read-only, lY read-write.
  ctx.cuda_kernel(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
    // axpy<<<16, 128, 0, ...>>>(alpha, dX, dY)
    return cuda_kernel_desc{axpy, 16, 128, 0, alpha, dX, dY};
  };

  // Runs the recorded work; X and Y are readable on the host afterwards
  // (the checks below rely on this).
  ctx.finalize();

  // Validate: Y was updated in place; X must be unchanged.
  for (size_t i = 0; i < N; i++)
  {
    assert(fabs(Y[i] - (Y0(i) + alpha * X0(i))) < 0.0001);
    assert(fabs(X[i] - X0(i)) < 0.0001);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of CUDASTF in CUDA C++ Core Libraries, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
/** | ||
* @file | ||
* | ||
* @brief Example of task implementing a chain of CUDA kernels | ||
* | ||
*/ | ||
|
||
#include <cuda/experimental/stf.cuh> | ||
|
||
using namespace cuda::experimental::stf; | ||
|
||
/**
 * @brief AXPY kernel: y(i) += a * x(i) over the whole slice.
 *
 * Uses a grid-stride loop, so any grid/block configuration covers all
 * elements. Indices are size_t to avoid the signed/unsigned comparison with
 * x.size() and index overflow for slices larger than INT_MAX elements; the
 * block index is widened before the multiply so the product cannot wrap in
 * 32-bit arithmetic on very large grids.
 */
__global__ void axpy(double a, slice<const double> x, slice<double> y)
{
  const size_t tid      = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t nthreads = static_cast<size_t>(gridDim.x) * blockDim.x;

  for (size_t i = tid; i < x.size(); i += nthreads)
  {
    y(i) += a * x(i);
  }
}
|
||
// Analytic reference value for element i of X, reproducible on host for the
// post-run validation.
double X0(int i)
{
  return sin(static_cast<double>(i));
}
|
||
// Analytic reference value for element i of Y, reproducible on host for the
// post-run validation.
double Y0(int i)
{
  return cos(static_cast<double>(i));
}
|
||
int main()
{
  // CUDA-graph backed STF context: tasks are recorded and executed via the
  // CUDA graph API rather than launched eagerly on streams.
  context ctx = graph_ctx();
  const size_t N = 16;
  double X[N], Y[N];

  // Initialize host data with known analytic values so the result can be
  // checked exactly after the graph runs.
  for (size_t i = 0; i < N; i++)
  {
    X[i] = X0(i);
    Y[i] = Y0(i);
  }

  // Three distinct scaling factors, one per kernel in the chain.
  double alpha = 3.14;
  double beta  = 4.5;
  double gamma = -4.1;

  // Register the host arrays with the runtime; per the STF model, transfers
  // and dependencies are inferred from each task's access modes.
  auto lX = ctx.logical_data(X);
  auto lY = ctx.logical_data(Y);

  /* Compute Y = Y + alpha X, Y = Y + beta X and then Y = Y + gamma X */
  // cuda_kernel_chain describes several kernel launches within a single task:
  // the lambda returns a vector of descriptors (kernel, grid dim, block dim,
  // shared mem bytes, then kernel arguments). The final assertion below
  // checks the cumulative (alpha + beta + gamma) update, so the three
  // launches all apply to the same lY data.
  ctx.cuda_kernel_chain(lX.read(), lY.rw())->*[&](auto dX, auto dY) {
    // clang-format off
    return std::vector<cuda_kernel_desc> {
      { axpy, 16, 128, 0, alpha, dX, dY },
      { axpy, 16, 128, 0, beta,  dX, dY },
      { axpy, 16, 128, 0, gamma, dX, dY }
    };
    // clang-format on
  };

  // Runs the recorded work; X and Y are readable on the host afterwards
  // (the checks below rely on this).
  ctx.finalize();

  // Validate: Y accumulated all three updates; X must be unchanged.
  for (size_t i = 0; i < N; i++)
  {
    assert(fabs(Y[i] - (Y0(i) + (alpha + beta + gamma) * X0(i))) < 0.0001);
    assert(fabs(X[i] - X0(i)) < 0.0001);
  }
}
Oops, something went wrong.