Add NVRTC_SKIP_KERNEL_RUN tag to compile, but skip running NVRTC test (…

…#434) Some kernels using TMA need some host code to run as well. In addition, they need the device symbols to be accessible from host code. This turns out to be quite difficult to achieve. Therefore, since the device code is already executed in the non-nvrtc tests, it is reasonable to allow a test to skip running device code when NVRTC is enabled. This still tests whether the code actually compiles.
NVIDIA · Sep 18, 2023 · 6be81eb · 6be81eb
1 parent a6054a0
commit 6be81eb
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 6 deletions.
diff --git a/libcudacxx/.upstream-tests/test/std/skip_nvrtc_launch.pass.cpp b/libcudacxx/.upstream-tests/test/std/skip_nvrtc_launch.pass.cpp
@@ -0,0 +1,31 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: !nvrtc
+// NVRTC_SKIP_KERNEL_RUN // do compile, but do not run under nvrtc
+
+#include <cuda/std/cassert>
+#include <nv/target>
+
+// This is a test of the NVRTC_SKIP_KERNEL_RUN tag, which indicates that a test
+// should compile under NVRTC, but should not be run.
+int main(int, char**)
+{
+  NV_DISPATCH_TARGET(
+        NV_IS_DEVICE, (
+          // Device-only unconditional failure: it trips only if the kernel is actually launched, i.e. if the skip tag was not honored.
+          assert(false);
+        )
+    );
+
+  return 0;
+}
diff --git a/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/nvrtc.sh b/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/nvrtc.sh
@@ -128,6 +128,9 @@ trap finish EXIT
 thread_count=$(cat "${input}" | egrep 'cuda_thread_count = [0-9]+' | egrep -o '[0-9]+' || echo 1)
 shmem_size=$(cat "${input}" | egrep 'cuda_block_shmem_size = [0-9]+' | egrep -o '[0-9]+' || echo 0)
 
+# grep through test to see if running the NVRTC kernel is disabled.
+do_run_kernel=$(cat "${input}" | grep -q NVRTC_SKIP_KERNEL_RUN && echo "false" || echo "true")
+
 if [[ "${#gpu_archs[@]}" -eq 0 ]]
 then
     arch=""
@@ -142,6 +145,7 @@ else
     arch="compute_$(printf "%s\n" "${gpu_archs[@]}" | awk -F_ '{ print $2 }' | sort -un | head -n1)"
 fi
 
+echo "static const bool nvrtc_do_run_kernel = ${do_run_kernel};" >> ${tempfile}
 cat "${nvrtcdir}/head.cu.in" >> "${tempfile}"
 cat "${input}" >> "${tempfile}"
 cat "${nvrtcdir}/middle.cu.in" >> "${tempfile}"
@@ -158,8 +162,8 @@ then
 fi
 echo '        // END SCRIPT GENERATED OPTIONS' >> "${tempfile}"
 cat "${nvrtcdir}/tail.cu.in" >> "${tempfile}"
-echo '        '"${thread_count}, 1, 1," >> "${tempfile}"
-echo '        '"${shmem_size}," >> "${tempfile}"
+echo '            '"${thread_count}, 1, 1," >> "${tempfile}"
+echo '            '"${shmem_size}," >> "${tempfile}"
 cat "${nvrtcdir}/post_tail.cu.in" >> "${tempfile}"
 
 cat "${tempfile}" > ${logdir}/generated_file

diff --git a/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/post_tail.cu.in b/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/post_tail.cu.in
@@ -1,5 +1,8 @@
-        NULL,
-        NULL, 0));
+            NULL,
+            NULL, 0));
+    } else {
+        printf("Skipped running NVRTC-compiled kernel.\n");
+    }
 
     CUDA_API_CALL(cudaGetLastError());
     CUDA_API_CALL(cudaDeviceSynchronize());

diff --git a/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/tail.cu.in b/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/tail.cu.in
@@ -39,5 +39,6 @@
     CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
     CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, code.get(), 0, 0, 0));
     CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "main_kernel"));
-    CUDA_SAFE_CALL(cuLaunchKernel(kernel,
-        1, 1, 1,
+    if (nvrtc_do_run_kernel) {
+        CUDA_SAFE_CALL(cuLaunchKernel(kernel,
+            1, 1, 1,