From 6be81eb472117e5e7b12c3d2d9d3314c32afb12a Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 18 Sep 2023 09:16:06 +0200
Subject: [PATCH] Add NVRTC_SKIP_KERNEL_RUN tag to compile, but skip running
 NVRTC test (#434)

Some kernels using TMA need some host code to run as well. In addition,
they need the device symbols to be accessible from host code. This turns
out to be quite difficult to achieve. Therefore, since the device code
is already executed in the non-nvrtc tests, it is reasonable to allow a
test to skip running device code when NVRTC is enabled. This still tests
whether the code actually compiles.
---
 .../test/std/skip_nvrtc_launch.pass.cpp       | 31 +++++++++++++++++++
 .../utils/nvidia/nvrtc/nvrtc.sh               |  8 +++--
 .../utils/nvidia/nvrtc/post_tail.cu.in        |  7 +++--
 .../utils/nvidia/nvrtc/tail.cu.in             |  5 +--
 4 files changed, 45 insertions(+), 6 deletions(-)
 create mode 100644 libcudacxx/.upstream-tests/test/std/skip_nvrtc_launch.pass.cpp

diff --git a/libcudacxx/.upstream-tests/test/std/skip_nvrtc_launch.pass.cpp b/libcudacxx/.upstream-tests/test/std/skip_nvrtc_launch.pass.cpp
new file mode 100644
index 00000000000..beed5046b38
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/std/skip_nvrtc_launch.pass.cpp
@@ -0,0 +1,31 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: !nvrtc
+// NVRTC_SKIP_KERNEL_RUN // do compile, but do not run under nvrtc
+
+#include <cuda/std/cassert>
+#include <nv/target>
+
+// This is a test of the NVRTC_SKIP_KERNEL_RUN tag that indicates that a test
+// should compiler under NVRTC, but should not be run.
+int main(int, char**)
+{
+  NV_DISPATCH_TARGET(
+        NV_IS_DEVICE, (
+          // Ensure that code fails at runtime when run under NVRTC.
+          assert(false);
+        )
+    );
+
+  return 0;
+}
diff --git a/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/nvrtc.sh b/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/nvrtc.sh
index 66eb9c84d09..f4b63d2cbb0 100755
--- a/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/nvrtc.sh
+++ b/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/nvrtc.sh
@@ -128,6 +128,9 @@ trap finish EXIT
 thread_count=$(cat "${input}" | egrep 'cuda_thread_count = [0-9]+' | egrep -o '[0-9]+' || echo 1)
 shmem_size=$(cat "${input}" | egrep 'cuda_block_shmem_size = [0-9]+' | egrep -o '[0-9]+' || echo 0)
 
+# grep through test to see if running the NVRTC kernel is disabled.
+do_run_kernel=$(cat "${input}" | grep -q NVRTC_SKIP_KERNEL_RUN && echo "false" || echo "true")
+
 if [[ "${#gpu_archs[@]}" -eq 0 ]]
 then
     arch=""
@@ -142,6 +145,7 @@ else
     arch="compute_$(printf "%s\n" "${gpu_archs[@]}" | awk -F_ '{ print $2 }' | sort -un | head -n1)"
 fi
 
+echo "static const bool nvrtc_do_run_kernel = ${do_run_kernel};" >> ${tempfile}
 cat "${nvrtcdir}/head.cu.in" >> "${tempfile}"
 cat "${input}" >> "${tempfile}"
 cat "${nvrtcdir}/middle.cu.in" >> "${tempfile}"
@@ -158,8 +162,8 @@ then
 fi
 echo '        // END SCRIPT GENERATED OPTIONS' >> "${tempfile}"
 cat "${nvrtcdir}/tail.cu.in" >> "${tempfile}"
-echo '        '"${thread_count}, 1, 1," >> "${tempfile}"
-echo '        '"${shmem_size}," >> "${tempfile}"
+echo '            '"${thread_count}, 1, 1," >> "${tempfile}"
+echo '            '"${shmem_size}," >> "${tempfile}"
 cat "${nvrtcdir}/post_tail.cu.in" >> "${tempfile}"
 
 cat "${tempfile}" > ${logdir}/generated_file
diff --git a/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/post_tail.cu.in b/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/post_tail.cu.in
index 507be254ae1..a0f28ebf2df 100644
--- a/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/post_tail.cu.in
+++ b/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/post_tail.cu.in
@@ -1,5 +1,8 @@
-        NULL,
-        NULL, 0));
+            NULL,
+            NULL, 0));
+    } else {
+        printf("Skipped running NVRTC-compiled kernel.\n");
+    }
 
     CUDA_API_CALL(cudaGetLastError());
     CUDA_API_CALL(cudaDeviceSynchronize());
diff --git a/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/tail.cu.in b/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/tail.cu.in
index 1a400d688d4..d3d678114b0 100644
--- a/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/tail.cu.in
+++ b/libcudacxx/.upstream-tests/utils/nvidia/nvrtc/tail.cu.in
@@ -39,5 +39,6 @@
     CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
     CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, code.get(), 0, 0, 0));
     CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "main_kernel"));
-    CUDA_SAFE_CALL(cuLaunchKernel(kernel,
-        1, 1, 1,
+    if (nvrtc_do_run_kernel) {
+        CUDA_SAFE_CALL(cuLaunchKernel(kernel,
+            1, 1, 1,