From 82d74afed1c24ec0a1933d138adc70a2b55971b7 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Tue, 25 Jun 2024 13:34:44 -0400
Subject: [PATCH] Sanitizer fixes (#1916)

* Add [skip-rapids] option.

* Remove dead skip-tests logic.

* Fix memory leak in CUB's radix sort custom test.

* Fix memory leak in CUB's test_util_device

* Fix memory leak in catch2_test_vsmem.

* Fix #1895; Memory leak in device_select_api test.

* Fix #1897; memory leak in test_block_radix_rank.cu

* Fix #1896; memory leak in CUB's NVRTC test
---
 .github/actions/workflow-build/action.yml     |  6 ------
 .../workflows/ci-workflow-pull-request.yml    |  2 +-
 .../catch2_test_device_radix_sort_custom.cu   |  3 ---
 cub/test/catch2_test_device_select_api.cu     | 21 ++++++++++++-------
 cub/test/catch2_test_nvrtc.cu                 |  1 +
 cub/test/catch2_test_util_device.cu           | 15 ++++++++-----
 cub/test/catch2_test_vsmem.cu                 |  2 ++
 cub/test/test_block_radix_rank.cu             |  2 ++
 8 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/.github/actions/workflow-build/action.yml b/.github/actions/workflow-build/action.yml
index 96a04a7c6c..57a9498952 100644
--- a/.github/actions/workflow-build/action.yml
+++ b/.github/actions/workflow-build/action.yml
@@ -9,10 +9,6 @@ inputs:
     description: "If true, the requested `workflows` will be ignored when a non-empty 'override' workflow exists in the matrix file."
     default: "false"
     required: false
-  skip_tests:
-    description: "Skip running tests"
-    default: "false"
-    required: false
   inspect_changes_script:
     description: "If defined, run this script to determine which projects/deps need to be tested."
     default: ""
@@ -57,7 +53,6 @@ runs:
       shell: bash --noprofile --norc -euo pipefail {0}
       env:
         allow_override: ${{ inputs.allow_override == 'true' && '--allow-override' || ''}}
-        skip_tests: ${{ inputs.skip_tests == 'true' && '--skip-tests' || ''}}
         dirty_projects_flag: ${{ inputs.inspect_changes_script != '' && '--dirty-projects' || ''}}
         dirty_projects: ${{ inputs.inspect_changes_script != '' && steps.inspect-changes.outputs.dirty_projects || ''}}
         matrix_parser: ${{ inputs.matrix_parser && inputs.matrix_parser || '${GITHUB_ACTION_PATH}/build-workflow.py' }}
@@ -67,7 +62,6 @@ runs:
         ${{ env.matrix_parser }} ${{ inputs.matrix_file }}           \
             --workflows ${{ inputs.workflows }}                      \
             ${{ env.allow_override }}                                \
-            ${{ env.skip_tests }}                                    \
             ${{ env.dirty_projects_flag }} ${{ env.dirty_projects }}
 
         echo "::group::Workflow"
diff --git a/.github/workflows/ci-workflow-pull-request.yml b/.github/workflows/ci-workflow-pull-request.yml
index e329d60ee4..06cd639164 100644
--- a/.github/workflows/ci-workflow-pull-request.yml
+++ b/.github/workflows/ci-workflow-pull-request.yml
@@ -62,7 +62,6 @@ jobs:
           nightly_workflow: ${{ contains(github.event.head_commit.message, '[workflow:nightly]') && 'nightly' || '' }}
         with:
           allow_override: "true"
-          skip_tests: ${{ toJSON(contains(github.event.head_commit.message, '[skip-tests]')) }}
           inspect_changes_script: ${{ toJSON(!contains(github.event.head_commit.message, '[all-projects]') && 'ci/inspect_changes.sh' || '') }}
           inspect_changes_base_sha: ${{ steps.export-pr-info.outputs.base_sha }}
           workflows: >-
@@ -193,6 +192,7 @@ jobs:
 
   build-rapids:
     name: Build RAPIDS
+    if: ${{ !contains(github.event.head_commit.message, '[skip-rapids]') }}
     secrets: inherit
     permissions:
       actions: read
diff --git a/cub/test/catch2_test_device_radix_sort_custom.cu b/cub/test/catch2_test_device_radix_sort_custom.cu
index 4ffc5c008f..9f4a190c78 100644
--- a/cub/test/catch2_test_device_radix_sort_custom.cu
+++ b/cub/test/catch2_test_device_radix_sort_custom.cu
@@ -452,9 +452,6 @@ CUB_TEST("Device radix sort works with bits of custom i128_t keys (db)", "[radix
   constexpr int max_items = 1 << 18;
   const int num_items     = GENERATE_COPY(take(4, random(max_items / 2, max_items)));
 
-  int* selector = nullptr;
-  cudaMallocHost(&selector, sizeof(int));
-
   c2h::device_vector<key> keys_1(num_items);
   c2h::device_vector<key> keys_2(num_items);
   c2h::gen(CUB_SEED(2), keys_1);
diff --git a/cub/test/catch2_test_device_select_api.cu b/cub/test/catch2_test_device_select_api.cu
index 6c230566f5..18b5ce887c 100644
--- a/cub/test/catch2_test_device_select_api.cu
+++ b/cub/test/catch2_test_device_select_api.cu
@@ -32,6 +32,7 @@
 
 #include <thrust/device_vector.h>
 #include <thrust/equal.h>
+#include <thrust/memory.h>
 
 #include <cstddef>
 
@@ -58,10 +59,9 @@ CUB_TEST("cub::DeviceSelect::FlaggedIf works with int data elements", "[select][
   is_even_t is_even{};
 
   // Determine temporary device storage requirements
-  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
   cub::DeviceSelect::FlaggedIf(
-    d_temp_storage,
+    nullptr,
     temp_storage_bytes,
     d_in.begin(),
     d_flags.begin(),
@@ -71,11 +71,11 @@ CUB_TEST("cub::DeviceSelect::FlaggedIf works with int data elements", "[select][
     is_even);
 
   // Allocate temporary storage
-  cudaMalloc(&d_temp_storage, temp_storage_bytes);
+  c2h::device_vector<char> temp_storage(temp_storage_bytes);
 
   // Run selection
   cub::DeviceSelect::FlaggedIf(
-    d_temp_storage,
+    thrust::raw_pointer_cast(temp_storage.data()),
     temp_storage_bytes,
     d_in.begin(),
     d_flags.begin(),
@@ -102,17 +102,22 @@ CUB_TEST("cub::DeviceSelect::FlaggedIf in-place works with int data elements", "
   is_even_t is_even{};
 
   // Determine temporary device storage requirements
-  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
   cub::DeviceSelect::FlaggedIf(
-    d_temp_storage, temp_storage_bytes, d_data.begin(), d_flags.begin(), d_num_selected_out.data(), num_items, is_even);
+    nullptr, temp_storage_bytes, d_data.begin(), d_flags.begin(), d_num_selected_out.data(), num_items, is_even);
 
   // Allocate temporary storage
-  cudaMalloc(&d_temp_storage, temp_storage_bytes);
+  c2h::device_vector<char> temp_storage(temp_storage_bytes);
 
   // Run selection
   cub::DeviceSelect::FlaggedIf(
-    d_temp_storage, temp_storage_bytes, d_data.begin(), d_flags.begin(), d_num_selected_out.data(), num_items, is_even);
+    thrust::raw_pointer_cast(temp_storage.data()),
+    temp_storage_bytes,
+    d_data.begin(),
+    d_flags.begin(),
+    d_num_selected_out.data(),
+    num_items,
+    is_even);
 
   thrust::device_vector<int> expected{0, 1, 5};
   // example-end segmented-select-flaggedif-inplace
diff --git a/cub/test/catch2_test_nvrtc.cu b/cub/test/catch2_test_nvrtc.cu
index b842f0a2e3..0e1b232ff6 100644
--- a/cub/test/catch2_test_nvrtc.cu
+++ b/cub/test/catch2_test_nvrtc.cu
@@ -294,6 +294,7 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
   }
 
   REQUIRE(CUDA_SUCCESS == cuMemFree(d_ptr));
+  REQUIRE(CUDA_SUCCESS == cuMemFree(d_err));
   REQUIRE(CUDA_SUCCESS == cuModuleUnload(module));
   REQUIRE(CUDA_SUCCESS == cuCtxDestroy(context));
 }
diff --git a/cub/test/catch2_test_util_device.cu b/cub/test/catch2_test_util_device.cu
index 888bcf1d44..c59c076ec5 100644
--- a/cub/test/catch2_test_util_device.cu
+++ b/cub/test/catch2_test_util_device.cu
@@ -64,11 +64,16 @@ CUB_TEST("CUB correctly identifies the ptx version the kernel was compiled for",
   constexpr std::size_t single_item = 1;
   c2h::device_vector<int> cuda_arch(single_item);
 
-  int* ptx_version{};
-  cudaMallocHost(&ptx_version, sizeof(*ptx_version));
-
   // Query the arch the kernel was actually compiled for
-  get_cuda_arch_from_kernel(thrust::raw_pointer_cast(cuda_arch.data()), ptx_version);
+  int ptx_version = [&]() -> int {
+    int* buffer{};
+    cudaMallocHost(&buffer, sizeof(*buffer));
+    get_cuda_arch_from_kernel(thrust::raw_pointer_cast(cuda_arch.data()), buffer);
+    int result = *buffer;
+    cudaFreeHost(buffer);
+    return result;
+  }();
+
   int kernel_cuda_arch = cuda_arch[0];
 
   // Host cub::PtxVersion
@@ -79,6 +84,6 @@ CUB_TEST("CUB correctly identifies the ptx version the kernel was compiled for",
   REQUIRE(0 != kernel_cuda_arch);
 
   // Ensure that the ptx version corresponds to the arch the kernel was compiled for
-  REQUIRE(*ptx_version == kernel_cuda_arch);
+  REQUIRE(ptx_version == kernel_cuda_arch);
   REQUIRE(host_ptx_version == kernel_cuda_arch);
 }
diff --git a/cub/test/catch2_test_vsmem.cu b/cub/test/catch2_test_vsmem.cu
index a4ae61e08c..91069fcab1 100644
--- a/cub/test/catch2_test_vsmem.cu
+++ b/cub/test/catch2_test_vsmem.cu
@@ -474,4 +474,6 @@ CUB_TEST("Virtual shared memory works within algorithms", "[util][vsmem]", type_
     // The virtual shared memory helper pads vsmem to a multiple of a line size, hence the range check
     REQUIRE(launch_config_info->config_vsmem_per_block >= expected_vsmem_per_block);
   }
+
+  cudaFreeHost(launch_config_info);
 }
diff --git a/cub/test/test_block_radix_rank.cu b/cub/test/test_block_radix_rank.cu
index f642696c1f..6d36378882 100644
--- a/cub/test/test_block_radix_rank.cu
+++ b/cub/test/test_block_radix_rank.cu
@@ -337,5 +337,7 @@ int main(int argc, char** argv)
   Test<128>();
   Test<130>();
 
+  g_allocator.FreeAllCached();
+
   return 0;
 }