From 3bee6788a795ebc22125d8726fbc79dbfb50b0b5 Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Mon, 14 Oct 2024 18:13:29 +0100
Subject: [PATCH 1/6] Made cudftestutil header-only and removed GTest
 dependency (#16839)

This merge request follows up on https://github.com/rapidsai/cudf/issues/16658.
It removes the dependency on GTest by cudftestutil.  It satisfies the requirement that we only need API compatibility with the GTest API and we don't expose the GTest symbols to our consumers nor ship any binary artifact.
The source files defining the symbols are late-binded to the resulting executable (via library INTERFACE sources).
The user has to link to manually link the GTest and GMock libraries to the final executable as illustrated below.

Closes #16658

### Usage

CMakeLists.txt:

```cmake
add_executable(test1 test1.cpp)
target_link_libraries(test1 PRIVATE GTest::gtest GTest::gmock GTest::gtest_main cudf::cudftestutil cudf::cudftestutil_impl)
```

Authors:
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Robert Maynard (https://github.com/robertmaynard)
  - David Wendt (https://github.com/davidwendt)
  - Mike Sarahan (https://github.com/msarahan)

URL: https://github.com/rapidsai/cudf/pull/16839
---
 cpp/CMakeLists.txt                            | 65 +++++++++++-------
 cpp/benchmarks/CMakeLists.txt                 | 25 +++----
 cpp/include/cudf_test/testing_main.hpp        | 67 +++++++++++++------
 cpp/tests/CMakeLists.txt                      | 11 ++-
 cpp/tests/io/metadata_utilities.cpp           |  5 +-
 .../large_strings/large_strings_fixture.cpp   |  9 +--
 cpp/tests/utilities/table_utilities.cu        |  5 +-
 7 files changed, 115 insertions(+), 72 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a0ea9579475..c8f8ae2dcfe 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -863,15 +863,7 @@ if(CUDF_BUILD_TESTUTIL)
 
   add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream)
 
-  add_library(
-    cudftestutil SHARED
-    tests/io/metadata_utilities.cpp
-    tests/utilities/column_utilities.cu
-    tests/utilities/debug_utilities.cu
-    tests/utilities/random_seed.cpp
-    tests/utilities/table_utilities.cu
-    tests/utilities/tdigest_utilities.cu
-  )
+  add_library(cudftestutil INTERFACE)
 
   set_target_properties(
     cudftestutil
@@ -880,32 +872,56 @@ if(CUDF_BUILD_TESTUTIL)
                # set target compile options
                CXX_STANDARD 17
                CXX_STANDARD_REQUIRED ON
-               CXX_VISIBILITY_PRESET hidden
                CUDA_STANDARD 17
                CUDA_STANDARD_REQUIRED ON
-               CUDA_VISIBILITY_PRESET hidden
-               POSITION_INDEPENDENT_CODE ON
-               INTERFACE_POSITION_INDEPENDENT_CODE ON
   )
 
   target_compile_options(
-    cudftestutil PUBLIC "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>>"
-                        "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>>"
+    cudftestutil INTERFACE "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>>"
+                           "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>>"
   )
 
   target_link_libraries(
-    cudftestutil
-    PUBLIC Threads::Threads cudf cudftest_default_stream
-    PRIVATE GTest::gmock GTest::gtest $<TARGET_NAME_IF_EXISTS:conda_env>
+    cudftestutil INTERFACE Threads::Threads cudf cudftest_default_stream
+                           $<TARGET_NAME_IF_EXISTS:conda_env>
   )
 
   target_include_directories(
-    cudftestutil PUBLIC "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>"
-                        "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
+    cudftestutil INTERFACE "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>"
+                           "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
   )
   rapids_cuda_set_runtime(cudftestutil USE_STATIC ${CUDA_STATIC_RUNTIME})
   add_library(cudf::cudftestutil ALIAS cudftestutil)
 
+  add_library(cudftestutil_impl INTERFACE)
+  add_library(cudf::cudftestutil_impl ALIAS cudftestutil_impl)
+  target_sources(
+    cudftestutil_impl
+    INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/tests/io/metadata_utilities.cpp>
+              $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/tests/utilities/column_utilities.cu>
+              $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/tests/utilities/debug_utilities.cu>
+              $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/tests/utilities/random_seed.cpp>
+              $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/tests/utilities/table_utilities.cu>
+              $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/tests/utilities/tdigest_utilities.cu>
+              $<INSTALL_INTERFACE:src/cudftestutil/io/metadata_utilities.cpp>
+              $<INSTALL_INTERFACE:src/cudftestutil/utilities/column_utilities.cu>
+              $<INSTALL_INTERFACE:src/cudftestutil/utilities/debug_utilities.cu>
+              $<INSTALL_INTERFACE:src/cudftestutil/utilities/random_seed.cpp>
+              $<INSTALL_INTERFACE:src/cudftestutil/utilities/table_utilities.cu>
+              $<INSTALL_INTERFACE:src/cudftestutil/utilities/tdigest_utilities.cu>
+  )
+  target_link_libraries(cudftestutil_impl INTERFACE cudf::cudftestutil)
+
+  install(FILES tests/io/metadata_utilities.cpp DESTINATION src/cudftestutil/io)
+  install(
+    FILES tests/utilities/column_utilities.cu
+          tests/utilities/debug_utilities.cu
+          tests/utilities/random_seed.cpp
+          tests/utilities/table_utilities.cu
+          tests/utilities/tdigest_utilities.cu
+    DESTINATION src/cudftestutil/utilities
+  )
+
 endif()
 
 # * build cudf_identify_stream_usage --------------------------------------------------------------
@@ -1006,7 +1022,7 @@ install(
 set(_components_export_string)
 if(TARGET cudftestutil)
   install(
-    TARGETS cudftest_default_stream cudftestutil
+    TARGETS cudftest_default_stream cudftestutil cudftestutil_impl
     DESTINATION ${lib_dir}
     EXPORT cudf-testing-exports
   )
@@ -1046,14 +1062,15 @@ targets:
 This module offers an optional testing component which defines the
 following IMPORTED GLOBAL  targets:
 
- cudf::cudftestutil     - The main cudf testing library
+ cudf::cudftestutil          - The main cudf testing library
+ cudf::cudftestutil_impl     - C++ and CUDA sources to compile for definitions in cudf::cudftestutil
     ]=]
 )
 
 rapids_export(
   INSTALL cudf
   EXPORT_SET cudf-exports ${_components_export_string}
-  GLOBAL_TARGETS cudf cudftestutil
+  GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl
   NAMESPACE cudf::
   DOCUMENTATION doc_string
 )
@@ -1074,7 +1091,7 @@ endif()
 rapids_export(
   BUILD cudf
   EXPORT_SET cudf-exports ${_components_export_string}
-  GLOBAL_TARGETS cudf cudftestutil
+  GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl
   NAMESPACE cudf::
   DOCUMENTATION doc_string
   FINAL_CODE_BLOCK build_code_string
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index b0f75b25975..d6fc5dc6039 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -25,7 +25,7 @@ target_compile_options(
 target_link_libraries(
   cudf_datagen
   PUBLIC GTest::gmock GTest::gtest benchmark::benchmark nvbench::nvbench Threads::Threads cudf
-         cudftestutil nvtx3::nvtx3-cpp
+         cudf::cudftestutil nvtx3::nvtx3-cpp
   PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
 )
 
@@ -49,7 +49,7 @@ target_compile_options(
 
 target_link_libraries(
   ndsh_data_generator
-  PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp
+  PUBLIC cudf GTest::gmock GTest::gtest cudf::cudftestutil nvtx3::nvtx3-cpp
   PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
 )
 
@@ -65,14 +65,14 @@ target_include_directories(
 # Use an OBJECT library so we only compile these helper source files only once
 add_library(
   cudf_benchmark_common OBJECT
-  "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp"
-  synchronization/synchronization.cpp
-  io/cuio_common.cpp
-  common/table_utilities.cpp
-  common/benchmark_utilities.cpp
-  common/nvbench_utilities.cpp
+  synchronization/synchronization.cpp io/cuio_common.cpp common/table_utilities.cpp
+  common/benchmark_utilities.cpp common/nvbench_utilities.cpp
 )
-target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $<TARGET_NAME_IF_EXISTS:conda_env>)
+target_link_libraries(
+  cudf_benchmark_common PRIVATE cudf_datagen $<TARGET_NAME_IF_EXISTS:conda_env> GTest::gmock
+                                GTest::gtest
+)
+
 add_custom_command(
   OUTPUT CUDF_BENCHMARKS
   COMMAND echo Running benchmarks
@@ -99,7 +99,7 @@ function(ConfigureBench CMAKE_BENCH_NAME)
   )
   target_link_libraries(
     ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main
-                                $<TARGET_NAME_IF_EXISTS:conda_env>
+                                cudf::cudftestutil_impl $<TARGET_NAME_IF_EXISTS:conda_env>
   )
   add_custom_command(
     OUTPUT CUDF_BENCHMARKS
@@ -127,8 +127,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
                INSTALL_RPATH "\$ORIGIN/../../../lib"
   )
   target_link_libraries(
-    ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen
-                                nvbench::nvbench $<TARGET_NAME_IF_EXISTS:conda_env>
+    ${CMAKE_BENCH_NAME}
+    PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen nvbench::nvbench
+            $<TARGET_NAME_IF_EXISTS:conda_env> cudf::cudftestutil_impl
   )
   install(
     TARGETS ${CMAKE_BENCH_NAME}
diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp
index 272c91133f8..2bd08f410e0 100644
--- a/cpp/include/cudf_test/testing_main.hpp
+++ b/cpp/include/cudf_test/testing_main.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/cxxopts.hpp>
 #include <cudf_test/stream_checking_resource_adaptor.hpp>
 
@@ -36,6 +37,12 @@
 namespace CUDF_EXPORT cudf {
 namespace test {
 
+struct config {
+  std::string rmm_mode;
+  std::string stream_mode;
+  std::string stream_error_mode;
+};
+
 /// MR factory functions
 inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
 
@@ -157,10 +164,9 @@ inline auto parse_cudf_test_opts(int argc, char** argv)
  * @param cmd_opts Command line options returned by parse_cudf_test_opts
  * @return Memory resource adaptor
  */
-inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts)
+inline auto make_memory_resource_adaptor(cudf::test::config const& config)
 {
-  auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();
-  auto resource       = cudf::test::create_memory_resource(rmm_mode);
+  auto resource = cudf::test::create_memory_resource(config.rmm_mode);
   cudf::set_current_device_resource(resource.get());
   return resource;
 }
@@ -176,37 +182,54 @@ inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts)
  * @param cmd_opts Command line options returned by parse_cudf_test_opts
  * @return Memory resource adaptor
  */
-inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts)
+inline auto make_stream_mode_adaptor(cudf::test::config const& config)
 {
   auto resource                      = cudf::get_current_device_resource_ref();
-  auto const stream_mode             = cmd_opts["stream_mode"].as<std::string>();
-  auto const stream_error_mode       = cmd_opts["stream_error_mode"].as<std::string>();
-  auto const error_on_invalid_stream = (stream_error_mode == "error");
-  auto const check_default_stream    = (stream_mode == "new_cudf_default");
+  auto const error_on_invalid_stream = (config.stream_error_mode == "error");
+  auto const check_default_stream    = (config.stream_mode == "new_cudf_default");
   auto adaptor                       = cudf::test::stream_checking_resource_adaptor(
     resource, error_on_invalid_stream, check_default_stream);
-  if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) {
+  if ((config.stream_mode == "new_cudf_default") || (config.stream_mode == "new_testing_default")) {
     cudf::set_current_device_resource(&adaptor);
   }
   return adaptor;
 }
 
+/**
+ * @brief Should be called in every test program that uses rmm allocators since it maintains the
+ * lifespan of the rmm default memory resource. this function parses the command line to customize
+ * test behavior, like the allocation mode used for creating the default memory resource.
+ *
+ */
+inline void init_cudf_test(int argc, char** argv, cudf::test::config const& config_override = {})
+{
+  // static lifetime to keep rmm resource alive till tests end
+  auto const cmd_opts       = parse_cudf_test_opts(argc, argv);
+  cudf::test::config config = config_override;
+  if (config.rmm_mode.empty()) { config.rmm_mode = cmd_opts["rmm_mode"].as<std::string>(); }
+
+  if (config.stream_mode.empty()) {
+    config.stream_mode = cmd_opts["stream_mode"].as<std::string>();
+  }
+
+  if (config.stream_error_mode.empty()) {
+    config.stream_error_mode = cmd_opts["stream_error_mode"].as<std::string>();
+  }
+
+  [[maybe_unused]] static auto mr      = make_memory_resource_adaptor(config);
+  [[maybe_unused]] static auto adaptor = make_stream_mode_adaptor(config);
+}
+
 /**
  * @brief Macro that defines main function for gtest programs that use rmm
  *
- * Should be included in every test program that uses rmm allocators since
- * it maintains the lifespan of the rmm default memory resource.
  * This `main` function is a wrapper around the google test generated `main`,
- * maintaining the original functionality. In addition, this custom `main`
- * function parses the command line to customize test behavior, like the
- * allocation mode used for creating the default memory resource.
+ * maintaining the original functionality.
  */
-#define CUDF_TEST_PROGRAM_MAIN()                                            \
-  int main(int argc, char** argv)                                           \
-  {                                                                         \
-    ::testing::InitGoogleTest(&argc, argv);                                 \
-    auto const cmd_opts           = parse_cudf_test_opts(argc, argv);       \
-    [[maybe_unused]] auto mr      = make_memory_resource_adaptor(cmd_opts); \
-    [[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts);     \
-    return RUN_ALL_TESTS();                                                 \
+#define CUDF_TEST_PROGRAM_MAIN()            \
+  int main(int argc, char** argv)           \
+  {                                         \
+    ::testing::InitGoogleTest(&argc, argv); \
+    init_cudf_test(argc, argv);             \
+    return RUN_ALL_TESTS();                 \
   }
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 4596ec65ce7..62189f76cae 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -56,8 +56,15 @@ function(ConfigureTest CMAKE_TEST_NAME)
 
   target_link_libraries(
     ${CMAKE_TEST_NAME}
-    PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main
-            nvtx3::nvtx3-cpp $<TARGET_NAME_IF_EXISTS:conda_env> "${_CUDF_TEST_EXTRA_LIBS}"
+    PRIVATE cudf::cudftestutil
+            cudf::cudftestutil_impl
+            GTest::gmock
+            GTest::gmock_main
+            GTest::gtest
+            GTest::gtest_main
+            nvtx3::nvtx3-cpp
+            $<TARGET_NAME_IF_EXISTS:conda_env>
+            "${_CUDF_TEST_EXTRA_LIBS}"
   )
   rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME})
   rapids_test_add(
diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp
index 84f04f67038..380d66c53f9 100644
--- a/cpp/tests/io/metadata_utilities.cpp
+++ b/cpp/tests/io/metadata_utilities.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,10 +14,9 @@
  * limitations under the License.
  */
 
+#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/io_metadata_utilities.hpp>
 
-#include <gmock/gmock.h>
-
 namespace cudf::test {
 
 void expect_metadata_equal(cudf::io::table_input_metadata in_meta,
diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp
index 249319da7f7..7b61be113f9 100644
--- a/cpp/tests/large_strings/large_strings_fixture.cpp
+++ b/cpp/tests/large_strings/large_strings_fixture.cpp
@@ -123,12 +123,9 @@ LargeStringsData* StringsLargeTest::g_ls_data = nullptr;
 int main(int argc, char** argv)
 {
   ::testing::InitGoogleTest(&argc, argv);
-  auto const cmd_opts = parse_cudf_test_opts(argc, argv);
-  // hardcoding the CUDA memory resource to keep from exceeding the pool
-  auto mr = cudf::test::make_cuda();
-  cudf::set_current_device_resource(mr.get());
-  auto adaptor = make_stream_mode_adaptor(cmd_opts);
-
+  cudf::test::config config;
+  config.rmm_mode = "cuda";
+  init_cudf_test(argc, argv, config);
   // create object to automatically be destroyed at the end of main()
   auto lsd = cudf::test::StringsLargeTest::get_ls_data();
 
diff --git a/cpp/tests/utilities/table_utilities.cu b/cpp/tests/utilities/table_utilities.cu
index 354c0b1b57e..8e4906408de 100644
--- a/cpp/tests/utilities/table_utilities.cu
+++ b/cpp/tests/utilities/table_utilities.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,10 +15,9 @@
  */
 
 #include <cudf_test/column_utilities.hpp>
+#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/table_utilities.hpp>
 
-#include <gmock/gmock.h>
-
 namespace cudf::test::detail {
 void expect_table_properties_equal(cudf::table_view lhs, cudf::table_view rhs)
 {

From e41dea933f044183ccbfe26875a2b6c3ff383814 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 14 Oct 2024 10:35:37 -0700
Subject: [PATCH 2/6] Add profilers to CUDA 12 conda devcontainers (#17066)

This will make sure that profilers are available by default for everyone using our devcontainers.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/17066
---
 .../cuda12.5-conda/devcontainer.json          | 22 +++++++++++++++++++
 ci/release/update-version.sh                  |  1 +
 2 files changed, 23 insertions(+)

diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
index 2a195c6c81d..a0e193ff0bf 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.5-conda/devcontainer.json
@@ -15,9 +15,31 @@
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
+        "version": "12.5",
+        "installCompilers": false,
+        "installProfilers": true,
+        "installDevPackages": false,
+        "installcuDNN": false,
+        "installcuTensor": false,
+        "installNCCL": false,
+        "installCUDARuntime": false,
+        "installNVRTC": false,
+        "installOpenCL": false,
+        "installcuBLAS": false,
+        "installcuSPARSE": false,
+        "installcuFFT": false,
+        "installcuFile": false,
+        "installcuRAND": false,
+        "installcuSOLVER": false,
+        "installNPP": false,
+        "installnvJPEG": false,
+        "pruneStaticLibs": true
+      },
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
   },
   "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/cuda",
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
   "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"],
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 870901d223b..95f36653c2c 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -93,6 +93,7 @@ sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md
 # .devcontainer files
 find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
     sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
+    sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
 done

From 768fbaa28033446dc899872b3c94213c75bc1a98 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 14 Oct 2024 13:53:28 -0700
Subject: [PATCH 3/6] Fix ORC reader when using `device_read_async` while the
 destination device buffers are not ready (#17074)

This fixes a bug in ORC reader when `device_read_async` is called while the destination device buffers are not ready to write in. In detail, this bug is because `device_read_async` does not use the user-provided stream but its own generated stream for data copying. As such, the copying ops could happen before the destination device buffers are being allocated, causing data corruption.

This bug only shows up in certain conditions, and also hard to reproduce. It occurs when copying buffers with small sizes (below `gds_threshold`) and most likely to show up with setting `rmm_mode=async`.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17074
---
 cpp/src/io/orc/reader_impl_chunking.cu | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 01ee5ad177d..ecf319e75ab 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -500,6 +500,8 @@ void reader_impl::load_next_stripe_data(read_mode mode)
   auto const [read_begin, read_end] =
     merge_selected_ranges(_file_itm_data.stripe_data_read_ranges, load_stripe_range);
 
+  bool stream_synchronized{false};
+
   for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) {
     auto const& read_info = _file_itm_data.data_read_info[read_idx];
     auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source;
@@ -507,6 +509,13 @@ void reader_impl::load_next_stripe_data(read_mode mode)
       lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data());
 
     if (source_ptr->is_device_read_preferred(read_info.length)) {
+      // `device_read_async` may not use _stream at all.
+      // Instead, it may use some other stream(s) to sync the H->D memcpy.
+      // As such, we need to make sure the device buffers in `lvl_stripe_data` are ready first.
+      if (!stream_synchronized) {
+        _stream.synchronize();
+        stream_synchronized = true;
+      }
       device_read_tasks.push_back(
         std::pair(source_ptr->device_read_async(
                     read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream),

From 44afc5109b342b653797a20db1e2654fa450417f Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 14 Oct 2024 16:02:14 -0700
Subject: [PATCH 4/6] Add clang-tidy to CI (#16958)

This PR adds clang-tidy checks to our CI. clang-tidy will be run in nightly CI via CMake. For now, only the parts of the code base that were already made compliant in the PRs leading up to this have been enabled, namely cudf source and test cpp files. Over time we can add more files like benchmarks and examples, add or subtract more rules, or enable linting of cu files (see https://gitlab.kitware.com/cmake/cmake/-/issues/25399). This PR is intended to be the starting point enabling systematic linting, at which point everything else should be significantly easier.

Resolves #584

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16958
---
 .github/workflows/test.yaml           |  14 +-
 ci/clang_tidy.sh                      |  29 +++
 cpp/.clang-tidy                       |   2 +-
 cpp/CMakeLists.txt                    |  54 ++++++
 cpp/scripts/run-clang-tidy.py         | 253 --------------------------
 cpp/tests/CMakeLists.txt              |   1 +
 cpp/tests/interop/nanoarrow_utils.hpp |   3 +-
 dependencies.yaml                     |  30 ++-
 8 files changed, 129 insertions(+), 257 deletions(-)
 create mode 100755 ci/clang_tidy.sh
 delete mode 100644 cpp/scripts/run-clang-tidy.py

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index a22d3c5b9cc..1275aad757c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -47,11 +47,23 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
-      build_type: pull-request
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
       # Use the wheel container so we can skip conda solves and since our
       # primary static consumers (Spark) are not in conda anyway.
       container_image: "rapidsai/ci-wheel:latest"
       run_script: "ci/configure_cpp_static.sh"
+  clang-tidy:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      run_script: "ci/clang_tidy.sh"
   conda-python-cudf-tests:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
diff --git a/ci/clang_tidy.sh b/ci/clang_tidy.sh
new file mode 100755
index 00000000000..4d5d3fc3136
--- /dev/null
+++ b/ci/clang_tidy.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+rapids-logger "Create clang-tidy conda environment"
+. /opt/conda/etc/profile.d/conda.sh
+
+ENV_YAML_DIR="$(mktemp -d)"
+
+rapids-dependency-file-generator \
+  --output conda \
+  --file-key clang_tidy \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
+
+rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy
+
+# Temporarily allow unbound variables for conda activation.
+set +u
+conda activate clang_tidy
+set -u
+
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+
+source rapids-configure-sccache
+
+# Run the build via CMake, which will run clang-tidy when CUDF_CLANG_TIDY is enabled.
+cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON -GNinja
+cmake --build cpp/build
diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy
index 2d4f8c0d80e..12120a5c6d1 100644
--- a/cpp/.clang-tidy
+++ b/cpp/.clang-tidy
@@ -39,7 +39,7 @@ Checks:
        -clang-analyzer-optin.core.EnumCastOutOfRange,
        -clang-analyzer-optin.cplusplus.UninitializedObject'
 
-WarningsAsErrors: ''
+WarningsAsErrors: '*'
 HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*'
 ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*'
 FormatStyle:     none
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c8f8ae2dcfe..32a753c9f40 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -88,6 +88,7 @@ option(
   ${DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL}
 )
 mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL)
+option(CUDF_CLANG_TIDY "Enable clang-tidy checking" OFF)
 
 message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}")
 message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}")
@@ -144,6 +145,58 @@ if(NOT CUDF_GENERATED_INCLUDE_DIR)
   set(CUDF_GENERATED_INCLUDE_DIR ${CUDF_BINARY_DIR})
 endif()
 
+# ##################################################################################################
+# * clang-tidy configuration ----------------------------------------------------------------------
+if(CUDF_CLANG_TIDY)
+  find_program(
+    CLANG_TIDY_EXE
+    NAMES "clang-tidy"
+    DOC "Path to clang-tidy executable" REQUIRED
+  )
+
+  execute_process(
+    COMMAND ${CLANG_TIDY_EXE} --version
+    OUTPUT_VARIABLE CLANG_TIDY_OUTPUT
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
+  string(REGEX MATCH "LLVM version ([0-9]+\\.[0-9]+)\\.[0-9]+" LLVM_VERSION_MATCH
+               "${CLANG_TIDY_OUTPUT}"
+  )
+  # Discard the patch version and allow it to float. Empirically, results between patch versions are
+  # mostly stable, and looking at available packages on some package managers sometimes patch
+  # versions are skipped so we don't want to constrain to a patch version that the user can't
+  # install.
+  set(LLVM_VERSION "${CMAKE_MATCH_1}")
+  set(expected_clang_tidy_version 19.1)
+  if(NOT expected_clang_tidy_version VERSION_EQUAL LLVM_VERSION)
+    message(
+      FATAL_ERROR
+        "clang-tidy version ${expected_clang_tidy_version} is required, but found ${LLVM_VERSION}"
+    )
+  endif()
+endif()
+
+# Turn on the clang-tidy property for a target excluding the files specified in SKIPPED_FILES.
+function(enable_clang_tidy target)
+  set(_tidy_options)
+  set(_tidy_one_value)
+  set(_tidy_multi_value SKIPPED_FILES)
+  cmake_parse_arguments(
+    _TIDY "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN}
+  )
+
+  if(CUDF_CLANG_TIDY)
+    # clang will complain about unused link libraries on the compile line unless we specify
+    # -Qunused-arguments.
+    set_target_properties(
+      ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments"
+    )
+    foreach(file IN LISTS _TIDY_SKIPPED_FILES)
+      set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON)
+    endforeach()
+  endif()
+endfunction()
+
 # ##################################################################################################
 # * conda environment -----------------------------------------------------------------------------
 rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH)
@@ -714,6 +767,7 @@ target_compile_options(
   cudf PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
                "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
 )
+enable_clang_tidy(cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp)
 
 if(CUDF_BUILD_STACKTRACE_DEBUG)
   # Remove any optimization level to avoid nvcc warning "incompatible redefinition for option
diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py
deleted file mode 100644
index e5e57dbf562..00000000000
--- a/cpp/scripts/run-clang-tidy.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import re
-import os
-import subprocess
-import argparse
-import json
-import multiprocessing as mp
-import shutil
-
-
-EXPECTED_VERSION = "16.0.6"
-VERSION_REGEX = re.compile(r"  LLVM version ([0-9.]+)")
-GPU_ARCH_REGEX = re.compile(r"sm_(\d+)")
-SPACES = re.compile(r"\s+")
-SEPARATOR = "-" * 16
-
-
-def parse_args():
-    argparser = argparse.ArgumentParser("Runs clang-tidy on a project")
-    argparser.add_argument("-cdb", type=str,
-                           # TODO This is a hack, needs to be fixed
-                           default="cpp/build/cuda-11.5.0/clang-tidy/release/compile_commands.clangd.json",
-                           help="Path to cmake-generated compilation database"
-                           " file. It is always found inside the root of the "
-                           "cmake build folder. So make sure that `cmake` has "
-                           "been run once before running this script!")
-    argparser.add_argument("-exe", type=str, default="clang-tidy",
-                           help="Path to clang-tidy exe")
-    argparser.add_argument("-ignore", type=str, default="[.]cu$|examples/kmeans/",
-                           help="Regex used to ignore files from checking")
-    argparser.add_argument("-select", type=str, default=None,
-                           help="Regex used to select files for checking")
-    argparser.add_argument("-j", type=int, default=-1,
-                           help="Number of parallel jobs to launch.")
-    args = argparser.parse_args()
-    if args.j <= 0:
-        args.j = mp.cpu_count()
-    args.ignore_compiled = re.compile(args.ignore) if args.ignore else None
-    args.select_compiled = re.compile(args.select) if args.select else None
-    ret = subprocess.check_output("%s --version" % args.exe, shell=True)
-    ret = ret.decode("utf-8")
-    version = VERSION_REGEX.search(ret)
-    if version is None:
-        raise Exception("Failed to figure out clang-tidy version!")
-    version = version.group(1)
-    if version != EXPECTED_VERSION:
-        raise Exception("clang-tidy exe must be v%s found '%s'" % \
-                        (EXPECTED_VERSION, version))
-    if not os.path.exists(args.cdb):
-        raise Exception("Compilation database '%s' missing" % args.cdb)
-    return args
-
-
-def get_all_commands(cdb):
-    with open(cdb) as fp:
-        return json.load(fp)
-
-
-def get_gpu_archs(command):
-    archs = []
-    for loc in range(len(command)):
-        if command[loc] != "-gencode":
-            continue
-        arch_flag = command[loc + 1]
-        match = GPU_ARCH_REGEX.search(arch_flag)
-        if match is not None:
-            archs.append("--cuda-gpu-arch=sm_%s" % match.group(1))
-    return archs
-
-
-def get_index(arr, item):
-    try:
-        return arr.index(item)
-    except:
-        return -1
-
-
-def remove_item(arr, item):
-    loc = get_index(arr, item)
-    if loc >= 0:
-        del arr[loc]
-    return loc
-
-
-def remove_item_plus_one(arr, item):
-    loc = get_index(arr, item)
-    if loc >= 0:
-        del arr[loc + 1]
-        del arr[loc]
-    return loc
-
-
-def get_clang_includes(exe):
-    dir = os.getenv("CONDA_PREFIX")
-    if dir is None:
-        ret = subprocess.check_output("which %s 2>&1" % exe, shell=True)
-        ret = ret.decode("utf-8")
-        dir = os.path.dirname(os.path.dirname(ret))
-    header = os.path.join(dir, "include", "ClangHeaders")
-    return ["-I", header]
-
-
-def get_tidy_args(cmd, exe):
-    command, file = cmd["command"], cmd["file"]
-    is_cuda = file.endswith(".cu")
-    command = re.split(SPACES, command)
-    # compiler is always clang++!
-    command[0] = "clang++"
-    # remove compilation and output targets from the original command
-    remove_item_plus_one(command, "-c")
-    remove_item_plus_one(command, "-o")
-    if is_cuda:
-        # replace nvcc's "-gencode ..." with clang's "--cuda-gpu-arch ..."
-        archs = get_gpu_archs(command)
-        command.extend(archs)
-        while True:
-            loc = remove_item_plus_one(command, "-gencode")
-            if loc < 0:
-                break
-        # "-x cuda" is the right usage in clang
-        loc = get_index(command, "-x")
-        if loc >= 0:
-            command[loc + 1] = "cuda"
-        remove_item_plus_one(command, "-ccbin")
-        remove_item(command, "--expt-extended-lambda")
-        remove_item(command, "--diag_suppress=unrecognized_gcc_pragma")
-    command.extend(get_clang_includes(exe))
-    return command, is_cuda
-
-
-def run_clang_tidy_command(tidy_cmd):
-    cmd = " ".join(tidy_cmd)
-    result = subprocess.run(cmd, check=False, shell=True,
-                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    status = result.returncode == 0
-    if status:
-        out = ""
-    else:
-        out = "CMD: " + cmd
-    out += result.stdout.decode("utf-8").rstrip()
-    return status, out
-
-
-def run_clang_tidy(cmd, args):
-    command, is_cuda = get_tidy_args(cmd, args.exe)
-    tidy_cmd = [args.exe,
-                "-header-filter='.*cudf/cpp/(src|include|bench|comms).*'",
-                cmd["file"], "--", ]
-    tidy_cmd.extend(command)
-    status = True
-    out = ""
-    if is_cuda:
-        tidy_cmd.append("--cuda-device-only")
-        tidy_cmd.append(cmd["file"])
-        ret, out1 = run_clang_tidy_command(tidy_cmd)
-        out += out1
-        out += "%s" % SEPARATOR
-        if not ret:
-            status = ret
-        tidy_cmd[-2] = "--cuda-host-only"
-        ret, out1 = run_clang_tidy_command(tidy_cmd)
-        if not ret:
-            status = ret
-        out += out1
-    else:
-        tidy_cmd.append(cmd["file"])
-        ret, out1 = run_clang_tidy_command(tidy_cmd)
-        if not ret:
-            status = ret
-        out += out1
-    return status, out, cmd["file"]
-
-
-# yikes! global var :(
-results = []
-def collect_result(result):
-    global results
-    results.append(result)
-
-
-def print_result(passed, stdout, file):
-    status_str = "PASSED" if passed else "FAILED"
-    print(f"{SEPARATOR} File:{file} {status_str} {SEPARATOR}")
-    if stdout:
-        print(stdout)
-        print(f"{SEPARATOR} File:{file} ENDS {SEPARATOR}")
-
-
-def print_results():
-    global results
-    status = True
-    for passed, stdout, file in results:
-        print_result(passed, stdout, file)
-        if not passed:
-            status = False
-    return status
-
-
-def run_tidy_for_all_files(args, all_files):
-    pool = None if args.j == 1 else mp.Pool(args.j)
-    # actual tidy checker
-    for cmd in all_files:
-        # skip files that we don't want to look at
-        if args.ignore_compiled is not None and \
-           re.search(args.ignore_compiled, cmd["file"]) is not None:
-            continue
-        if args.select_compiled is not None and \
-           re.search(args.select_compiled, cmd["file"]) is None:
-            continue
-        if pool is not None:
-            pool.apply_async(run_clang_tidy, args=(cmd, args),
-                             callback=collect_result)
-        else:
-            passed, stdout, file = run_clang_tidy(cmd, args)
-            collect_result((passed, stdout, file))
-    if pool is not None:
-        pool.close()
-        pool.join()
-    return print_results()
-
-
-def main():
-    args = parse_args()
-    # Attempt to making sure that we run this script from root of repo always
-    if not os.path.exists(".git"):
-        raise Exception("This needs to always be run from the root of repo")
-    # Check whether clang-tidy exists
-    # print(args)
-    if "exe" not in args and shutil.which("clang-tidy") is not None:
-        print("clang-tidy not found. Exiting...")
-        return
-    all_files = get_all_commands(args.cdb)
-    status = run_tidy_for_all_files(args, all_files)
-    if not status:
-        raise Exception("clang-tidy failed! Refer to the errors above.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 62189f76cae..799a84cbc37 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -83,6 +83,7 @@ function(ConfigureTest CMAKE_TEST_NAME)
         "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$<TARGET_FILE:cudf_identify_stream_usage_mode_${_CUDF_TEST_STREAM_MODE}>"
     )
   endif()
+  enable_clang_tidy(${CMAKE_TEST_NAME})
 endfunction()
 
 # ##################################################################################################
diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp
index a961f73d955..8be7e087b6d 100644
--- a/cpp/tests/interop/nanoarrow_utils.hpp
+++ b/cpp/tests/interop/nanoarrow_utils.hpp
@@ -256,7 +256,8 @@ std::enable_if_t<std::is_same_v<T, bool>, nanoarrow::UniqueArray> get_nanoarrow_
     ArrowBitmap out;
     ArrowBitmapInit(&out);
     NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1));
-    std::memset(out.buffer.data, 0, out.buffer.size_bytes);
+    // TODO: Investigate clang-tidy issue further after nanoarrow is made compliant
+    std::memset(out.buffer.data, 0, out.buffer.size_bytes);  // NOLINT
 
     for (size_t i = 0; i < b.size(); ++i) {
       ArrowBitSetTo(out.buffer.data, i, static_cast<uint8_t>(b[i]));
diff --git a/dependencies.yaml b/dependencies.yaml
index ca17917c905..ff97b67f0ce 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -6,10 +6,18 @@ files:
       cuda: ["11.8", "12.5"]
       arch: [x86_64]
     includes:
+      # Note that clang-tidy is not included here because cudf's preferred
+      # version conflicts with the rest of RAPIDS as well as its own
+      # clang-format version. Until we update our clang-format version we will
+      # not be able to install both into the same environment. Moreover, using
+      # this version will break compatibility with other RAPIDS libraries that
+      # are still using 16.0.6, and as such will and that would break any
+      # unified environment like that used in unified devcontainers.
       - build_base
       - build_all
       - build_cpp
       - build_python_common
+      - clang_format
       - cuda
       - cuda_version
       - depends_on_cupy
@@ -86,6 +94,16 @@ files:
     includes:
       - develop
       - py_version
+  clang_tidy:
+    output: none
+    includes:
+      - build_all
+      - build_base
+      - clang_tidy
+      - cuda
+      - cuda_version
+      - develop
+      - py_version
   docs:
     output: none
     includes:
@@ -553,11 +571,21 @@ dependencies:
           # pre-commit requires identify minimum version 1.0, but clang-format requires textproto support and that was
           # added in 2.5.20, so we need to call out the minimum version needed for our plugins
           - identify>=2.5.20
+      - output_types: conda
+        packages:
+          - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version.
+  clang_format:
+    common:
       - output_types: conda
         packages:
           - clang==16.0.6
           - clang-tools=16.0.6
-          - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version.
+  clang_tidy:
+    common:
+      - output_types: conda
+        packages:
+          - clang==19.1.0
+          - clang-tools==19.1.0
   docs:
     common:
       - output_types: [conda]

From 86db9804746fb20554c1900b311a228dc1d6e349 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Mon, 14 Oct 2024 16:30:22 -0700
Subject: [PATCH 5/6] Clean up hash-groupby `var_hash_functor` (#17034)

This work is part of splitting the original bulk shared memory groupby PR #16619.

This PR renames the file originally titled `multi_pass_kernels.cuh`, which contains the `var_hash_functor`, to `var_hash_functor.cuh`. It also includes cleanups such as utilizing `cuda::std::` utilities in device code and removing redundant template parameters.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17034
---
 cpp/src/groupby/hash/groupby.cu               |  5 +-
 cpp/src/groupby/hash/groupby_kernels.cuh      |  2 -
 ..._pass_kernels.cuh => var_hash_functor.cuh} | 51 ++++++++-----------
 3 files changed, 25 insertions(+), 33 deletions(-)
 rename cpp/src/groupby/hash/{multi_pass_kernels.cuh => var_hash_functor.cuh} (69%)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 75767786272..0432b9d120a 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -16,7 +16,8 @@
 
 #include "flatten_single_pass_aggs.hpp"
 #include "groupby/common/utils.hpp"
-#include "groupby/hash/groupby_kernels.cuh"
+#include "groupby_kernels.cuh"
+#include "var_hash_functor.cuh"
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column.hpp>
@@ -261,7 +262,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final
       rmm::exec_policy(stream),
       thrust::make_counting_iterator(0),
       col.size(),
-      ::cudf::detail::var_hash_functor{
+      var_hash_functor{
         set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof});
     sparse_results->add_result(col, agg, std::move(var_result));
     dense_results->add_result(col, agg, to_dense_agg_result(agg));
diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh
index 188d0cff3f1..86f4d76487f 100644
--- a/cpp/src/groupby/hash/groupby_kernels.cuh
+++ b/cpp/src/groupby/hash/groupby_kernels.cuh
@@ -16,8 +16,6 @@
 
 #pragma once
 
-#include "multi_pass_kernels.cuh"
-
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/aggregation/device_aggregators.cuh>
 #include <cudf/groupby.hpp>
diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh
similarity index 69%
rename from cpp/src/groupby/hash/multi_pass_kernels.cuh
rename to cpp/src/groupby/hash/var_hash_functor.cuh
index 7043eafdc10..bb55cc9188c 100644
--- a/cpp/src/groupby/hash/multi_pass_kernels.cuh
+++ b/cpp/src/groupby/hash/var_hash_functor.cuh
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #pragma once
 
 #include <cudf/aggregation.hpp>
@@ -21,17 +20,14 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/type_dispatcher.hpp>
 
+#include <cuco/static_set_ref.cuh>
 #include <cuda/atomic>
+#include <cuda/std/type_traits>
 
-#include <cmath>
-
-namespace cudf {
-namespace detail {
-
-template <typename SetType, bool target_has_nulls = true, bool source_has_nulls = true>
+namespace cudf::groupby::detail::hash {
+template <typename SetType>
 struct var_hash_functor {
   SetType set;
   bitmask_type const* __restrict__ row_bitmask;
@@ -47,13 +43,13 @@ struct var_hash_functor {
                    column_device_view sum,
                    column_device_view count,
                    size_type ddof)
-    : set(set),
-      row_bitmask(row_bitmask),
-      target(target),
-      source(source),
-      sum(sum),
-      count(count),
-      ddof(ddof)
+    : set{set},
+      row_bitmask{row_bitmask},
+      target{target},
+      source{source},
+      sum{sum},
+      count{count},
+      ddof{ddof}
   {
   }
 
@@ -64,23 +60,21 @@ struct var_hash_functor {
   }
 
   template <typename Source>
-  __device__ std::enable_if_t<!is_supported<Source>()> operator()(column_device_view const& source,
-                                                                  size_type source_index,
-                                                                  size_type target_index) noexcept
+  __device__ cuda::std::enable_if_t<!is_supported<Source>()> operator()(
+    column_device_view const& source, size_type source_index, size_type target_index) noexcept
   {
     CUDF_UNREACHABLE("Invalid source type for std, var aggregation combination.");
   }
 
   template <typename Source>
-  __device__ std::enable_if_t<is_supported<Source>()> operator()(column_device_view const& source,
-                                                                 size_type source_index,
-                                                                 size_type target_index) noexcept
+  __device__ cuda::std::enable_if_t<is_supported<Source>()> operator()(
+    column_device_view const& source, size_type source_index, size_type target_index) noexcept
   {
-    using Target    = target_type_t<Source, aggregation::VARIANCE>;
-    using SumType   = target_type_t<Source, aggregation::SUM>;
-    using CountType = target_type_t<Source, aggregation::COUNT_VALID>;
+    using Target    = cudf::detail::target_type_t<Source, aggregation::VARIANCE>;
+    using SumType   = cudf::detail::target_type_t<Source, aggregation::SUM>;
+    using CountType = cudf::detail::target_type_t<Source, aggregation::COUNT_VALID>;
 
-    if (source_has_nulls and source.is_null(source_index)) return;
+    if (source.is_null(source_index)) return;
     CountType group_size = count.element<CountType>(target_index);
     if (group_size == 0 or group_size - ddof <= 0) return;
 
@@ -91,8 +85,9 @@ struct var_hash_functor {
     ref.fetch_add(result, cuda::std::memory_order_relaxed);
     // STD sqrt is applied in finalize()
 
-    if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
   }
+
   __device__ inline void operator()(size_type source_index)
   {
     if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) {
@@ -110,6 +105,4 @@ struct var_hash_functor {
     }
   }
 };
-
-}  // namespace detail
-}  // namespace cudf
+}  // namespace cudf::groupby::detail::hash

From 319ec3b8031e4deb7dfc3f4c4a07a10ef88c131f Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 14 Oct 2024 19:58:30 -0400
Subject: [PATCH 6/6] Adding assertion to check for regular JSON inputs of size
 greater than `INT_MAX` bytes (#17057)

Addresses #17017

Libcudf does not support parsing regular JSON inputs of size greater than `INT_MAX` bytes. Note that the batched reader can only be used for JSON lines inputs.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/17057
---
 cpp/src/io/json/nested_json_gpu.cu |  3 +--
 cpp/src/io/json/read_json.cu       | 14 ++++++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 76816071d8c..69a51fab5dc 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -83,8 +83,7 @@ struct tree_node {
 void check_input_size(std::size_t input_size)
 {
   // Transduce() writes symbol offsets that may be as large input_size-1
-  CUDF_EXPECTS(input_size == 0 ||
-                 (input_size - 1) <= std::numeric_limits<cudf::io::json::SymbolOffsetT>::max(),
+  CUDF_EXPECTS(input_size == 0 || (input_size - 1) <= std::numeric_limits<int32_t>::max(),
                "Given JSON input is too large");
 }
 }  // namespace
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 99a5b17bce8..c424d2b3b62 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -351,10 +351,16 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
    * JSON inputs.
    */
   std::size_t const total_source_size = sources_size(sources, 0, 0);
-  std::size_t chunk_offset            = reader_opts.get_byte_range_offset();
-  std::size_t chunk_size              = reader_opts.get_byte_range_size();
-  chunk_size                          = !chunk_size ? total_source_size - chunk_offset
-                                                    : std::min(chunk_size, total_source_size - chunk_offset);
+
+  // Batching is enabled only for JSONL inputs, not regular JSON files
+  CUDF_EXPECTS(
+    reader_opts.is_enabled_lines() || total_source_size < std::numeric_limits<int32_t>::max(),
+    "Parsing Regular JSON inputs of size greater than INT_MAX bytes is not supported");
+
+  std::size_t chunk_offset = reader_opts.get_byte_range_offset();
+  std::size_t chunk_size   = reader_opts.get_byte_range_size();
+  chunk_size               = !chunk_size ? total_source_size - chunk_offset
+                                         : std::min(chunk_size, total_source_size - chunk_offset);
 
   std::size_t const size_per_subchunk      = estimate_size_per_subchunk(chunk_size);
   std::size_t const batch_size_upper_bound = get_batch_size_upper_bound();