From 044e22411e6416807ed0ef4142f6a8d18c7a5b56 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Mon, 8 Apr 2024 18:23:38 -0400
Subject: [PATCH 1/5] Move CUPTI disablement to build script. (#167)

---
 ci/build_nvbench.sh          | 17 ++++++++++
 ci/matrix.yaml               | 66 ++++++++++++++++++------------------
 ci/windows/build_nvbench.ps1 |  2 +-
 ci/windows/test_nvbench.ps1  |  2 +-
 4 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/ci/build_nvbench.sh b/ci/build_nvbench.sh
index cc245d3..e9ba372 100755
--- a/ci/build_nvbench.sh
+++ b/ci/build_nvbench.sh
@@ -8,6 +8,23 @@ PRESET="nvbench-ci"
 
 CMAKE_OPTIONS=""
 
+function version_lt() {
+  local lhs="${1//v/}"
+  local rhs="${2//v/}"
+  # Equal versions are never "less than": return false
+  [ "$lhs" = "$rhs" ] && return 1
+  # True iff a non-empty lhs sorts first under `sort -V` (quoted so empty/odd input cannot break `[`)
+  [ -n "$lhs" ] && [ "$lhs" = "$(printf '%s\n' "$lhs" "$rhs" | sort -V | head -n1)" ]
+}
+
+# If CUDA_COMPILER is nvcc and the version < 11.3, disable CUPTI
+if [[ "$CUDA_COMPILER" == *"nvcc"* ]]; then
+  CUDA_VERSION=$(nvcc --version | grep release | sed -r 's/.*release ([0-9.]+).*/\1/')
+  if version_lt "$CUDA_VERSION" "11.3"; then
+    CMAKE_OPTIONS+=" -DNVBench_ENABLE_CUPTI=OFF "
+  fi
+fi
+
 configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
 
 print_time_summary
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 9959473..8308d63 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -47,40 +47,40 @@ msvc2022: &msvc2022 { name: 'cl', version: '14.39', exe: 'cl++' }
 # Configurations that will run for every PR
 pull_request:
   nvcc:
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7,     extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8,     extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9,     extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9,    extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10}
-    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11}
-    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7   }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8   }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9   }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9  }
+    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11  }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10  }
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11  }
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12  }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9  }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10 }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11 }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12 }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13 }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14 }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10  }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11  }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12  }
     # Fails to compile simple input on CTK12.4. Try to add later.
-    # {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc13}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm17}
+    # {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc13  }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9  }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10 }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11 }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12 }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13 }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14 }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15 }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16 }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm17 }
     - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm18,   extra_build_args: "-cmake-options '-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler'"}
     - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2019, extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
     - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2022, extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
diff --git a/ci/windows/build_nvbench.ps1 b/ci/windows/build_nvbench.ps1
index 1ac8bd1..7240698 100644
--- a/ci/windows/build_nvbench.ps1
+++ b/ci/windows/build_nvbench.ps1
@@ -20,7 +20,7 @@ $CMAKE_OPTIONS = ""
 
 # Append any arguments pass in on the command line
 If($ARG_CMAKE_OPTIONS -ne "") {
-    $CMAKE_OPTIONS += "$ARG_CMAKE_OPTIONS"
+    $CMAKE_OPTIONS += " $ARG_CMAKE_OPTIONS"
 }
 
 configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
diff --git a/ci/windows/test_nvbench.ps1 b/ci/windows/test_nvbench.ps1
index bcd9f2c..4ee5106 100644
--- a/ci/windows/test_nvbench.ps1
+++ b/ci/windows/test_nvbench.ps1
@@ -20,7 +20,7 @@ $CMAKE_OPTIONS = ""
 
 # Append any arguments pass in on the command line
 If($ARG_CMAKE_OPTIONS -ne "") {
-    $CMAKE_OPTIONS += "$ARG_CMAKE_OPTIONS"
+    $CMAKE_OPTIONS += " $ARG_CMAKE_OPTIONS"
 }
 
 configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"

From 9e8efa2c88b11a0253c0e7844fe45aee307e36e6 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Mon, 8 Apr 2024 18:24:07 -0400
Subject: [PATCH 2/5] Preserve `.devcontainers/img/` when cleaning. (#166)

---
 .devcontainer/make_devcontainers.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.devcontainer/make_devcontainers.sh b/.devcontainer/make_devcontainers.sh
index 64b92c0..f868cc1 100755
--- a/.devcontainer/make_devcontainers.sh
+++ b/.devcontainer/make_devcontainers.sh
@@ -111,6 +111,9 @@ mv "./temp_devcontainer.json" ${base_devcontainer_file}
 # Create an array to keep track of valid subdirectory names
 valid_subdirs=()
 
+# The img folder should not be removed:
+valid_subdirs+=("img")
+
 # For each unique combination
 for combination in $combinations; do
     cuda_version=$(echo "$combination" | jq -r '.cuda')

From 165cf924c5b04d4ff8a5e8de08e35928b5e64c4b Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Tue, 9 Apr 2024 12:45:58 -0400
Subject: [PATCH 3/5] Refactor main implementation to improve reusability and
 customization. (#165)

* Refactor main implementation to improve reusability and customization.

Move the implementation of `main` out of macros and into separate
functions. This allows for easier reuse and customization of the macros.
Existing macro usage should still work as expected, and new
customization points will simplify common tasks like argument parsing
going forward.

* Add tests that validate common main customizations.
---
 nvbench/main.cuh                         | 241 ++++++++++++++++++-----
 testing/CMakeLists.txt                   |  15 +-
 testing/custom_main_custom_args.cu       | 132 +++++++++++++
 testing/custom_main_custom_exceptions.cu |  64 ++++++
 4 files changed, 403 insertions(+), 49 deletions(-)
 create mode 100644 testing/custom_main_custom_args.cu
 create mode 100644 testing/custom_main_custom_exceptions.cu

diff --git a/nvbench/main.cuh b/nvbench/main.cuh
index bcdced5..84b8ebb 100644
--- a/nvbench/main.cuh
+++ b/nvbench/main.cuh
@@ -28,14 +28,130 @@
 #include <cstdlib>
 #include <iostream>
 
+// Advanced users can rebuild NVBench's `main` function using the macros in this file, or replace
+// them with customized implementations.
+
+// Customization point, called before NVBench initialization.
+#ifndef NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE(argc, argv) []() {}()
+#endif
+
+// Customization point, called after NVBench initialization.
+#ifndef NVBENCH_MAIN_INITIALIZE_CUSTOM_POST
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv) []() {}()
+#endif
+
+// Customization point, called before NVBench parsing. Update argc/argv if needed.
+// argc/argv are the usual command line argument types. The ARGS version of this
+// macro is a bit more convenient.
+#ifndef NVBENCH_MAIN_CUSTOM_ARGC_ARGV_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGC_ARGV_HANDLER(argc, argv) []() {}()
+#endif
+
+// Customization point, called before NVBench parsing. Update args if needed.
+// Args is a vector of strings, each element is an argument.
+#ifndef NVBENCH_MAIN_CUSTOM_ARGS_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args) []() {}()
+#endif
+
+// Customization point, called before NVBench parsing.
+#ifndef NVBENCH_MAIN_PARSE_CUSTOM_PRE
+#define NVBENCH_MAIN_PARSE_CUSTOM_PRE(parser, args) []() {}()
+#endif
+
+// Customization point, called after NVBench parsing.
+#ifndef NVBENCH_MAIN_PARSE_CUSTOM_POST
+#define NVBENCH_MAIN_PARSE_CUSTOM_POST(parser) []() {}()
+#endif
+
+// Customization point, called before NVBench finalization.
+#ifndef NVBENCH_MAIN_FINALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_FINALIZE_CUSTOM_PRE() []() {}()
+#endif
+
+// Customization point, called after NVBench finalization.
+#ifndef NVBENCH_MAIN_FINALIZE_CUSTOM_POST
+#define NVBENCH_MAIN_FINALIZE_CUSTOM_POST() []() {}()
+#endif
+
+// Customization point, used to catch additional exceptions.
+#ifndef NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM
+#define NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM
+#endif
+
+/************************************ Default implementation **************************************/
+
+#ifndef NVBENCH_MAIN
 #define NVBENCH_MAIN                                                                               \
-  int main(int argc, char const *const *argv)                                                      \
+  int main(int argc, char **argv)                                                                  \
   try                                                                                              \
   {                                                                                                \
     NVBENCH_MAIN_BODY(argc, argv);                                                                 \
-    NVBENCH_CUDA_CALL(cudaDeviceReset());                                                          \
     return 0;                                                                                      \
   }                                                                                                \
+  NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM                                                             \
+  NVBENCH_MAIN_CATCH_EXCEPTIONS
+#endif
+
+#ifndef NVBENCH_MAIN_BODY
+#define NVBENCH_MAIN_BODY(argc, argv)                                                              \
+  NVBENCH_MAIN_INITIALIZE(argc, argv);                                                             \
+  {                                                                                                \
+    NVBENCH_MAIN_PARSE(argc, argv);                                                                \
+                                                                                                   \
+    NVBENCH_MAIN_PRINT_PREAMBLE(parser);                                                           \
+    NVBENCH_MAIN_RUN_BENCHMARKS(parser);                                                           \
+    NVBENCH_MAIN_PRINT_EPILOGUE(parser);                                                           \
+                                                                                                   \
+    NVBENCH_MAIN_PRINT_RESULTS(parser);                                                            \
+  } /* Tear down parser before finalization */                                                     \
+  NVBENCH_MAIN_FINALIZE();                                                                         \
+  return 0;
+#endif
+
+#ifndef NVBENCH_MAIN_INITIALIZE
+#define NVBENCH_MAIN_INITIALIZE(argc, argv)                                                        \
+  NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE(argc, argv);                                                  \
+  nvbench::detail::main_initialize(argc, argv);                                                    \
+  NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv)
+#endif
+
+#ifndef NVBENCH_MAIN_PARSE
+#define NVBENCH_MAIN_PARSE(argc, argv)                                                             \
+  NVBENCH_MAIN_CUSTOM_ARGC_ARGV_HANDLER(argc, argv);                                               \
+  std::vector<std::string> args = nvbench::detail::main_convert_args(argc, argv);                  \
+  NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args);                                                          \
+  nvbench::option_parser parser;                                                                   \
+  NVBENCH_MAIN_PARSE_CUSTOM_PRE(parser, args);                                                     \
+  parser.parse(args);                                                                              \
+  NVBENCH_MAIN_PARSE_CUSTOM_POST(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_PRINT_PREAMBLE
+#define NVBENCH_MAIN_PRINT_PREAMBLE(parser) nvbench::detail::main_print_preamble(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_RUN_BENCHMARKS
+#define NVBENCH_MAIN_RUN_BENCHMARKS(parser) nvbench::detail::main_run_benchmarks(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_PRINT_EPILOGUE
+#define NVBENCH_MAIN_PRINT_EPILOGUE(parser) nvbench::detail::main_print_epilogue(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_PRINT_RESULTS
+#define NVBENCH_MAIN_PRINT_RESULTS(parser) nvbench::detail::main_print_results(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_FINALIZE
+#define NVBENCH_MAIN_FINALIZE()                                                                    \
+  NVBENCH_MAIN_FINALIZE_CUSTOM_PRE();                                                              \
+  nvbench::detail::main_finalize();                                                                \
+  NVBENCH_MAIN_FINALIZE_CUSTOM_POST()
+#endif
+
+#ifndef NVBENCH_MAIN_CATCH_EXCEPTIONS
+#define NVBENCH_MAIN_CATCH_EXCEPTIONS                                                              \
   catch (std::exception & e)                                                                       \
   {                                                                                                \
     std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";                        \
@@ -46,56 +162,87 @@
     std::cerr << "\nNVBench encountered an unknown error.\n";                                      \
     return 1;                                                                                      \
   }
-
-#ifdef NVBENCH_HAS_CUPTI
-#define NVBENCH_INITIALIZE_DRIVER_API NVBENCH_DRIVER_API_CALL(cuInit(0))
-#else
-// clang-format off
-#define NVBENCH_INITIALIZE_DRIVER_API do {} while (false)
-// clang-format on
 #endif
 
-#define NVBENCH_MAIN_PARSE(argc, argv)                                                             \
-  nvbench::option_parser parser;                                                                   \
-  parser.parse(argc, argv)
+namespace nvbench::detail
+{
 
-// See NVIDIA/NVBench#136 for CUDA_MODULE_LOADING
+inline void set_env(const char *name, const char *value)
+{
 #ifdef _MSC_VER
-#define NVBENCH_INITIALIZE_CUDA_ENV _putenv_s("CUDA_MODULE_LOADING", "EAGER")
+  _putenv_s(name, value);
 #else
-#define NVBENCH_INITIALIZE_CUDA_ENV setenv("CUDA_MODULE_LOADING", "EAGER", 1)
+  setenv(name, value, 1);
 #endif
+}
 
-#define NVBENCH_INITIALIZE_BENCHMARKS()                                                            \
-  nvbench::benchmark_manager::get().initialize()
+inline void main_initialize(int, char **)
+{
+  // See NVIDIA/NVBench#136 for CUDA_MODULE_LOADING
+  set_env("CUDA_MODULE_LOADING", "EAGER");
 
-#define NVBENCH_MAIN_BODY(argc, argv)                                                              \
-  do                                                                                               \
-  {                                                                                                \
-    NVBENCH_INITIALIZE_CUDA_ENV;                                                                   \
-    NVBENCH_INITIALIZE_DRIVER_API;                                                                 \
-    NVBENCH_INITIALIZE_BENCHMARKS();                                                               \
-    NVBENCH_MAIN_PARSE(argc, argv);                                                                \
-    auto &printer = parser.get_printer();                                                          \
-                                                                                                   \
-    printer.print_device_info();                                                                   \
-    printer.print_log_preamble();                                                                  \
-    auto &benchmarks = parser.get_benchmarks();                                                    \
-                                                                                                   \
-    std::size_t total_states = 0;                                                                  \
-    for (auto &bench_ptr : benchmarks)                                                             \
-    {                                                                                              \
-      total_states += bench_ptr->get_config_count();                                               \
-    }                                                                                              \
-    printer.set_total_state_count(total_states);                                                   \
-                                                                                                   \
-    printer.set_completed_state_count(0);                                                          \
-    for (auto &bench_ptr : benchmarks)                                                             \
-    {                                                                                              \
-      bench_ptr->set_printer(printer);                                                             \
-      bench_ptr->run();                                                                            \
-      bench_ptr->clear_printer();                                                                  \
-    }                                                                                              \
-    printer.print_log_epilogue();                                                                  \
-    printer.print_benchmark_results(benchmarks);                                                   \
-  } while (false)
+  // Initialize CUDA driver API if needed:
+#ifdef NVBENCH_HAS_CUPTI
+  NVBENCH_DRIVER_API_CALL(cuInit(0));
+#endif
+
+  // Initialize the benchmarks *after* setting up the CUDA environment:
+  nvbench::benchmark_manager::get().initialize();
+}
+
+inline std::vector<std::string> main_convert_args(int argc, char const *const *argv)
+{
+  std::vector<std::string> args;
+  for (int i = 0; i < argc; ++i)
+  {
+    args.push_back(argv[i]);
+  }
+  return args;
+}
+
+inline void main_print_preamble(option_parser &parser)
+{
+  auto &printer = parser.get_printer();
+
+  printer.print_device_info();
+  printer.print_log_preamble();
+}
+
+inline void main_run_benchmarks(option_parser &parser)
+{
+  auto &printer    = parser.get_printer();
+  auto &benchmarks = parser.get_benchmarks();
+
+  std::size_t total_states = 0;
+  for (auto &bench_ptr : benchmarks)
+  {
+    total_states += bench_ptr->get_config_count();
+  }
+
+  printer.set_completed_state_count(0);
+  printer.set_total_state_count(total_states);
+
+  for (auto &bench_ptr : benchmarks)
+  {
+    bench_ptr->set_printer(printer);
+    bench_ptr->run();
+    bench_ptr->clear_printer();
+  }
+}
+
+inline void main_print_epilogue(option_parser &parser)
+{
+  auto &printer = parser.get_printer();
+  printer.print_log_epilogue();
+}
+
+inline void main_print_results(option_parser &parser)
+{
+  auto &printer    = parser.get_printer();
+  auto &benchmarks = parser.get_benchmarks();
+  printer.print_benchmark_results(benchmarks);
+}
+
+inline void main_finalize() { NVBENCH_CUDA_CALL(cudaDeviceReset()); }
+
+} // namespace nvbench::detail
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index 1535878..55eb741 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -6,6 +6,8 @@ set(test_srcs
   cpu_timer.cu
   criterion_manager.cu
   criterion_params.cu
+  custom_main_custom_args.cu
+  custom_main_custom_exceptions.cu
   enum_type_list.cu
   entropy_criterion.cu
   float64_axis.cu
@@ -24,7 +26,12 @@ set(test_srcs
   type_list.cu
 )
 
-# Metatarget for all examples:
+# Custom arguments:
+# CTest commands+args can't be modified after creation, so we need to rely on substitution.
+set(NVBench_TEST_ARGS_nvbench.test.custom_main_custom_args "--quiet" "--my-custom-arg" "--run-once" "-d" "0")
+set(NVBench_TEST_ARGS_nvbench.test.custom_main_custom_exceptions "--quiet" "--run-once" "-d" "0")
+
+# Metatarget for all tests:
 add_custom_target(nvbench.test.all)
 add_dependencies(nvbench.all nvbench.test.all)
 
@@ -36,10 +43,14 @@ foreach(test_src IN LISTS test_srcs)
   target_link_libraries(${test_name} PRIVATE nvbench::nvbench fmt)
   set_target_properties(${test_name} PROPERTIES COMPILE_FEATURES cuda_std_17)
   nvbench_config_target(${test_name})
-  add_test(NAME ${test_name} COMMAND "$<TARGET_FILE:${test_name}>")
+  add_test(NAME ${test_name} COMMAND "$<TARGET_FILE:${test_name}>" ${NVBench_TEST_ARGS_${test_name}})
 
   add_dependencies(nvbench.test.all ${test_name})
 endforeach()
 
+set_tests_properties(nvbench.test.custom_main_custom_exceptions PROPERTIES
+  PASS_REGULAR_EXPRESSION "Custom error detected: Expected exception thrown."
+)
+
 add_subdirectory(cmake)
 add_subdirectory(device)
diff --git a/testing/custom_main_custom_args.cu b/testing/custom_main_custom_args.cu
new file mode 100644
index 0000000..f7e331e
--- /dev/null
+++ b/testing/custom_main_custom_args.cu
@@ -0,0 +1,132 @@
+/*
+ *  Copyright 2024 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+#include "nvbench/cuda_call.cuh"
+
+/******************************************************************************
+ * Install custom parser.
+ * See <nvbench/main.cuh> for more details.
+ ******************************************************************************/
+
+//
+// Step 1: Define a custom argument handler that accepts a vector of strings.
+//          - This handler should modify the vector in place to remove any custom
+//            arguments it handles. NVBench will then parse the remaining arguments.
+//          - The handler should also update any application state needed to handle
+//            the custom arguments.
+//
+
+// User code to handle a specific argument:
+void handle_my_custom_arg();
+
+// NVBench hook for modifying the command line arguments before parsing:
+void custom_arg_handler(std::vector<std::string> &args)
+{
+  // Handle and remove "--my-custom-arg"
+  if (auto it = std::find(args.begin(), args.end(), "--my-custom-arg"); it != args.end())
+  {
+    handle_my_custom_arg();
+    args.erase(it);
+  }
+}
+
+//
+// Step 2: Install the custom argument handler.
+//         - This is done by defining a macro that invokes the custom argument handler.
+//
+
+// Install the custom argument handler:
+// Either define this before any NVBench headers are included, or undefine and redefine:
+#undef NVBENCH_MAIN_CUSTOM_ARGS_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args) custom_arg_handler(args)
+
+// Step 3: Define `main`
+//
+// After installing the custom argument handler, define the main function using:
+//
+// ```
+// NVBENCH_MAIN
+// ```
+//
+// Here, this is done at the end of this file.
+
+/******************************************************************************
+ * Unit test verification:
+ ******************************************************************************/
+
+// Track whether the args are found / handled.
+bool h_custom_arg_found             = false;
+bool h_handled_on_device            = false;
+__device__ bool d_custom_arg_found  = false;
+__device__ bool d_handled_on_device = false;
+
+// Copy host values to device:
+void copy_host_state_to_device()
+{
+  NVBENCH_CUDA_CALL(cudaMemcpyToSymbol(d_custom_arg_found, &h_custom_arg_found, sizeof(bool)));
+  NVBENCH_CUDA_CALL(cudaMemcpyToSymbol(d_handled_on_device, &h_handled_on_device, sizeof(bool)));
+}
+
+// Copy device values to host:
+void copy_device_state_to_host()
+{
+  NVBENCH_CUDA_CALL(cudaMemcpyFromSymbol(&h_custom_arg_found, d_custom_arg_found, sizeof(bool)));
+  NVBENCH_CUDA_CALL(cudaMemcpyFromSymbol(&h_handled_on_device, d_handled_on_device, sizeof(bool)));
+}
+
+void handle_my_custom_arg()
+{
+  h_custom_arg_found = true;
+  copy_host_state_to_device();
+}
+
+void verify()
+{
+  copy_device_state_to_host();
+  if (!h_custom_arg_found)
+  {
+    throw std::runtime_error("Custom argument not detected.");
+  }
+  if (!h_handled_on_device)
+  {
+    throw std::runtime_error("Custom argument not handled on device.");
+  }
+}
+
+// Install a verification check to ensure the custom argument was handled.
+// Use the `PRE` finalize hook to ensure we check device state before resetting the context.
+#undef NVBENCH_MAIN_FINALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_FINALIZE_CUSTOM_PRE() verify()
+
+// Simple kernel/benchmark to make sure that the handler can successfully modify CUDA state:
+__global__ void kernel()
+{
+  if (d_custom_arg_found)
+  {
+    d_handled_on_device = true;
+  }
+}
+void bench(nvbench::state &state)
+{
+  state.exec([](nvbench::launch &) { kernel<<<1, 1>>>(); });
+}
+NVBENCH_BENCH(bench);
+
+// Define the customized main function:
+NVBENCH_MAIN
diff --git a/testing/custom_main_custom_exceptions.cu b/testing/custom_main_custom_exceptions.cu
new file mode 100644
index 0000000..b1f9b9c
--- /dev/null
+++ b/testing/custom_main_custom_exceptions.cu
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2022 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+#include <stdexcept>
+
+/******************************************************************************
+ * Install exception handler around the NVBench main body. This is used
+ * to print helpful information when a user exception is thrown before exiting.
+ *
+ * Note that this will **NOT** be used when a benchmark throws an exception.
+ * That will fail the benchmark and note the exception, and continue
+ * execution.
+ *
+ * This is used to catch exceptions in user extensions of NVBench, things like
+ * customized initialization, command line parsing, finalization, etc. See
+ * <nvbench/main.cuh> for more details.
+ ******************************************************************************/
+
+struct user_exception : public std::runtime_error
+{
+  user_exception()
+      : std::runtime_error("Expected exception thrown.")
+  {}
+};
+
+// User code to handle user exception:
+void handle_my_exception(user_exception &e)
+{
+  std::cerr << "Custom error detected: " << e.what() << std::endl;
+  std::exit(1);
+}
+
+// Install the exception handler around the NVBench main body.
+// NVBench will have sensible defaults for common exceptions following this if no terminating catch
+// block is defined.
+// Either define this before any NVBench headers are included, or undefine and redefine.
+#undef NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM
+#define NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM                                                       \
+  catch (user_exception & e) { handle_my_exception(e); }
+
+// For testing purposes, install an argument parser that throws:
+void really_robust_argument_parser(std::vector<std::string> &) { throw user_exception(); }
+#undef NVBENCH_MAIN_CUSTOM_ARGS_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args) really_robust_argument_parser(args)
+
+// Define the customized main function:
+NVBENCH_MAIN

From 5ee8811a1ac5a90f73a4dc52ab8572c25724a0e8 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Tue, 9 Apr 2024 17:27:49 -0400
Subject: [PATCH 4/5] Fix and test using RAII global state in `main`. (#168)

---
 nvbench/main.cuh                         |  13 ++-
 testing/CMakeLists.txt                   |   1 +
 testing/custom_main_global_state_raii.cu | 121 +++++++++++++++++++++++
 3 files changed, 131 insertions(+), 4 deletions(-)
 create mode 100644 testing/custom_main_global_state_raii.cu

diff --git a/nvbench/main.cuh b/nvbench/main.cuh
index 84b8ebb..cd809ba 100644
--- a/nvbench/main.cuh
+++ b/nvbench/main.cuh
@@ -111,9 +111,11 @@
 
 #ifndef NVBENCH_MAIN_INITIALIZE
 #define NVBENCH_MAIN_INITIALIZE(argc, argv)                                                        \
-  NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE(argc, argv);                                                  \
-  nvbench::detail::main_initialize(argc, argv);                                                    \
-  NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv)
+  { /* Outer scope: objects from INITIALIZE_CUSTOM_PRE are destroyed after main_finalize(). */     \
+    NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE(argc, argv);                                                \
+    nvbench::detail::main_initialize(argc, argv);                                                  \
+    { /* Inner scope: objects from INITIALIZE_CUSTOM_POST are destroyed before main_finalize(). */ \
+      NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv)
 #endif
 
 #ifndef NVBENCH_MAIN_PARSE
@@ -146,8 +148,11 @@
 #ifndef NVBENCH_MAIN_FINALIZE
 #define NVBENCH_MAIN_FINALIZE()                                                                    \
   NVBENCH_MAIN_FINALIZE_CUSTOM_PRE();                                                              \
+  } /* Close the inner scope: INITIALIZE_CUSTOM_POST objects are destroyed here. */                \
   nvbench::detail::main_finalize();                                                                \
-  NVBENCH_MAIN_FINALIZE_CUSTOM_POST()
+  NVBENCH_MAIN_FINALIZE_CUSTOM_POST();                                                             \
+  } /* Close the outer scope: INITIALIZE_CUSTOM_PRE objects are destroyed here. */                 \
+  []() {}() /* No-op expression; presumably lets the macro invocation end with ';'. */
 #endif
 
 #ifndef NVBENCH_MAIN_CATCH_EXCEPTIONS
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index 55eb741..4a03000 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -8,6 +8,7 @@ set(test_srcs
   criterion_params.cu
   custom_main_custom_args.cu
   custom_main_custom_exceptions.cu
+  custom_main_global_state_raii.cu
   enum_type_list.cu
   entropy_criterion.cu
   float64_axis.cu
diff --git a/testing/custom_main_global_state_raii.cu b/testing/custom_main_global_state_raii.cu
new file mode 100644
index 0000000..e3584ab
--- /dev/null
+++ b/testing/custom_main_global_state_raii.cu
@@ -0,0 +1,121 @@
+/*
+ *  Copyright 2024 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstdio>
+
+/******************************************************************************
+ * Test having global state that is initialized and finalized via RAII.
+ *****************************************************************************/
+
+struct raii
+{
+  const char m_ref_data[6]; // Expected payload; 6 bytes, not NUL-terminated.
+  char *m_data;             // Owned buffer: device memory if m_cuda, else host memory.
+  bool m_cuda;
+
+  const char *m_outer_data; // Optional non-owning buffer that verify() also checks.
+  bool m_outer_cuda;        // Whether m_outer_data points to device memory.
+
+  // m_data is an owning raw pointer, so a copy would free it twice.
+  raii(const raii &) = delete;
+  raii &operator=(const raii &) = delete;
+
+  explicit raii(bool cuda, char *outer_data = nullptr, bool outer_cuda = false)
+      : m_ref_data{'a', 'b', 'c', '1', '2', '3'}
+      , m_data(nullptr)
+      , m_cuda(cuda)
+      , m_outer_data(outer_data)
+      , m_outer_cuda(outer_cuda)
+  {
+    if (m_cuda)
+    {
+      printf("(%p) RAII test: allocating device memory\n", this);
+      NVBENCH_CUDA_CALL(cudaMalloc(&m_data, 6));
+      NVBENCH_CUDA_CALL(cudaMemcpy(m_data, m_ref_data, 6, cudaMemcpyHostToDevice));
+    }
+    else
+    {
+      printf("(%p) RAII test: allocating host memory\n", this);
+      m_data = new char[6];
+      std::copy(m_ref_data, m_ref_data + 6, m_data);
+    }
+  }
+
+  // Verifies the buffer(s) one last time, then zeroes and frees m_data.
+  ~raii()
+  {
+    this->verify();
+    if (m_cuda)
+    {
+      printf("(%p) RAII test: invalidating device memory\n", this);
+      NVBENCH_CUDA_CALL(cudaMemset(m_data, 0, 6));
+      printf("(%p) RAII test: freeing device memory\n", this);
+      NVBENCH_CUDA_CALL(cudaFree(m_data));
+    }
+    else
+    {
+      printf("(%p) RAII test: invalidating host memory\n", this);
+      std::fill(m_data, m_data + 6, '\0');
+      printf("(%p) RAII test: freeing host memory\n", this);
+      delete[] m_data;
+    }
+  }
+
+  // Checks this instance's buffer and, if one was given, the outer buffer.
+  void verify() noexcept
+  {
+    printf("(%p) RAII test: verifying instance state\n", this);
+    this->verify(m_cuda, m_data);
+    if (m_outer_data)
+    {
+      printf("(%p) RAII test: verifying outer state\n", this);
+      this->verify(m_outer_cuda, m_outer_data);
+    }
+  }
+
+  // Exits with status 1 unless all 6 bytes at `data` match m_ref_data (compared as raw bytes).
+  void verify(bool cuda, const char *data) noexcept
+  {
+    char host_copy[6];
+    if (cuda)
+    {
+      NVBENCH_CUDA_CALL(cudaMemcpy(host_copy, data, 6, cudaMemcpyDeviceToHost));
+      data = host_copy; // Compare the staged host copy below.
+    }
+    if (!std::equal(data, data + 6, m_ref_data))
+    {
+      printf("(%p) RAII test failed: %s data mismatch\n", this, cuda ? "device" : "host");
+      std::exit(1);
+    }
+  }
+};
+
+// These will be destroyed in the reverse of the order in which they are created:
+
+#undef NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE(argc, argv) raii raii_outer(false);
+
+#undef NVBENCH_MAIN_INITIALIZE_CUSTOM_POST
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv)                                            \
+  [[maybe_unused]] raii raii_inner(true, raii_outer.m_data, raii_outer.m_cuda);
+
+NVBENCH_MAIN

From 60761e0946faaef9b7acb096d0db0c840dc52dc4 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Wed, 10 Apr 2024 13:45:53 -0400
Subject: [PATCH 5/5] Enable extra NVBench features in windows build. (#169)

* Enable extra NVBench features in windows build.

These were delayed as they required changes to the devcontainers.

* Revamp nvml.dll logic.
---
 ci/matrix.yaml          | 72 ++++++++++++++++++-------------------
 cmake/NVBenchNVML.cmake | 80 ++++++++++++++++++++++-------------------
 2 files changed, 79 insertions(+), 73 deletions(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 8308d63..563dac0 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -47,40 +47,40 @@ msvc2022: &msvc2022 { name: 'cl', version: '14.39', exe: 'cl++' }
 # Configurations that will run for every PR
 pull_request:
   nvcc:
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7   }
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8   }
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9   }
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9  }
-    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11  }
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7   }
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8   }
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9   }
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10  }
-    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11  }
-    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12  }
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9  }
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10 }
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11 }
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12 }
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13 }
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14 }
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7   }
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8   }
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9   }
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10  }
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11  }
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12  }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7     }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8     }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9     }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9    }
+    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7     }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8     }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9     }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7     }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8     }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9     }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12    }
     # Fails to compile simple input on CTK12.4. Try to add later.
-    # {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc13  }
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9  }
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10 }
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11 }
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12 }
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13 }
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14 }
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15 }
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16 }
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm17 }
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm18,   extra_build_args: "-cmake-options '-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler'"}
-    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2019, extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
-    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2022, extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
+    # {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc13    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm17   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm18,  extra_build_args: "-cmake-options '-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler'"}
+    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2019 }
+    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2022 }
diff --git a/cmake/NVBenchNVML.cmake b/cmake/NVBenchNVML.cmake
index f2aadbb..4b005f3 100644
--- a/cmake/NVBenchNVML.cmake
+++ b/cmake/NVBenchNVML.cmake
@@ -1,37 +1,43 @@
-# Since this file is installed, we need to make sure that the CUDAToolkit has
-# been found by consumers:
-if (NOT TARGET CUDA::toolkit)
-  find_package(CUDAToolkit REQUIRED)
-endif()
-
-if (WIN32)
-  # The CUDA:: targets currently don't provide dll locations through the
-  # `IMPORTED_LOCATION` property, nor are they marked as `SHARED` libraries
-  # (they're currently `UNKNOWN`). This prevents the `nvbench_setup_dep_dlls`
-  # CMake function from copying the dlls to the build / install directories.
-  # This is discussed in https://gitlab.kitware.com/cmake/cmake/-/issues/22845
-  # and the other CMake issues it links to.
-  #
-  # We create a nvbench-specific target that configures the nvml interface as
-  # described here:
-  # https://gitlab.kitware.com/cmake/cmake/-/issues/22845#note_1077538
-  #
-  # Use find_file instead of find_library, which would search for a .lib file.
-  # This is also nice because find_file searches recursively (find_library
-  # does not) and some versions of CTK nest nvml.dll several directories deep
-  # under C:\Windows\System32.
-  find_file(NVBench_NVML_DLL nvml.dll REQUIRED
-    DOC "The full path to nvml.dll. Usually somewhere under C:/Windows/System32."
-    PATHS "C:/Windows/System32"
-  )
-  mark_as_advanced(NVBench_NVML_DLL)
-  add_library(nvbench::nvml SHARED IMPORTED)
-  target_link_libraries(nvbench::nvml INTERFACE CUDA::toolkit)
-  set_target_properties(nvbench::nvml PROPERTIES
-    IMPORTED_LOCATION "${NVBench_NVML_DLL}"
-    IMPORTED_IMPLIB "${CUDA_nvml_LIBRARY}"
-  )
-else()
-  # Linux is much easier...
-  add_library(nvbench::nvml ALIAS CUDA::nvml)
-endif()
+# Since this file is installed, we need to make sure that the CUDAToolkit has
+# been found by consumers:
+if (NOT TARGET CUDA::toolkit)
+  find_package(CUDAToolkit REQUIRED)
+endif()
+
+if (WIN32)
+  # The CUDA:: targets currently don't provide dll locations through the
+  # `IMPORTED_LOCATION` property, nor are they marked as `SHARED` libraries
+  # (they're currently `UNKNOWN`). This prevents the `nvbench_setup_dep_dlls`
+  # CMake function from copying the dlls to the build / install directories.
+  # This is discussed in https://gitlab.kitware.com/cmake/cmake/-/issues/22845
+  # and the other CMake issues it links to.
+  #
+  # We create an nvbench-specific target that configures the nvml interface as
+  # described in https://gitlab.kitware.com/cmake/cmake/-/issues/22845#note_1077538
+  #
+  # Use find_file instead of find_library, which would search for a .lib file.
+  # Some versions of CTK nest nvml.dll several directories deep under
+  # C:\Windows\System32. NOTE(review): find_file is not documented to recurse,
+  # so such installs presumably need NVBench_NVML_DLL set explicitly -- confirm.
+  find_file(NVBench_NVML_DLL nvml.dll
+    DOC "The full path to nvml.dll. Usually somewhere under C:/Windows/System32."
+    PATHS "C:/Windows/System32"
+  )
+  mark_as_advanced(NVBench_NVML_DLL)
+endif()
+
+# Use the dll-backed imported target when nvml.dll is known; otherwise alias CUDA::nvml.
+if (NVBench_NVML_DLL)
+  add_library(nvbench::nvml SHARED IMPORTED)
+  target_link_libraries(nvbench::nvml INTERFACE CUDA::toolkit)
+  set_target_properties(nvbench::nvml PROPERTIES
+    IMPORTED_LOCATION "${NVBench_NVML_DLL}"
+    IMPORTED_IMPLIB "${CUDA_nvml_LIBRARY}"
+  )
+elseif(TARGET CUDA::nvml)
+  add_library(nvbench::nvml ALIAS CUDA::nvml)
+else()
+  message(FATAL_ERROR "Could not find nvml.dll or CUDA::nvml target. "
+          "Set -DNVBench_ENABLE_NVML=OFF to disable NVML support "
+          "or set -DNVBench_NVML_DLL to the full path to nvml.dll on Windows.")
+endif()