From 8f92f51bbd5fb2b3801feb6c1988564feee524f9 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Tue, 2 May 2023 18:40:06 +0400
Subject: [PATCH] Build one ct workload at a time

---
 benchmarks/bench/reduce/max.cu               |  2 +-
 benchmarks/bench/reduce/sum.cu               |  2 +-
 benchmarks/nvbench_helper/nvbench_helper.cu  | 22 +++++------
 benchmarks/nvbench_helper/nvbench_helper.cuh | 39 +++++++++++---------
 benchmarks/scripts/cub/bench/bench.py        | 36 +++++++++++++++++-
 benchmarks/scripts/cub/bench/cmake.py        |  3 +-
 benchmarks/scripts/cub/bench/search.py       |  2 +-
 7 files changed, 70 insertions(+), 36 deletions(-)
diff --git a/benchmarks/bench/reduce/max.cu b/benchmarks/bench/reduce/max.cu
index ffa2aac07c..3edf253fd8 100644
--- a/benchmarks/bench/reduce/max.cu
+++ b/benchmarks/bench/reduce/max.cu
@@ -2,7 +2,7 @@
 
 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
-// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:3:1
+// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
 
 using op_t = max_t;
 #include "base.cuh"
diff --git a/benchmarks/bench/reduce/sum.cu b/benchmarks/bench/reduce/sum.cu
index 891b6dfea9..fc2154872d 100644
--- a/benchmarks/bench/reduce/sum.cu
+++ b/benchmarks/bench/reduce/sum.cu
@@ -2,7 +2,7 @@
 
 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
-// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:3:1
+// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
 
 using op_t = cub::Sum;
 #include "base.cuh"
\ No newline at end of file
diff --git a/benchmarks/nvbench_helper/nvbench_helper.cu b/benchmarks/nvbench_helper/nvbench_helper.cu
index 72d90b450e..0ba2373640 100644
--- a/benchmarks/nvbench_helper/nvbench_helper.cu
+++ b/benchmarks/nvbench_helper/nvbench_helper.cu
@@ -280,17 +280,17 @@ void gen(seed_t seed, thrust::device_vector<T> &data, bit_entropy entropy, T min
 
 #define INSTANTIATE(TYPE) INSTANTIATE_RND(TYPE);
 
-INSTANTIATE(std::uint8_t);
-INSTANTIATE(std::uint16_t);
-INSTANTIATE(std::uint32_t);
-INSTANTIATE(std::uint64_t);
-INSTANTIATE(__uint128_t);
-
-INSTANTIATE(std::int8_t);
-INSTANTIATE(std::int16_t);
-INSTANTIATE(std::int32_t);
-INSTANTIATE(std::int64_t);
-INSTANTIATE(__int128_t);
+INSTANTIATE(uint8_t);
+INSTANTIATE(uint16_t);
+INSTANTIATE(uint32_t);
+INSTANTIATE(uint64_t);
+INSTANTIATE(uint128_t);
+
+INSTANTIATE(int8_t);
+INSTANTIATE(int16_t);
+INSTANTIATE(int32_t);
+INSTANTIATE(int64_t);
+INSTANTIATE(int128_t);
 
 INSTANTIATE(float);
 INSTANTIATE(double);
diff --git a/benchmarks/nvbench_helper/nvbench_helper.cuh b/benchmarks/nvbench_helper/nvbench_helper.cuh
index da214098b5..8bcad1bb82 100644
--- a/benchmarks/nvbench_helper/nvbench_helper.cuh
+++ b/benchmarks/nvbench_helper/nvbench_helper.cuh
@@ -10,10 +10,12 @@
 #include <nvbench/nvbench.cuh>
 
 using complex = cuda::std::complex<float>;
+using int128_t = __int128_t;
+using uint128_t = __uint128_t;
 
-NVBENCH_DECLARE_TYPE_STRINGS(__int128_t, "I128", "int128_t");
-NVBENCH_DECLARE_TYPE_STRINGS(__uint128_t, "U128", "uint128_t");
-NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex<float>");
+NVBENCH_DECLARE_TYPE_STRINGS(int128_t, "I128", "int128_t");
+NVBENCH_DECLARE_TYPE_STRINGS(uint128_t, "U128", "uint128_t");
+NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex");
 
 namespace detail 
 {
@@ -33,28 +35,29 @@ struct push_back<T, nvbench::type_list<As...>>
 template <class T, class List>
 using push_back_t = typename detail::push_back<T, List>::type;
 
-#ifdef TUNE_OFFSET_TYPE
-using offset_types = nvbench::type_list<TUNE_OFFSET_TYPE>;
+#ifdef TUNE_OffsetT
+using offset_types = nvbench::type_list<TUNE_OffsetT>;
 #else
-using offset_types = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
+using offset_types = nvbench::type_list<int32_t, int64_t>;
 #endif
 
-#ifdef TUNE_TYPE
-using all_value_types = nvbench::type_list<TUNE_TYPE>;
+#ifdef TUNE_T
+using fundamental_types = nvbench::type_list<TUNE_T>;
+using all_types = nvbench::type_list<TUNE_T>;
 #else
-using fundamental_types = nvbench::type_list<nvbench::int8_t,
-                                             nvbench::int16_t,
-                                             nvbench::int32_t,
-                                             nvbench::int64_t,
-                                             __int128_t,
+using fundamental_types = nvbench::type_list<int8_t,
+                                             int16_t,
+                                             int32_t,
+                                             int64_t,
+                                             int128_t,
                                              float,
                                              double>;
                                              
-using all_types = nvbench::type_list<nvbench::int8_t,
-                                     nvbench::int16_t,
-                                     nvbench::int32_t,
-                                     nvbench::int64_t,
-                                     __int128_t,
+using all_types = nvbench::type_list<int8_t,
+                                     int16_t,
+                                     int32_t,
+                                     int64_t,
+                                     int128_t,
                                      float,
                                      double,
                                      complex>;
diff --git a/benchmarks/scripts/cub/bench/bench.py b/benchmarks/scripts/cub/bench/bench.py
index 843854cdbc..22c334ea8c 100644
--- a/benchmarks/scripts/cub/bench/bench.py
+++ b/benchmarks/scripts/cub/bench/bench.py
@@ -244,9 +244,10 @@ def pull_elapsed(self, bench, workload_point):
 
 
 class Bench:
-    def __init__(self, algorithm_name, variant):
+    def __init__(self, algorithm_name, variant, ct_workload):
         self.algname = algorithm_name
         self.variant = variant
+        self.ct_workload = ct_workload
 
     def label(self):
         return self.algname + '.' + self.variant.label()
@@ -315,6 +316,24 @@ def axes_values(self, sub_space, ct):
 
         return space
     
+    def axes_value_descriptions(self):
+        """Map each axis label to a dict of value input string -> description."""
+        benchmarks = json_benches(self.algname)["benchmarks"]
+        if len(benchmarks) != 1:
+            raise Exception("Executable should contain exactly one benchmark")
+
+        by_axis = {}
+        for axis in benchmarks[0]["axes"]:
+            # Axes with non-empty flags are keyed as "name[flags]".
+            label = axis["name"]
+            flags = axis["flags"]
+            if flags:
+                label = "{}[{}]".format(label, flags)
+            by_axis[label] = {v["input_string"]: v["description"]
+                              for v in axis["values"]}
+        return by_axis
+
+    
     def axis_values(self, axis_name):
         result = json_benches(self.algname)
 
@@ -341,6 +360,19 @@ def axis_values(self, axis_name):
     def build(self):
         build = CMake().build(self)
         return build.code == 0
+    
+    def definitions(self):
+        """Return variant tuning text followed by one TUNE_<axis> define per ct_workload entry."""
+        definitions = self.variant.tuning() + "\n"
+
+        descriptions = self.axes_value_descriptions()
+        for ct_component in self.ct_workload:
+            # Split on the first '=' only, so a value that itself contains '=' stays whole.
+            ct_axis_name, ct_value = ct_component.split('=', 1)
+            macro_suffix = ct_axis_name.replace('{ct}', '')
+            definitions += "#define TUNE_{} {}\n".format(macro_suffix,
+                                                         descriptions[ct_axis_name][ct_value])
+        return definitions
 
     def do_run(self, point, timeout):
         try:
@@ -483,4 +515,4 @@ def score(self, ct_workload, rt_workload_space, base_estimator, variant_estimato
 
 class BaseBench(Bench):
     def __init__(self, algname):
-        super().__init__(algname, BasePoint())
+        super().__init__(algname, BasePoint(), [])
diff --git a/benchmarks/scripts/cub/bench/cmake.py b/benchmarks/scripts/cub/bench/cmake.py
index 5558de85d8..7dfe0d5c3d 100644
--- a/benchmarks/scripts/cub/bench/cmake.py
+++ b/benchmarks/scripts/cub/bench/cmake.py
@@ -64,9 +64,8 @@ def __init__(self):
   def do_build(self, bench, timeout):
       try:
           if not bench.is_base():
-              # TODO populate tuning file
               with open(bench.exe_name() + ".h", "w") as f:
-                  f.writelines(bench.variant.tuning())
+                  f.writelines(bench.definitions())
 
           cmd = ["cmake", "--build", ".", "--target", bench.exe_name()]
 
diff --git a/benchmarks/scripts/cub/bench/search.py b/benchmarks/scripts/cub/bench/search.py
index 83adde723e..8cecb11a50 100644
--- a/benchmarks/scripts/cub/bench/search.py
+++ b/benchmarks/scripts/cub/bench/search.py
@@ -132,7 +132,7 @@ def __call__(self, algname, ct_workload_space, rt_workload_space):
 
         for ct_workload in ct_workload_space:
             for variant in variants:
-                bench = Bench(algname, variant)
+                bench = Bench(algname, variant, list(ct_workload))
                 score = bench.score(ct_workload, 
                                     rt_workload_space, 
                                     self.base_center_estimator,