From 8f92f51bbd5fb2b3801feb6c1988564feee524f9 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 2 May 2023 18:40:06 +0400 Subject: [PATCH] Build one ct workload at a time --- benchmarks/bench/reduce/max.cu | 2 +- benchmarks/bench/reduce/sum.cu | 2 +- benchmarks/nvbench_helper/nvbench_helper.cu | 22 +++++------ benchmarks/nvbench_helper/nvbench_helper.cuh | 39 +++++++++++--------- benchmarks/scripts/cub/bench/bench.py | 36 +++++++++++++++++- benchmarks/scripts/cub/bench/cmake.py | 3 +- benchmarks/scripts/cub/bench/search.py | 2 +- 7 files changed, 70 insertions(+), 36 deletions(-) diff --git a/benchmarks/bench/reduce/max.cu b/benchmarks/bench/reduce/max.cu index ffa2aac07c..3edf253fd8 100644 --- a/benchmarks/bench/reduce/max.cu +++ b/benchmarks/bench/reduce/max.cu @@ -2,7 +2,7 @@ // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 -// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:3:1 +// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 using op_t = max_t; #include "base.cuh" diff --git a/benchmarks/bench/reduce/sum.cu b/benchmarks/bench/reduce/sum.cu index 891b6dfea9..fc2154872d 100644 --- a/benchmarks/bench/reduce/sum.cu +++ b/benchmarks/bench/reduce/sum.cu @@ -2,7 +2,7 @@ // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 -// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:3:1 +// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 using op_t = cub::Sum; #include "base.cuh" \ No newline at end of file diff --git a/benchmarks/nvbench_helper/nvbench_helper.cu b/benchmarks/nvbench_helper/nvbench_helper.cu index 72d90b450e..0ba2373640 100644 --- a/benchmarks/nvbench_helper/nvbench_helper.cu +++ b/benchmarks/nvbench_helper/nvbench_helper.cu @@ -280,17 +280,17 @@ void gen(seed_t seed, thrust::device_vector &data, bit_entropy entropy, T min #define INSTANTIATE(TYPE) INSTANTIATE_RND(TYPE); -INSTANTIATE(std::uint8_t); -INSTANTIATE(std::uint16_t); -INSTANTIATE(std::uint32_t); -INSTANTIATE(std::uint64_t); -INSTANTIATE(__uint128_t); - -INSTANTIATE(std::int8_t); -INSTANTIATE(std::int16_t); -INSTANTIATE(std::int32_t); -INSTANTIATE(std::int64_t); -INSTANTIATE(__int128_t); +INSTANTIATE(uint8_t); +INSTANTIATE(uint16_t); +INSTANTIATE(uint32_t); +INSTANTIATE(uint64_t); +INSTANTIATE(uint128_t); + +INSTANTIATE(int8_t); +INSTANTIATE(int16_t); +INSTANTIATE(int32_t); +INSTANTIATE(int64_t); +INSTANTIATE(int128_t); INSTANTIATE(float); INSTANTIATE(double); diff --git a/benchmarks/nvbench_helper/nvbench_helper.cuh b/benchmarks/nvbench_helper/nvbench_helper.cuh index da214098b5..8bcad1bb82 100644 --- a/benchmarks/nvbench_helper/nvbench_helper.cuh +++ b/benchmarks/nvbench_helper/nvbench_helper.cuh @@ -10,10 +10,12 @@ #include using complex = cuda::std::complex; +using int128_t = __int128_t; +using uint128_t = __uint128_t; -NVBENCH_DECLARE_TYPE_STRINGS(__int128_t, "I128", "int128_t"); -NVBENCH_DECLARE_TYPE_STRINGS(__uint128_t, "U128", "uint128_t"); -NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex"); +NVBENCH_DECLARE_TYPE_STRINGS(int128_t, "I128", "int128_t"); +NVBENCH_DECLARE_TYPE_STRINGS(uint128_t, "U128", "uint128_t"); +NVBENCH_DECLARE_TYPE_STRINGS(complex, "C64", "complex"); namespace detail { @@ -33,28 +35,29 @@ struct push_back> template using push_back_t = typename detail::push_back::type; -#ifdef TUNE_OFFSET_TYPE -using offset_types = nvbench::type_list; +#ifdef TUNE_OffsetT +using offset_types = nvbench::type_list; #else -using offset_types = nvbench::type_list; +using offset_types = nvbench::type_list; #endif -#ifdef TUNE_TYPE -using all_value_types = nvbench::type_list; +#ifdef TUNE_T +using fundamental_types = nvbench::type_list; +using all_types = nvbench::type_list; #else -using fundamental_types = nvbench::type_list; -using all_types = nvbench::type_list; diff --git a/benchmarks/scripts/cub/bench/bench.py b/benchmarks/scripts/cub/bench/bench.py index 843854cdbc..22c334ea8c 100644 --- a/benchmarks/scripts/cub/bench/bench.py +++ b/benchmarks/scripts/cub/bench/bench.py @@ -244,9 +244,10 @@ def pull_elapsed(self, bench, workload_point): class Bench: - def __init__(self, algorithm_name, variant): + def __init__(self, algorithm_name, variant, ct_workload): self.algname = algorithm_name self.variant = variant + self.ct_workload = ct_workload def label(self): return self.algname + '.' + self.variant.label() @@ -315,6 +316,24 @@ def axes_values(self, sub_space, ct): return space + def axes_value_descriptions(self): + result = json_benches(self.algname) + + if len(result["benchmarks"]) != 1: + raise Exception("Executable should contain exactly one benchmark") + + descriptions = {} + for axis in result["benchmarks"][0]["axes"]: + name = axis["name"] + if axis["flags"]: + name = name + "[{}]".format(axis["flags"]) + descriptions[name] = {} + for value in axis["values"]: + descriptions[name][value["input_string"]] = value["description"] + + return descriptions + + def axis_values(self, axis_name): result = json_benches(self.algname) @@ -341,6 +360,19 @@ def axis_values(self, axis_name): def build(self): build = CMake().build(self) return build.code == 0 + + def definitions(self): + definitions = self.variant.tuning() + definitions = definitions + "\n" + + descriptions = self.axes_value_descriptions() + for ct_component in self.ct_workload: + ct_axis_name, ct_value = ct_component.split('=') + description = descriptions[ct_axis_name][ct_value] + ct_axis_name = ct_axis_name.replace('{ct}', '') + definitions = definitions + "#define TUNE_{} {}\n".format(ct_axis_name, description) + + return definitions def do_run(self, point, timeout): try: @@ -483,4 +515,4 @@ def score(self, ct_workload, rt_workload_space, base_estimator, variant_estimato class BaseBench(Bench): def __init__(self, algname): - super().__init__(algname, BasePoint()) + super().__init__(algname, BasePoint(), []) diff --git a/benchmarks/scripts/cub/bench/cmake.py b/benchmarks/scripts/cub/bench/cmake.py index 5558de85d8..7dfe0d5c3d 100644 --- a/benchmarks/scripts/cub/bench/cmake.py +++ b/benchmarks/scripts/cub/bench/cmake.py @@ -64,9 +64,8 @@ def __init__(self): def do_build(self, bench, timeout): try: if not bench.is_base(): - # TODO populate tuning file with open(bench.exe_name() + ".h", "w") as f: - f.writelines(bench.variant.tuning()) + f.writelines(bench.definitions()) cmd = ["cmake", "--build", ".", "--target", bench.exe_name()] diff --git a/benchmarks/scripts/cub/bench/search.py b/benchmarks/scripts/cub/bench/search.py index 83adde723e..8cecb11a50 100644 --- a/benchmarks/scripts/cub/bench/search.py +++ b/benchmarks/scripts/cub/bench/search.py @@ -132,7 +132,7 @@ def __call__(self, algname, ct_workload_space, rt_workload_space): for ct_workload in ct_workload_space: for variant in variants: - bench = Bench(algname, variant) + bench = Bench(algname, variant, list(ct_workload)) score = bench.score(ct_workload, rt_workload_space, self.base_center_estimator,