From bcbb6d1e9b607a6562c1e9ed0bdee0596b552755 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 23 May 2023 18:37:20 +0400 Subject: [PATCH] Adjust benches --- benchmarks/bench/partition/flagged.cu | 4 +++- benchmarks/bench/partition/if.cu | 4 +++- benchmarks/bench/reduce/by_key.cu | 4 +++- benchmarks/bench/run_length_encode/encode.cu | 4 +++- benchmarks/bench/run_length_encode/non_trivial_runs.cu | 4 +++- benchmarks/bench/scan/exclusive/base.cuh | 2 ++ benchmarks/bench/scan/exclusive/by_key.cu | 4 +++- benchmarks/bench/scan/exclusive/max.cu | 2 +- benchmarks/bench/scan/exclusive/sum.cu | 2 +- benchmarks/bench/select/flagged.cu | 4 +++- benchmarks/bench/select/if.cu | 4 +++- cub/agent/single_pass_scan_operators.cuh | 5 ----- 12 files changed, 28 insertions(+), 15 deletions(-) diff --git a/benchmarks/bench/partition/flagged.cu b/benchmarks/bench/partition/flagged.cu index cf3d4894e..f9a8b260e 100644 --- a/benchmarks/bench/partition/flagged.cu +++ b/benchmarks/bench/partition/flagged.cu @@ -5,13 +5,15 @@ // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 -// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2 l2b 3:8:1 // %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 constexpr bool keep_rejects = true; constexpr bool may_alias = false; #if !TUNE_BASE +#define CUB_DETAIL_INITIAL_L2_BACKOFF_NS (1 << CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2) + #if TUNE_TRANSPOSE == 0 #define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT #else // TUNE_TRANSPOSE == 1 diff --git a/benchmarks/bench/partition/if.cu b/benchmarks/bench/partition/if.cu index bdcf54128..4702937d0 100644 --- a/benchmarks/bench/partition/if.cu +++ b/benchmarks/bench/partition/if.cu @@ -5,13 +5,15 @@ // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 -// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2 l2b 3:8:1 // %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 constexpr bool keep_rejects = true; constexpr bool may_alias = false; #if !TUNE_BASE +#define CUB_DETAIL_INITIAL_L2_BACKOFF_NS (1 << CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2) + #if TUNE_TRANSPOSE == 0 #define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT #else // TUNE_TRANSPOSE == 1 diff --git a/benchmarks/bench/reduce/by_key.cu b/benchmarks/bench/reduce/by_key.cu index ffa6b3f17..dd553e09c 100644 --- a/benchmarks/bench/reduce/by_key.cu +++ b/benchmarks/bench/reduce/by_key.cu @@ -4,10 +4,12 @@ // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 -// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2 l2b 3:8:1 // %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE +#define CUB_DETAIL_INITIAL_L2_BACKOFF_NS (1 << CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2) + #if TUNE_TRANSPOSE == 0 #define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT #else // TUNE_TRANSPOSE == 1 diff --git a/benchmarks/bench/run_length_encode/encode.cu b/benchmarks/bench/run_length_encode/encode.cu index f3bd25f4b..fcc60690c 100644 --- a/benchmarks/bench/run_length_encode/encode.cu +++ b/benchmarks/bench/run_length_encode/encode.cu @@ -4,10 +4,12 @@ // %RANGE% TUNE_THREADS tpb 128:1024:32 // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 -// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2 l2b 3:8:1 // %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 #if !TUNE_BASE +#define CUB_DETAIL_INITIAL_L2_BACKOFF_NS (1 << CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2) + #if TUNE_TRANSPOSE == 0 #define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT #else // TUNE_TRANSPOSE == 1 diff --git a/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/benchmarks/bench/run_length_encode/non_trivial_runs.cu index 993e7a775..63eb5886a 100644 --- a/benchmarks/bench/run_length_encode/non_trivial_runs.cu +++ b/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -5,12 +5,14 @@ // %RANGE% TUNE_TRANSPOSE trp 0:1:1 // %RANGE% TUNE_TIME_SLICING ts 0:1:1 // %RANGE% TUNE_LOAD ld 0:1:1 -// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2 l2b 3:8:1 // %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 #include #if !TUNE_BASE +#define CUB_DETAIL_INITIAL_L2_BACKOFF_NS (1 << CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2) + #if TUNE_TRANSPOSE == 0 #define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT #else // TUNE_TRANSPOSE == 1 diff --git a/benchmarks/bench/scan/exclusive/base.cuh b/benchmarks/bench/scan/exclusive/base.cuh index 7823b3e97..5af46160e 100644 --- a/benchmarks/bench/scan/exclusive/base.cuh +++ b/benchmarks/bench/scan/exclusive/base.cuh @@ -3,6 +3,8 @@ #include #if !TUNE_BASE +#define CUB_DETAIL_INITIAL_L2_BACKOFF_NS (1 << CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2) + template struct policy_hub_t { diff --git a/benchmarks/bench/scan/exclusive/by_key.cu b/benchmarks/bench/scan/exclusive/by_key.cu index 2e1ef4c09..b195b7b03 100644 --- a/benchmarks/bench/scan/exclusive/by_key.cu +++ b/benchmarks/bench/scan/exclusive/by_key.cu @@ -2,7 +2,7 @@ // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 -// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2 l2b 3:8:1 // %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 #include @@ -10,6 +10,8 @@ #include #if !TUNE_BASE +#define CUB_DETAIL_INITIAL_L2_BACKOFF_NS (1 << CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2) + struct policy_hub_t { struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> diff --git a/benchmarks/bench/scan/exclusive/max.cu b/benchmarks/bench/scan/exclusive/max.cu index d7fcc0447..4bb25707b 100644 --- a/benchmarks/bench/scan/exclusive/max.cu +++ b/benchmarks/bench/scan/exclusive/max.cu @@ -2,7 +2,7 @@ // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 -// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2 l2b 3:8:1 // %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 using op_t = max_t; diff --git a/benchmarks/bench/scan/exclusive/sum.cu b/benchmarks/bench/scan/exclusive/sum.cu index e26e28c06..0598772f4 100644 --- a/benchmarks/bench/scan/exclusive/sum.cu +++ b/benchmarks/bench/scan/exclusive/sum.cu @@ -1,6 +1,6 @@ // %RANGE% TUNE_ITEMS ipt 7:24:1 // %RANGE% TUNE_THREADS tpb 128:1024:32 -// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2 l2b 3:8:1 // %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 #include diff --git a/benchmarks/bench/select/flagged.cu b/benchmarks/bench/select/flagged.cu index 0c80fe753..dd79972b6 100644 --- a/benchmarks/bench/select/flagged.cu +++ b/benchmarks/bench/select/flagged.cu @@ -6,13 +6,15 @@ // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 -// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2 l2b 3:8:1 // %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 constexpr bool keep_rejects = false; constexpr bool may_alias = false; #if !TUNE_BASE +#define CUB_DETAIL_INITIAL_L2_BACKOFF_NS (1 << CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2) + #if TUNE_TRANSPOSE == 0 #define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT #else // TUNE_TRANSPOSE == 1 diff --git a/benchmarks/bench/select/if.cu b/benchmarks/bench/select/if.cu index 53f1ce6cb..4b76c52d5 100644 --- a/benchmarks/bench/select/if.cu +++ b/benchmarks/bench/select/if.cu @@ -7,13 +7,15 @@ // %RANGE% TUNE_LOAD ld 0:1:1 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 -// %RANGE% CUB_DETAIL_L2_BACKOFF_NS l2b 0:1200:5 +// %RANGE% CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2 l2b 3:8:1 // %RANGE% CUB_DETAIL_L2_WRITE_LATENCY_NS l2w 0:1200:5 constexpr bool keep_rejects = false; constexpr bool may_alias = false; #if !TUNE_BASE +#define CUB_DETAIL_INITIAL_L2_BACKOFF_NS (1 << CUB_DETAIL_INITIAL_L2_BACKOFF_NS_POW2) + #if TUNE_TRANSPOSE == 0 #define TUNE_LOAD_ALGORITHM cub::BLOCK_LOAD_DIRECT #else // TUNE_TRANSPOSE == 1 diff --git a/cub/agent/single_pass_scan_operators.cuh b/cub/agent/single_pass_scan_operators.cuh index c67e8b4d5..5ba13b6ce 100644 --- a/cub/agent/single_pass_scan_operators.cuh +++ b/cub/agent/single_pass_scan_operators.cuh @@ -52,13 +52,8 @@ CUB_NAMESPACE_BEGIN #define CUB_DETAIL_INITIAL_L2_BACKOFF_NS 8 #endif -#define CUB_DETAIL_DEFAULT_L2_BACKOFF_NS 350 #define CUB_DETAIL_DEFAULT_L2_WRITE_LATENCY_NS 450 -#ifndef CUB_DETAIL_L2_BACKOFF_NS -#define CUB_DETAIL_L2_BACKOFF_NS CUB_DETAIL_DEFAULT_L2_BACKOFF_NS -#endif - #ifndef CUB_DETAIL_L2_WRITE_LATENCY_NS #define CUB_DETAIL_L2_WRITE_LATENCY_NS CUB_DETAIL_DEFAULT_L2_WRITE_LATENCY_NS #endif