From 4f8ac66e3db3381d022186cb60daedeb3f07b763 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 31 Jul 2023 09:26:39 +0000 Subject: [PATCH 1/2] Tune rle encode on A100 --- .../tuning/tuning_run_length_encode.cuh | 133 +++++++++++++++++- 1 file changed, 130 insertions(+), 3 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index ba3780f2ad3..b1b8759c70c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -182,6 +182,108 @@ struct sm90_tuning(), + primitive_key PrimitiveKey = is_primitive_key(), + length_size LengthSize = classify_length_size(), + key_size KeySize = classify_key_size()> +struct sm80_tuning +{ + static constexpr int max_input_bytes = CUB_MAX(sizeof(KeyT), sizeof(LengthT)); + static constexpr int combined_input_bytes = sizeof(KeyT) + sizeof(LengthT); + + static constexpr int threads = 128; + + static constexpr int nominal_4b_items_per_thread = 6; + + static constexpr int items = + (max_input_bytes <= 8) + ? 6 + : CUB_MIN(nominal_4b_items_per_thread, + CUB_MAX(1, + ((nominal_4b_items_per_thread * 8) + combined_input_bytes - 1) / + combined_input_bytes)); + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 256; + + static constexpr int items = 14; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<640>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 256; + + static constexpr int items = 13; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<900>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 256; + + static constexpr int items = 13; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + using delay_constructor = detail::no_delay_constructor_t<1080>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 224; + + static constexpr int items = 9; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + using delay_constructor = detail::no_delay_constructor_t<1075>; +}; + +#if CUB_IS_INT128_ENABLED +template +struct sm80_tuning +{ + static constexpr int threads = 128; + + static constexpr int items = 7; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + using delay_constructor = detail::no_delay_constructor_t<630>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 128; + + static constexpr int items = 7; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + using delay_constructor = detail::no_delay_constructor_t<630>; +}; +#endif + } // namespace encode namespace non_trivial_runs @@ -306,8 +408,7 @@ struct device_run_length_encode_policy_hub static constexpr int MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyT), sizeof(LengthT)); static constexpr int COMBINED_INPUT_BYTES = sizeof(KeyT) + sizeof(LengthT); - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + struct DefaultTuning { static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = 6; static constexpr int ITEMS_PER_THREAD = @@ -327,8 +428,34 @@ struct device_run_length_encode_policy_hub detail::default_reduce_by_key_delay_constructor_t>; }; + /// SM35 + struct Policy350 + : DefaultTuning + , ChainedPolicy<350, Policy350, Policy350> + {}; + + /// SM80 + struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + { + using tuning = detail::rle::encode::sm80_tuning; + + using ReduceByKeyPolicyT = + AgentReduceByKeyPolicy; + }; + + // SM86 + struct Policy860 + : DefaultTuning + , ChainedPolicy<860, Policy860, Policy800> + {}; + /// SM90 - struct Policy900 : ChainedPolicy<900, Policy900, Policy350> + struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::rle::encode::sm90_tuning; From c3cb0b15d3fe89872328b6d19769f79bd1027feb Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 1 Aug 2023 07:21:17 +0000 Subject: [PATCH 2/2] Tune rle non-trivial-runs on A100 --- .../tuning/tuning_run_length_encode.cuh | 141 +++++++++++++++++- 1 file changed, 137 insertions(+), 4 deletions(-) diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index b1b8759c70c..80282592915 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -397,6 +397,114 @@ struct sm90_tuning(), + primitive_key PrimitiveKey = is_primitive_key(), + length_size LengthSize = classify_length_size(), + key_size KeySize = classify_key_size()> +struct sm80_tuning +{ + static constexpr int threads = 96; + + static constexpr int nominal_4b_items_per_thread = 15; + + static constexpr int items = CUB_MIN(nominal_4b_items_per_thread, + CUB_MAX(1, (nominal_4b_items_per_thread * 4 / sizeof(KeyT)))); + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + static constexpr bool store_with_time_slicing = true; + + using delay_constructor = detail::default_reduce_by_key_delay_constructor_t; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 192; + + static constexpr int items = 20; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::no_delay_constructor_t<630>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 192; + + static constexpr int items = 20; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::no_delay_constructor_t<1015>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 224; + + static constexpr int items = 15; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::no_delay_constructor_t<915>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 256; + + static constexpr int items = 13; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::no_delay_constructor_t<1065>; +}; + +#if CUB_IS_INT128_ENABLED +template +struct sm80_tuning +{ + static constexpr int threads = 192; + + static constexpr int items = 13; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::no_delay_constructor_t<1050>; +}; + +template +struct sm80_tuning +{ + static constexpr int threads = 192; + + static constexpr int items = 13; + + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; + + static constexpr bool store_with_time_slicing = false; + + using delay_constructor = detail::no_delay_constructor_t<1050>; +}; +#endif + } // namespace non_trivial_runs @@ -474,8 +582,7 @@ struct device_run_length_encode_policy_hub template struct device_non_trivial_runs_policy_hub { - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy350> + struct DefaultTuning { enum { @@ -494,9 +601,35 @@ struct device_non_trivial_runs_policy_hub BLOCK_SCAN_WARP_SCANS, detail::default_reduce_by_key_delay_constructor_t>; }; - + + /// SM35 + struct Policy350 + : DefaultTuning + , ChainedPolicy<350, Policy350, Policy350> + {}; + + // SM80 + struct Policy800 : ChainedPolicy<800, Policy800, Policy350> + { + using tuning = detail::rle::non_trivial_runs::sm80_tuning; + + using RleSweepPolicyT = AgentRlePolicy; + }; + + // SM86 + struct Policy860 + : DefaultTuning + , ChainedPolicy<860, Policy860, Policy800> + {}; + // SM90 - struct Policy900 : ChainedPolicy<900, Policy900, Policy350> + struct Policy900 : ChainedPolicy<900, Policy900, Policy860> { using tuning = detail::rle::non_trivial_runs::sm90_tuning;