Skip to content

Commit

Permalink
Tune 8-byte types
Browse files Browse the repository at this point in the history
  • Loading branch information
gevtushenko committed Jun 12, 2023
1 parent 01f8003 commit f2cd25e
Showing 1 changed file with 46 additions and 0 deletions.
46 changes: 46 additions & 0 deletions cub/device/dispatch/dispatch_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,40 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SelectIfPolicyT::BLOCK_THREA
namespace detail
{

namespace select
{

/**
 * Fallback SM90 tuning parameters for select.
 *
 * Chosen whenever no more specific specialization matches the
 * (PrimitiveInput, InputSize) pair derived from InputT.
 */
template <class InputT,
          bool PrimitiveInput   = Traits<InputT>::PRIMITIVE,
          std::size_t InputSize = sizeof(InputT)>
struct sm90_tuning
{
  // Thread count exposed to the policy that consumes this tuning.
  static constexpr int threads = 128;

  // Items per thread when the input type is 4 bytes wide.
  static constexpr int nominal_4b_items_per_thread = 10;

  // Scale the nominal item count by 4 / InputSize, then clamp the result to
  // the closed range [1, nominal_4b_items_per_thread].
  static constexpr int items =
    ((nominal_4b_items_per_thread * 4 / InputSize) < 1)
      ? 1
      : (((nominal_4b_items_per_thread * 4 / InputSize) < nominal_4b_items_per_thread)
           ? (nominal_4b_items_per_thread * 4 / InputSize)
           : nominal_4b_items_per_thread);

  static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;

  using delay_constructor = detail::fixed_delay_constructor_t<350, 450>;
};

/**
 * SM90 tuning for primitive input types that are exactly 8 bytes wide
 * (PrimitiveInput == true, InputSize == 8).
 */
template <class InputT>
struct sm90_tuning<InputT, true, 8>
{
  using delay_constructor = detail::no_delay_constructor_t<1120>;

  static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;

  // Thread count and items per thread for this type class.
  static constexpr int threads = 384;
  static constexpr int items   = 22;
};

}

template <class InputT, bool MayAlias>
struct device_select_policy_hub
{
Expand All @@ -186,6 +220,18 @@ struct device_select_policy_hub
detail::fixed_delay_constructor_t<350, 450>>;
};

/// SM90 policy: takes its parameters from detail::select::sm90_tuning.
struct Policy900 : ChainedPolicy<900, Policy900, Policy350>
{
  // Only InputT is passed. sm90_tuning's second parameter is PrimitiveInput
  // (defaulting to Traits<InputT>::PRIMITIVE), not an aliasing flag, so
  // forwarding MayAlias there would select the 8-byte specialization based on
  // aliasing instead of on whether the input type is primitive. Leaving the
  // trailing parameters defaulted lets them be derived from InputT.
  using tuning = detail::select::sm90_tuning<InputT>;

  using SelectIfPolicyT = AgentSelectIfPolicy<tuning::threads,
                                              tuning::items,
                                              tuning::load_algorithm,
                                              LOAD_DEFAULT,
                                              BLOCK_SCAN_WARP_SCANS,
                                              typename tuning::delay_constructor>;
};

// NOTE(review): MaxPolicy still names Policy350 although Policy900 is declared
// in this hub. Presumably policy selection starts from MaxPolicy, in which case
// Policy900 is never chosen — confirm whether that is intentional.
using MaxPolicy = Policy350;
};

Expand Down

0 comments on commit f2cd25e

Please sign in to comment.