Skip to content

Commit

Permalink
Better I128 tuning
Browse files Browse the repository at this point in the history
  • Loading branch information
gevtushenko committed Jun 6, 2023
1 parent ad56f13 commit 0f57445
Showing 1 changed file with 5 additions and 12 deletions.
17 changes: 5 additions & 12 deletions cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -245,10 +245,10 @@ struct sm90_tuning<AccumT, 8>
// Partial specialization of sm90_tuning selected when the second template argument is 16.
// NOTE(review): the commit title ("Better I128 tuning") suggests 16 is the accumulator size in
// bytes (128-bit types) — confirm against the primary template, which is not shown here.
// NOTE(review): this listing is a rendered diff without +/- markers, so each changed member
// appears twice; presumably the first occurrence is the pre-commit value and the second is the
// post-commit value (usual removed-then-added diff ordering). Only one of each pair exists in
// the real file.
template <class AccumT>
struct sm90_tuning<AccumT, 16>
{
// Presumably the pre-commit (removed) values. Policy900 forwards `threads` and `items` as the
// first two arguments of policy_t — presumably threads per block and items per thread; confirm.
static constexpr int threads = 960;
static constexpr int items = 20;
// Presumably the post-commit (added) values.
static constexpr int threads = 576;
static constexpr int items = 21;

// Delay constructor type that Policy900 passes as the last policy_t argument. The first line is
// presumably the pre-commit parameters and the second the post-commit ones; the meaning of the
// two integers is defined by detail::fixed_delay_constructor_t, which is not shown here.
using delay_constructor = detail::fixed_delay_constructor_t<876, 635>;
using delay_constructor = detail::fixed_delay_constructor_t<860, 630>;
};

template <>
Expand Down Expand Up @@ -346,21 +346,14 @@ struct DeviceScanPolicy
/// SM900
/// Policy registered under the value 900 in the ChainedPolicy hierarchy, with Policy600 passed as
/// the third ChainedPolicy argument (presumably the policy it chains to — confirm against
/// ChainedPolicy, which is not shown here).
// NOTE(review): this listing is a rendered diff without +/- markers, so removed and added lines
// are interleaved. The hunk header (21 lines before, 14 after) is consistent with the
// load_algorithm/store_algorithm definitions and their two uses being removed, and the
// ScanTransposedLoad/ScanTransposedStore arguments being added — confirm against the real file.
struct Policy900 : ChainedPolicy<900, Policy900, Policy600>
{
// Presumably removed by this commit: picks BLOCK_LOAD_DIRECT when sizeof(AccumT) > 8 and
// BLOCK_LOAD_WARP_TRANSPOSE otherwise.
static constexpr BlockLoadAlgorithm load_algorithm = sizeof(AccumT) > 8
? BLOCK_LOAD_DIRECT
: BLOCK_LOAD_WARP_TRANSPOSE;
// Presumably removed by this commit: picks BLOCK_STORE_DIRECT when sizeof(AccumT) > 8 and
// BLOCK_STORE_WARP_TRANSPOSE otherwise.
static constexpr BlockStoreAlgorithm store_algorithm = sizeof(AccumT) > 8
? BLOCK_STORE_DIRECT
: BLOCK_STORE_WARP_TRANSPOSE;

// Tuning parameters (threads, items, delay_constructor) come from the sm90_tuning specialization
// chosen for AccumT.
using tuning = detail::scan::sm90_tuning<AccumT>;

// Assembles the scan policy from the tuning values. The load_algorithm/ScanTransposedLoad and
// store_algorithm/ScanTransposedStore pairs below are old/new sides of the diff; the real
// policy_t instantiation takes only one of each (presumably the ScanTransposed* names after this
// commit — they are defined outside the visible hunk).
using ScanPolicyT = policy_t<tuning::threads,
tuning::items,
AccumT,
load_algorithm,
ScanTransposedLoad,
LOAD_DEFAULT,
store_algorithm,
ScanTransposedStore,
BLOCK_SCAN_WARP_SCANS,
typename tuning::delay_constructor>;
};
Expand Down

0 comments on commit 0f57445

Please sign in to comment.