Skip to content

Commit

Permalink
Match decoupled look-back defaults
Browse files Browse the repository at this point in the history
  • Loading branch information
gevtushenko committed Jun 5, 2023
1 parent 700e49d commit 43b8c8a
Show file tree
Hide file tree
Showing 12 changed files with 82 additions and 45 deletions.
2 changes: 1 addition & 1 deletion cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ template <int _BLOCK_THREADS,
BlockLoadAlgorithm _LOAD_ALGORITHM,
CacheLoadModifier _LOAD_MODIFIER,
BlockScanAlgorithm _SCAN_ALGORITHM,
typename DelayConstructorT = detail::default_delay_constructor_t>
typename DelayConstructorT = detail::fixed_delay_constructor_t<350, 450>>
struct AgentReduceByKeyPolicy
{
///< Threads per thread block
Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ template <
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
BlockScanAlgorithm _SCAN_ALGORITHM, ///< The BlockScan algorithm to use
typename DelayConstructorT = detail::default_delay_constructor_t>
typename DelayConstructorT = detail::fixed_delay_constructor_t<350, 450>>
struct AgentRlePolicy
{
enum
Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ template <int NOMINAL_BLOCK_THREADS_4B,
typename ScalingType = MemBoundScaling<NOMINAL_BLOCK_THREADS_4B,
NOMINAL_ITEMS_PER_THREAD_4B,
ComputeT>,
typename DelayConstructorT = detail::default_delay_constructor_t>
typename DelayConstructorT = detail::default_delay_constructor_t<ComputeT>>
struct AgentScanPolicy : ScalingType
{
static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ template <int _BLOCK_THREADS,
CacheLoadModifier _LOAD_MODIFIER = LOAD_DEFAULT,
BlockScanAlgorithm _SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
BlockStoreAlgorithm _STORE_ALGORITHM = BLOCK_STORE_DIRECT,
typename DelayConstructorT = detail::default_delay_constructor_t>
typename DelayConstructorT = detail::fixed_delay_constructor_t<350, 450>>
struct AgentScanByKeyPolicy
{
static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ template <
BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
BlockScanAlgorithm _SCAN_ALGORITHM, ///< The BlockScan algorithm to use
typename DelayConstructorT = detail::default_delay_constructor_t>
typename DelayConstructorT = detail::fixed_delay_constructor_t<350, 450>>
struct AgentSelectIfPolicy
{
enum
Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_unique_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ template <int _BLOCK_THREADS,
cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT,
cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG,
cub::BlockScanAlgorithm _SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS,
typename DelayConstructorT = detail::default_delay_constructor_t>
typename DelayConstructorT = detail::fixed_delay_constructor_t<350, 450>>
struct AgentUniqueByKeyPolicy
{
enum
Expand Down
49 changes: 41 additions & 8 deletions cub/agent/single_pass_scan_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,29 @@ struct no_delay_constructor_t

__device__ __forceinline__ no_delay_constructor_t(unsigned int /* seed */)
{
always_delay<L2WriteLatency>();
delay<L2WriteLatency>();
}

__device__ __forceinline__ delay_t operator()() { return {}; }
};

template <unsigned int Delay, unsigned int L2WriteLatency, unsigned int GridThreshold = 500>
struct reduce_by_key_delay_constructor_t
{
struct delay_t
{
__device__ __forceinline__ void operator()()
{
NV_DISPATCH_TARGET(
NV_IS_EXACTLY_SM_80, (delay<Delay, GridThreshold>();),
NV_PROVIDES_SM_70, (delay< 0, GridThreshold>();),
NV_IS_DEVICE, (__threadfence_block();));
}
};

__device__ __forceinline__ reduce_by_key_delay_constructor_t(unsigned int /* seed */)
{
delay<L2WriteLatency>();
}

__device__ __forceinline__ delay_t operator()() { return {}; }
Expand Down Expand Up @@ -422,11 +444,22 @@ struct exponential_backon_constructor_t
}
};

using default_delay_constructor_t = fixed_delay_constructor_t<350, 450>;
using default_delay_t = default_delay_constructor_t::delay_t;

using default_no_delay_constructor_t = no_delay_constructor_t<450>;
using default_no_delay_t = default_no_delay_constructor_t::delay_t;

template <class T>
using default_delay_constructor_t = cub::detail::conditional_t<Traits<T>::PRIMITIVE,
fixed_delay_constructor_t<350, 450>,
default_no_delay_constructor_t>;

template <class T>
using default_delay_t = typename default_delay_constructor_t<T>::delay_t;

template <class KeyT, class ValueT>
using default_reduce_by_key_delay_constructor_t =
detail::conditional_t<(Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16),
reduce_by_key_delay_constructor_t<350, 450>,
default_delay_constructor_t<KeyValuePair<KeyT, ValueT>>>;
}

/**
Expand Down Expand Up @@ -575,7 +608,7 @@ struct ScanTileState<T, true>
/**
* Wait for the corresponding tile to become non-invalid
*/
template <class DelayT = detail::default_delay_t>
template <class DelayT = detail::default_delay_t<T>>
__device__ __forceinline__ void WaitForValid(
int tile_idx,
StatusWord &status,
Expand Down Expand Up @@ -964,7 +997,7 @@ struct ReduceByKeyScanTileState<ValueT, KeyT, true>
/**
* Wait for the corresponding tile to become non-invalid
*/
template <class DelayT = detail::default_delay_t>
template <class DelayT = detail::fixed_delay_constructor_t<350, 450>>
__device__ __forceinline__ void WaitForValid(
int tile_idx,
StatusWord &status,
Expand Down Expand Up @@ -1019,7 +1052,7 @@ template <
typename ScanOpT,
typename ScanTileStateT,
int LEGACY_PTX_ARCH = 0,
typename DelayConstructorT = detail::default_delay_constructor_t>
typename DelayConstructorT = detail::default_delay_constructor_t<T>>
struct TilePrefixCallbackOp
{
// Parameterized warp reduce
Expand Down Expand Up @@ -1069,7 +1102,7 @@ struct TilePrefixCallbackOp
{}

// Block until all predecessors within the warp-wide window have non-invalid status
template <class DelayT = detail::default_delay_t>
template <class DelayT = detail::default_delay_t<T>>
__device__ __forceinline__
void ProcessWindow(
int predecessor_idx, ///< Preceding tile index to inspect
Expand Down
13 changes: 7 additions & 6 deletions cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,13 @@ struct device_reduce_by_key_policy_hub
((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) /
COMBINED_INPUT_BYTES));

using ReduceByKeyPolicyT = AgentReduceByKeyPolicy<128,
ITEMS_PER_THREAD,
BLOCK_LOAD_DIRECT,
LOAD_LDG,
BLOCK_SCAN_WARP_SCANS,
detail::default_delay_constructor_t>;
using ReduceByKeyPolicyT =
AgentReduceByKeyPolicy<128,
ITEMS_PER_THREAD,
BLOCK_LOAD_DIRECT,
LOAD_LDG,
BLOCK_SCAN_WARP_SCANS,
detail::default_reduce_by_key_delay_constructor_t<AccumT, int>>;
};

using MaxPolicy = Policy350;
Expand Down
15 changes: 8 additions & 7 deletions cub/device/dispatch/dispatch_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,14 @@ struct device_rle_policy_hub
CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
};

using RleSweepPolicyT = AgentRlePolicy<96,
ITEMS_PER_THREAD,
BLOCK_LOAD_DIRECT,
LOAD_LDG,
true,
BLOCK_SCAN_WARP_SCANS,
detail::default_delay_constructor_t>;
using RleSweepPolicyT =
AgentRlePolicy<96,
ITEMS_PER_THREAD,
BLOCK_LOAD_DIRECT,
LOAD_LDG,
true,
BLOCK_SCAN_WARP_SCANS,
detail::default_reduce_by_key_delay_constructor_t<int, int>>;
};

using MaxPolicy = Policy350;
Expand Down
6 changes: 3 additions & 3 deletions cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ struct DeviceScanPolicy
LOAD_CA,
BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
BLOCK_SCAN_RAKING,
detail::default_delay_constructor_t>;
detail::default_delay_constructor_t<AccumT>>;
};

/// SM520
Expand All @@ -255,7 +255,7 @@ struct DeviceScanPolicy
LOAD_CA,
ScanTransposedStore,
BLOCK_SCAN_WARP_SCANS,
detail::default_delay_constructor_t>;
detail::default_delay_constructor_t<AccumT>>;
};

/// SM600
Expand All @@ -268,7 +268,7 @@ struct DeviceScanPolicy
LOAD_DEFAULT,
ScanTransposedStore,
BLOCK_SCAN_WARP_SCANS,
detail::default_delay_constructor_t>;
detail::default_delay_constructor_t<AccumT>>;
};

using MaxPolicy = Policy600;
Expand Down
30 changes: 16 additions & 14 deletions cub/device/dispatch/dispatch_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -209,13 +209,14 @@ struct DeviceScanByKeyPolicy
: Nominal4BItemsToItemsCombined(NOMINAL_4B_ITEMS_PER_THREAD,
CombinedInputBytes));

using ScanByKeyPolicyT = AgentScanByKeyPolicy<128,
ITEMS_PER_THREAD,
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_CA,
BLOCK_SCAN_WARP_SCANS,
BLOCK_STORE_WARP_TRANSPOSE,
detail::default_delay_constructor_t>;
using ScanByKeyPolicyT =
AgentScanByKeyPolicy<128,
ITEMS_PER_THREAD,
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_CA,
BLOCK_SCAN_WARP_SCANS,
BLOCK_STORE_WARP_TRANSPOSE,
detail::default_reduce_by_key_delay_constructor_t<AccumT, int>>;
};

// SM520
Expand All @@ -228,13 +229,14 @@ struct DeviceScanByKeyPolicy
: Nominal4BItemsToItemsCombined(NOMINAL_4B_ITEMS_PER_THREAD,
CombinedInputBytes));

using ScanByKeyPolicyT = AgentScanByKeyPolicy<256,
ITEMS_PER_THREAD,
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_CA,
BLOCK_SCAN_WARP_SCANS,
BLOCK_STORE_WARP_TRANSPOSE,
detail::default_delay_constructor_t>;
using ScanByKeyPolicyT =
AgentScanByKeyPolicy<256,
ITEMS_PER_THREAD,
BLOCK_LOAD_WARP_TRANSPOSE,
LOAD_CA,
BLOCK_SCAN_WARP_SCANS,
BLOCK_STORE_WARP_TRANSPOSE,
detail::default_reduce_by_key_delay_constructor_t<AccumT, int>>;
};

using MaxPolicy = Policy520;
Expand Down
2 changes: 1 addition & 1 deletion cub/device/dispatch/dispatch_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ struct device_select_policy_hub
BLOCK_LOAD_DIRECT,
MayAlias ? LOAD_CA : LOAD_LDG,
BLOCK_SCAN_WARP_SCANS,
detail::default_delay_constructor_t>;
detail::fixed_delay_constructor_t<350, 450>>;
};

using MaxPolicy = Policy350;
Expand Down

0 comments on commit 43b8c8a

Please sign in to comment.