Skip to content

Commit

Permalink
Clean up CUB thread operators (#2716)
Browse files Browse the repository at this point in the history
* Replace CUB thread operators by libcu++ ones where possible
* Alias cub::[Min|Max] to cuda::[minimum|maximum]
* Make aliases available in C++11
* Deprecate all CUB aliases to libcu++
* Remove obsolete unit tests
  • Loading branch information
bernhardmgruber authored Nov 7, 2024
1 parent 119a9a5 commit de599fa
Show file tree
Hide file tree
Showing 80 changed files with 447 additions and 604 deletions.
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/adjacent_difference/subtract_left.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)

using input_it_t = const T*;
using output_it_t = T*;
using difference_op_t = cub::Difference;
using difference_op_t = ::cuda::std::minus<>;
using offset_t = cub::detail::choose_offset_t<OffsetT>;

#if !TUNE_BASE
Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/reduce/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ static void reduce(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse
using vals_input_it_t = const ValueT*;
using aggregate_output_it_t = ValueT*;
using num_runs_output_iterator_t = OffsetT*;
using equality_op_t = cub::Equality;
using reduction_op_t = cub::Sum;
using equality_op_t = ::cuda::std::equal_to<>;
using reduction_op_t = ::cuda::std::plus<>;
using accum_t = ValueT;
using offset_t = OffsetT;

Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/reduce/min.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@
*
******************************************************************************/
// NOTE: this benchmark is intented to cover DPX instructions on Hopper+ architectures.
// It specifically uses cub::Min instead of a user-defined operator.
// It specifically uses cuda::minimum<> instead of a user-defined operator.
#define TUNE_T int16_t
#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

using op_t = cub::Min;
using op_t = ::cuda::minimum<>;
#include "base.cuh"
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/reduce/sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,5 @@
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

using op_t = cub::Sum;
using op_t = ::cuda::std::plus<>;
#include "base.cuh"
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/run_length_encode/encode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT>)
using vals_input_it_t = cub::ConstantInputIterator<offset_t, OffsetT>;
using aggregate_output_it_t = offset_t*;
using num_runs_output_iterator_t = offset_t*;
using equality_op_t = cub::Equality;
using reduction_op_t = cub::Sum;
using equality_op_t = ::cuda::std::equal_to<>;
using reduction_op_t = ::cuda::std::plus<>;
using accum_t = offset_t;

#if !TUNE_BASE
Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ static void rle(nvbench::state& state, nvbench::type_list<T, OffsetT>)
using offset_output_it_t = offset_t*;
using length_output_it_t = offset_t*;
using num_runs_output_iterator_t = offset_t*;
using equality_op_t = cub::Equality;
using equality_op_t = ::cuda::std::equal_to<>;
using accum_t = offset_t;

#if !TUNE_BASE
Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/scan/exclusive/by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,12 @@ template <typename KeyT, typename ValueT, typename OffsetT>
static void scan(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT>)
{
using init_value_t = ValueT;
using op_t = cub::Sum;
using op_t = ::cuda::std::plus<>;
using accum_t = ::cuda::std::__accumulator_t<op_t, ValueT, init_value_t>;
using key_input_it_t = const KeyT*;
using val_input_it_t = const ValueT*;
using val_output_it_t = ValueT*;
using equality_op_t = cub::Equality;
using equality_op_t = ::cuda::std::equal_to<>;
using offset_t = OffsetT;

#if !TUNE_BASE
Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/scan/exclusive/sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@
// %RANGE% TUNE_TRANSPOSE trp 0:1:1
// %RANGE% TUNE_LOAD ld 0:2:1

using op_t = cub::Sum;
using op_t = ::cuda::std::plus<>;
#include "base.cuh"
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/select/unique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ static void unique(nvbench::state& state, nvbench::type_list<T, OffsetT, InPlace
using output_it_t = T*;
using num_selected_it_t = OffsetT*;
using select_op_t = cub::NullType;
using equality_op_t = cub::Equality;
using equality_op_t = ::cuda::std::equal_to<>;
using offset_t = OffsetT;
constexpr bool may_alias = InPlaceAlgT::value;

Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/select/unique_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ static void select(nvbench::state& state, nvbench::type_list<KeyT, ValueT, Offse
using vals_input_it_t = const ValueT*;
using vals_output_it_t = ValueT*;
using num_runs_output_iterator_t = OffsetT*;
using equality_op_t = cub::Equality;
using equality_op_t = ::cuda::std::equal_to<>;
using offset_t = OffsetT;

#if !TUNE_BASE
Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/transform_reduce/sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
using offset_t = cub::detail::choose_offset_t<OffsetT>;
using output_t = T;
using init_t = T;
using reduction_op_t = cub::Sum;
using reduction_op_t = ::cuda::std::plus<>;
using transform_op_t = square_t<T>;

# if !TUNE_BASE
Expand Down Expand Up @@ -139,7 +139,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
using offset_t = cub::detail::choose_offset_t<OffsetT>;
using output_t = T;
using init_t = T;
using reduction_op_t = cub::Sum;
using reduction_op_t = ::cuda::std::plus<>;
using transform_op_t = square_t<T>;

# if !TUNE_BASE
Expand Down
8 changes: 4 additions & 4 deletions cub/cub/agent/agent_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -638,14 +638,14 @@ private:

using BLevBuffScanPrefixCallbackOpT =
TilePrefixCallbackOp<BufferOffsetT,
Sum,
::cuda::std::plus<>,
BLevBufferOffsetTileState,
0,
typename AgentMemcpySmallBuffersPolicyT::buff_delay_constructor>;

using BLevBlockScanPrefixCallbackOpT =
TilePrefixCallbackOp<BlockOffsetT,
Sum,
::cuda::std::plus<>,
BLevBlockOffsetTileState,
0,
typename AgentMemcpySmallBuffersPolicyT::block_delay_constructor>;
Expand Down Expand Up @@ -830,7 +830,7 @@ private:
else
{
BLevBlockScanPrefixCallbackOpT blev_tile_prefix_op(
blev_block_scan_state, temp_storage.staged.blev.block_scan_callback, Sum(), tile_id);
blev_block_scan_state, temp_storage.staged.blev.block_scan_callback, ::cuda::std::plus<>{}, tile_id);
BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage)
.ExclusiveSum(block_offset, block_offset, blev_tile_prefix_op);
}
Expand Down Expand Up @@ -1062,7 +1062,7 @@ public:
else
{
BLevBuffScanPrefixCallbackOpT blev_buffer_prefix_op(
blev_buffer_scan_state, temp_storage.buffer_scan_callback, Sum(), tile_id);
blev_buffer_scan_state, temp_storage.buffer_scan_callback, ::cuda::std::plus<>{}, tile_id);

// Signal our partial prefix and wait for the inclusive prefix of previous tiles
if (threadIdx.x < CUB_PTX_WARP_THREADS)
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_radix_sort_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ struct AgentRadixSortHistogram
#pragma unroll
for (int pass = 0; pass < num_passes; ++pass)
{
OffsetT count = internal::ThreadReduce(s.bins[pass][bin], Sum());
OffsetT count = internal::ThreadReduce(s.bins[pass][bin], ::cuda::std::plus<>{});
if (count > 0)
{
// Using cuda::atomic<> here would also require using it in
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,8 @@ struct AgentReduceByKey

// Whether or not the scan operation has a zero-valued identity value (true
// if we're performing addition on a primitive type)
static constexpr int HAS_IDENTITY_ZERO = (std::is_same<ReductionOpT, cub::Sum>::value) && (Traits<AccumT>::PRIMITIVE);
static constexpr int HAS_IDENTITY_ZERO =
(std::is_same<ReductionOpT, ::cuda::std::plus<>>::value) && (Traits<AccumT>::PRIMITIVE);

// Cache-modified Input iterator wrapper type (for applying cache modifier)
// for keys Wrap the native input pointer with
Expand Down
7 changes: 4 additions & 3 deletions cub/cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ struct AgentRle
using WarpScanPairs = WarpScan<LengthOffsetPair>;

// Reduce-length-by-run scan operator
using ReduceBySegmentOpT = ReduceBySegmentOp<cub::Sum>;
using ReduceBySegmentOpT = ReduceBySegmentOp<::cuda::std::plus<>>;

// Callback type for obtaining tile prefix during block scan
using DelayConstructorT = typename AgentRlePolicyT::detail::delay_constructor_t;
Expand Down Expand Up @@ -359,7 +359,7 @@ struct AgentRle
, d_offsets_out(d_offsets_out)
, d_lengths_out(d_lengths_out)
, equality_op(equality_op)
, scan_op(cub::Sum())
, scan_op(::cuda::std::plus<>{})
, num_items(num_items)
{}

Expand Down Expand Up @@ -866,7 +866,8 @@ struct AgentRle
tile_aggregate, warp_aggregate, warp_exclusive_in_tile, thread_exclusive_in_warp, lengths_and_num_runs);

// First warp computes tile prefix in lane 0
TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.scan_storage.prefix, Sum(), tile_idx);
TilePrefixCallbackOpT prefix_op(
tile_status, temp_storage.aliasable.scan_storage.prefix, ::cuda::std::plus<>{}, tile_idx);
unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
if (warp_id == 0)
{
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_segment_fixup.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ struct AgentSegmentFixup

// Whether or not the scan operation has a zero-valued identity value
// (true if we're performing addition on a primitive type)
HAS_IDENTITY_ZERO = (std::is_same<ReductionOpT, cub::Sum>::value) && (Traits<ValueT>::PRIMITIVE),
HAS_IDENTITY_ZERO = (std::is_same<ReductionOpT, ::cuda::std::plus<>>::value) && (Traits<ValueT>::PRIMITIVE),
};

// Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
Expand All @@ -187,7 +187,7 @@ struct AgentSegmentFixup
AggregatesOutputIteratorT>;

// Reduce-value-by-segment scan operator
using ReduceBySegmentOpT = ReduceByKeyOp<cub::Sum>;
using ReduceBySegmentOpT = ReduceByKeyOp<::cuda::std::plus<>>;

// Parameterized BlockLoad type for pairs
using BlockLoadPairs =
Expand Down
8 changes: 5 additions & 3 deletions cub/cub/agent/agent_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,9 @@ struct AgentSelectIf
using BlockScanT = BlockScan<OffsetT, BLOCK_THREADS, AgentSelectIfPolicyT::SCAN_ALGORITHM>;

// Callback type for obtaining tile prefix during block scan
using DelayConstructorT = typename AgentSelectIfPolicyT::detail::delay_constructor_t;
using TilePrefixCallbackOpT = TilePrefixCallbackOp<OffsetT, cub::Sum, MemoryOrderedTileStateT, 0, DelayConstructorT>;
using DelayConstructorT = typename AgentSelectIfPolicyT::detail::delay_constructor_t;
using TilePrefixCallbackOpT =
TilePrefixCallbackOp<OffsetT, ::cuda::std::plus<>, MemoryOrderedTileStateT, 0, DelayConstructorT>;

// Item exchange type
using ItemExchangeT = InputT[TILE_ITEMS];
Expand Down Expand Up @@ -896,7 +897,8 @@ struct AgentSelectIf
CTA_SYNC();

// Exclusive scan of values and selection_flags
TilePrefixCallbackOpT prefix_op(tile_state_wrapper, temp_storage.scan_storage.prefix, cub::Sum(), tile_idx);
TilePrefixCallbackOpT prefix_op(
tile_state_wrapper, temp_storage.scan_storage.prefix, ::cuda::std::plus<>{}, tile_idx);
BlockScanT(temp_storage.scan_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op);

OffsetT num_tile_selections = prefix_op.GetBlockAggregate();
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ struct AgentSpmv
using KeyValuePairT = KeyValuePair<OffsetT, ValueT>;

// Reduce-value-by-segment scan operator
using ReduceBySegmentOpT = ReduceByKeyOp<cub::Sum>;
using ReduceBySegmentOpT = ReduceByKeyOp<::cuda::std::plus<>>;

// BlockReduce specialization
using BlockReduceT = BlockReduce<ValueT, BLOCK_THREADS, BLOCK_REDUCE_WARP_REDUCTIONS>;
Expand Down
7 changes: 4 additions & 3 deletions cub/cub/agent/agent_three_way_partition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,9 @@ struct AgentThreeWayPartition
using BlockScanT = cub::BlockScan<AccumPackT, BLOCK_THREADS, PolicyT::SCAN_ALGORITHM>;

// Callback type for obtaining tile prefix during block scan
using DelayConstructorT = typename PolicyT::detail::delay_constructor_t;
using TilePrefixCallbackOpT = cub::TilePrefixCallbackOp<AccumPackT, cub::Sum, ScanTileStateT, 0, DelayConstructorT>;
using DelayConstructorT = typename PolicyT::detail::delay_constructor_t;
using TilePrefixCallbackOpT =
cub::TilePrefixCallbackOp<AccumPackT, ::cuda::std::plus<>, ScanTileStateT, 0, DelayConstructorT>;

// Item exchange type
using ItemExchangeT = InputT[TILE_ITEMS];
Expand Down Expand Up @@ -475,7 +476,7 @@ struct AgentThreeWayPartition
CTA_SYNC();

// Exclusive scan of values and selection_flags
TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, cub::Sum(), tile_idx);
TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, ::cuda::std::plus<>{}, tile_idx);

BlockScanT(temp_storage.scan_storage.scan).ExclusiveSum(items_selected_flags, items_selected_indices, prefix_op);

Expand Down
7 changes: 4 additions & 3 deletions cub/cub/agent/agent_unique_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,9 @@ struct AgentUniqueByKey
using BlockScanT = cub::BlockScan<OffsetT, BLOCK_THREADS, AgentUniqueByKeyPolicyT::SCAN_ALGORITHM>;

// Parameterized BlockDiscontinuity type for items
using DelayConstructorT = typename AgentUniqueByKeyPolicyT::detail::delay_constructor_t;
using TilePrefixCallback = cub::TilePrefixCallbackOp<OffsetT, cub::Sum, ScanTileStateT, 0, DelayConstructorT>;
using DelayConstructorT = typename AgentUniqueByKeyPolicyT::detail::delay_constructor_t;
using TilePrefixCallback =
cub::TilePrefixCallbackOp<OffsetT, ::cuda::std::plus<>, ScanTileStateT, 0, DelayConstructorT>;

// Key exchange type
using KeyExchangeT = KeyT[ITEMS_PER_TILE];
Expand Down Expand Up @@ -490,7 +491,7 @@ struct AgentUniqueByKey
OffsetT num_selections = 0;
OffsetT num_selections_prefix = 0;

TilePrefixCallback prefix_cb(tile_state, temp_storage.scan_storage.prefix, cub::Sum(), tile_idx);
TilePrefixCallback prefix_cb(tile_state, temp_storage.scan_storage.prefix, ::cuda::std::plus<>{}, tile_idx);
BlockScanT(temp_storage.scan_storage.scan).ExclusiveSum(selection_flags, selection_idx, prefix_cb);

num_selections = prefix_cb.GetInclusivePrefix();
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/block/block_radix_rank.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ private:
raking_ptr = smem_raking_ptr;
}

return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, ::cuda::std::plus<>{});
}

/// Performs exclusive downsweep raking scan
Expand All @@ -330,7 +330,7 @@ private:
PackedCounter* raking_ptr = (MEMOIZE_OUTER_SCAN) ? cached_segment : smem_raking_ptr;

// Exclusive raking downsweep scan
internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, ::cuda::std::plus<>{}, raking_partial);

if (MEMOIZE_OUTER_SCAN)
{
Expand Down Expand Up @@ -1000,7 +1000,7 @@ struct BlockRadixRankMatchEarlyCounts
for (int u = 0; u < WARP_BINS_PER_THREAD; ++u)
{
int bin = lane + u * WARP_THREADS;
bins[u] = internal::ThreadReduce(warp_histograms[bin], Sum());
bins[u] = internal::ThreadReduce(warp_histograms[bin], ::cuda::std::plus<>{});
}
CTA_SYNC();

Expand Down
8 changes: 4 additions & 4 deletions cub/cub/block/block_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ public:
//! ...
//!
//! // Compute the block-wide max for thread0
//! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
//! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cuda::maximum<>{});
//!
//! @endrst
//!
Expand Down Expand Up @@ -388,7 +388,7 @@ public:
//! ...
//!
//! // Compute the block-wide max for thread0
//! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
//! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cuda::maximum<>{});
//!
//! @endrst
//!
Expand Down Expand Up @@ -442,7 +442,7 @@ public:
//! if (threadIdx.x < num_valid) thread_data = ...
//!
//! // Compute the block-wide max for thread0
//! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
//! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cuda::maximum<>{}, num_valid);
//!
//! @endrst
//!
Expand Down Expand Up @@ -562,7 +562,7 @@ public:
_CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T (&inputs)[ITEMS_PER_THREAD])
{
// Reduce partials
T partial = internal::ThreadReduce(inputs, cub::Sum());
T partial = internal::ThreadReduce(inputs, ::cuda::std::plus<>{});
return Sum(partial);
}

Expand Down
Loading

0 comments on commit de599fa

Please sign in to comment.