
Introduce CUB transform reduce (#1091)
* Introduce CUB transform reduce

* Enable graph capture for transform reduce

* Fix typo in transform reduce docs
gevtushenko authored Nov 18, 2023
1 parent aeaa3d3 commit 61c328a
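
This commit adds a fused transform+reduce dispatch: each input element is passed through a unary transform as it is loaded, before it enters the reduction, so the transformed values are never materialized in memory. A minimal usage sketch follows — the template-parameter order and the Dispatch call shape are inferred from the benchmark reproduced below (cub/benchmarks/bench/transform_reduce/sum.cu); the header path is an assumption, and this is a sketch rather than an official API example:

#include <cub/device/dispatch/dispatch_reduce.cuh> // header path assumed
#include <thrust/device_vector.h>

#include <cstddef>

struct square
{
  __host__ __device__ float operator()(float x) const { return x * x; }
};

// Computes sum(x * x) on the default stream. Per the usual CUB protocol,
// Dispatch is called twice: first with a null workspace to query its size,
// then again to run the reduction.
inline float sum_of_squares(const thrust::device_vector<float>& in)
{
  using offset_t   = cub::detail::ChooseOffsetT<int>::Type;
  using dispatch_t =
    cub::DispatchTransformReduce<const float*, float*, offset_t, cub::Sum, square, float, float>;

  thrust::device_vector<float> out(1);
  const float* d_in = thrust::raw_pointer_cast(in.data());
  float* d_out      = thrust::raw_pointer_cast(out.data());
  const auto n      = static_cast<offset_t>(in.size());

  std::size_t temp_bytes{};
  dispatch_t::Dispatch(nullptr, temp_bytes, d_in, d_out, n, cub::Sum{}, 0.0f, 0 /* stream */, square{});

  thrust::device_vector<unsigned char> temp(temp_bytes);
  dispatch_t::Dispatch(thrust::raw_pointer_cast(temp.data()), temp_bytes,
                       d_in, d_out, n, cub::Sum{}, 0.0f, 0 /* stream */, square{});

  return out[0]; // synchronous device-to-host copy of the result
}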
Showing 9 changed files with 847 additions and 54 deletions. (Only the first four files are reproduced below.)
1 change: 1 addition & 0 deletions .clang-format
@@ -155,6 +155,7 @@ SpacesInParentheses: false
 SpacesInSquareBrackets: false
 Standard: c++20
 StatementMacros: [
+  '__thrust_exec_check_disable__',
   'CUB_NAMESPACE_BEGIN',
   'CUB_NAMESPACE_END',
   'THRUST_NAMESPACE_BEGIN',
205 changes: 205 additions & 0 deletions cub/benchmarks/bench/transform_reduce/sum.cu
@@ -0,0 +1,205 @@
/******************************************************************************
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#include "thrust/iterator/transform_iterator.h"
#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
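// (Each %RANGE% line above declares an axis for CUB's benchmark tuning
// scripts: the macro to define, a short flag name, and a start:end:step
// sweep. TUNE_BASE builds bypass the custom policy below entirely.)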

#ifndef TUNE_BASE
#define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2)
#endif

#if !TUNE_BASE
template <typename AccumT, typename OffsetT>
struct policy_hub_t
{
  struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
  {
    static constexpr int threads_per_block  = TUNE_THREADS_PER_BLOCK;
    static constexpr int items_per_thread   = TUNE_ITEMS_PER_THREAD;
    static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD;

    using ReducePolicy = cub::AgentReducePolicy<threads_per_block,
                                                items_per_thread,
                                                AccumT,
                                                items_per_vec_load,
                                                cub::BLOCK_REDUCE_WARP_REDUCTIONS,
                                                cub::LOAD_DEFAULT>;

    // SingleTilePolicy
    using SingleTilePolicy = ReducePolicy;

    // SegmentedReducePolicy
    using SegmentedReducePolicy = ReducePolicy;
  };

  using MaxPolicy = policy_t;
};
#endif // !TUNE_BASE

template <class T>
struct square_t
{
  __host__ __device__ T operator()(const T& x) const
  {
    return x * x;
  }
};

#define USE_TRANSFORM_ITERATOR 0
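// 0 (the default): raw-pointer input with the transform fused into the
// reduction via the new cub::DispatchTransformReduce (second branch below).
// 1: a thrust::transform_iterator feeding the existing cub::DispatchReduce
// (first branch below), kept for comparison.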

#if USE_TRANSFORM_ITERATOR
template <typename T, typename OffsetT>
void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
  using accum_t        = T;
  using input_it_t     = thrust::transform_iterator<square_t<T>, typename thrust::device_vector<T>::iterator>;
  using output_it_t    = T*;
  using offset_t       = typename cub::detail::ChooseOffsetT<OffsetT>::Type;
  using output_t       = T;
  using init_t         = T;
  using reduction_op_t = cub::Sum;
  using transform_op_t = square_t<T>;

#if !TUNE_BASE
  using policy_t   = policy_hub_t<accum_t, offset_t>;
  using dispatch_t =
    cub::DispatchReduce<input_it_t, output_it_t, offset_t, reduction_op_t, init_t, accum_t, policy_t>;
#else // TUNE_BASE
  using dispatch_t = cub::DispatchReduce<input_it_t, output_it_t, offset_t, reduction_op_t, init_t, accum_t>;
#endif // TUNE_BASE

  // Retrieve axis parameters
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  thrust::device_vector<T> in = generate(elements);
  thrust::device_vector<T> out(1);

  input_it_t d_in   = thrust::make_transform_iterator(in.begin(), square_t<T>{});
  output_it_t d_out = thrust::raw_pointer_cast(out.data());

  // Enable throughput calculations and add "Size" column to results.
  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements, "Size");
  state.add_global_memory_writes<T>(1);

  // Query the required amount of temporary storage:
  std::size_t temp_size{};
  dispatch_t::Dispatch(nullptr,
                       temp_size,
                       d_in,
                       d_out,
                       static_cast<offset_t>(elements),
                       reduction_op_t{},
                       init_t{},
                       0 /* stream */);

  // Allocate temporary storage:
  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
  auto* temp_storage = thrust::raw_pointer_cast(temp.data());

  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
    dispatch_t::Dispatch(temp_storage,
                         temp_size,
                         d_in,
                         d_out,
                         static_cast<offset_t>(elements),
                         reduction_op_t{},
                         init_t{},
                         launch.get_stream());
  });
}
#else
template <typename T, typename OffsetT>
void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
  using accum_t        = T;
  using input_it_t     = const T*;
  using output_it_t    = T*;
  using offset_t       = typename cub::detail::ChooseOffsetT<OffsetT>::Type;
  using output_t       = T;
  using init_t         = T;
  using reduction_op_t = cub::Sum;
  using transform_op_t = square_t<T>;

#if !TUNE_BASE
  using policy_t   = policy_hub_t<accum_t, offset_t>;
  using dispatch_t = cub::DispatchTransformReduce<input_it_t,
                                                  output_it_t,
                                                  offset_t,
                                                  reduction_op_t,
                                                  transform_op_t,
                                                  init_t,
                                                  accum_t,
                                                  policy_t>;
#else // TUNE_BASE
  using dispatch_t =
    cub::DispatchTransformReduce<input_it_t, output_it_t, offset_t, reduction_op_t, transform_op_t, init_t, accum_t>;
#endif // TUNE_BASE

  // Retrieve axis parameters
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  thrust::device_vector<T> in = generate(elements);
  thrust::device_vector<T> out(1);

  input_it_t d_in   = thrust::raw_pointer_cast(in.data());
  output_it_t d_out = thrust::raw_pointer_cast(out.data());

  // Enable throughput calculations and add "Size" column to results.
  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements, "Size");
  state.add_global_memory_writes<T>(1);

  // Query the required amount of temporary storage:
  std::size_t temp_size{};
  dispatch_t::Dispatch(nullptr,
                       temp_size,
                       d_in,
                       d_out,
                       static_cast<offset_t>(elements),
                       reduction_op_t{},
                       init_t{},
                       0 /* stream */,
                       transform_op_t{});

  // Allocate temporary storage:
  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
  auto* temp_storage = thrust::raw_pointer_cast(temp.data());

  state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
    dispatch_t::Dispatch(temp_storage,
                         temp_size,
                         d_in,
                         d_out,
                         static_cast<offset_t>(elements),
                         reduction_op_t{},
                         init_t{},
                         launch.get_stream(),
                         transform_op_t{});
  });
}
#endif // USE_TRANSFORM_ITERATOR

NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
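
The second commit bullet, "Enable graph capture for transform reduce", indicates that the dispatch avoids operations that are illegal during CUDA stream capture; the benchmark above exercises this by launching on launch.get_stream(). A sketch of capturing and replaying one launch with the standard CUDA graph APIs — same assumptions as the sum_of_squares sketch near the top of this page, error handling omitted:

#include <cub/device/dispatch/dispatch_reduce.cuh> // header path assumed
#include <thrust/device_vector.h>

#include <cuda_runtime.h>

#include <cstddef>

struct square
{
  __host__ __device__ float operator()(float x) const { return x * x; }
};

int main()
{
  using offset_t   = cub::detail::ChooseOffsetT<int>::Type;
  using dispatch_t =
    cub::DispatchTransformReduce<const float*, float*, offset_t, cub::Sum, square, float, float>;

  thrust::device_vector<float> in(1 << 20, 2.0f);
  thrust::device_vector<float> out(1);
  const float* d_in = thrust::raw_pointer_cast(in.data());
  float* d_out      = thrust::raw_pointer_cast(out.data());
  const auto n      = static_cast<offset_t>(in.size());

  // Workspace query and allocation happen before capture begins.
  std::size_t temp_bytes{};
  dispatch_t::Dispatch(nullptr, temp_bytes, d_in, d_out, n, cub::Sum{}, 0.0f, 0, square{});
  thrust::device_vector<unsigned char> temp(temp_bytes);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Record the reduction into a graph instead of executing it eagerly.
  cudaGraph_t graph;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  dispatch_t::Dispatch(thrust::raw_pointer_cast(temp.data()), temp_bytes,
                       d_in, d_out, n, cub::Sum{}, 0.0f, stream, square{});
  cudaStreamEndCapture(stream, &graph);

  cudaGraphExec_t exec;
  cudaGraphInstantiateWithFlags(&exec, graph, 0);
  cudaGraphLaunch(exec, stream); // replaying the graph is cheap
  cudaStreamSynchronize(stream);
}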
21 changes: 13 additions & 8 deletions cub/cub/agent/agent_reduce.cuh
@@ -53,6 +53,8 @@
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/std/functional>
+
 CUB_NAMESPACE_BEGIN
 
 /******************************************************************************
@@ -125,7 +127,8 @@ template <typename AgentReducePolicy,
           typename OutputIteratorT,
           typename OffsetT,
           typename ReductionOp,
-          typename AccumT>
+          typename AccumT,
+          typename TransformOp = ::cuda::std::__identity>
 struct AgentReduce
 {
   //---------------------------------------------------------------------
@@ -189,6 +192,7 @@ struct AgentReduce
   InputIteratorT d_in;                ///< Input data to reduce
   WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce
   ReductionOp reduction_op;           ///< Binary reduction operator
+  TransformOp transform_op;           ///< Transform operator
 
   //---------------------------------------------------------------------
   // Utility
@@ -224,11 +228,13 @@
    */
   __device__ __forceinline__ AgentReduce(TempStorage &temp_storage,
                                          InputIteratorT d_in,
-                                         ReductionOp reduction_op)
+                                         ReductionOp reduction_op,
+                                         TransformOp transform_op = {})
       : temp_storage(temp_storage.Alias())
       , d_in(d_in)
       , d_wrapped_in(d_in)
      , reduction_op(reduction_op)
+      , transform_op(transform_op)
   {}
 
   //---------------------------------------------------------------------
@@ -252,9 +258,8 @@
     AccumT items[ITEMS_PER_THREAD];
 
     // Load items in striped fashion
-    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x,
-                                     d_wrapped_in + block_offset,
-                                     items);
+    cub::detail::load_transform_direct_striped<BLOCK_THREADS>(
+      threadIdx.x, d_wrapped_in + block_offset, items, transform_op);
 
     // Reduce items within each thread stripe
     thread_aggregate =
@@ -303,7 +308,7 @@
 #pragma unroll
     for (int i = 0; i < ITEMS_PER_THREAD; ++i)
     {
-      items[i] = input_items[i];
+      items[i] = transform_op(input_items[i]);
     }
 
     // Reduce items within each thread stripe
@@ -334,7 +339,7 @@
     // Read first item
     if ((IS_FIRST_TILE) && (thread_offset < valid_items))
     {
-      thread_aggregate = d_wrapped_in[block_offset + thread_offset];
+      thread_aggregate = transform_op(d_wrapped_in[block_offset + thread_offset]);
       thread_offset += BLOCK_THREADS;
     }
 
@@ -343,7 +348,7 @@
     {
       InputT item(d_wrapped_in[block_offset + thread_offset]);
 
-      thread_aggregate = reduction_op(thread_aggregate, item);
+      thread_aggregate = reduction_op(thread_aggregate, transform_op(item));
       thread_offset += BLOCK_THREADS;
     }
   }
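
Net effect of the AgentReduce changes: every element passes through transform_op before it joins the reduction, in all three load paths (striped full tile, vectorized full tile, and the guarded partial tile). A serial reference of the intended semantics — illustration only, not library code; the device reduction reorders operands, so the operator must be associative:

#include <cstddef>

// acc = op(...op(op(init, f(x[0])), f(x[1]))..., f(x[n-1])), up to operand order.
template <class T, class AccumT, class ReduceOp, class TransformOp>
AccumT transform_reduce_reference(const T* in, std::size_t n, AccumT init, ReduceOp op, TransformOp f)
{
  AccumT acc = init;
  for (std::size_t i = 0; i < n; ++i)
  {
    acc = op(acc, f(in[i]));
  }
  return acc;
}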
16 changes: 16 additions & 0 deletions cub/cub/block/block_load.cuh
@@ -334,6 +334,22 @@ __device__ __forceinline__ void LoadDirectStriped(int linear_tid,
   }
 }
 
+namespace detail
+{
+
+template <int BLOCK_THREADS, typename InputT, int ITEMS_PER_THREAD, typename InputIteratorT, typename TransformOpT>
+__device__ __forceinline__ void load_transform_direct_striped(
+  int linear_tid, InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], TransformOpT transform_op)
+{
+#pragma unroll
+  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+  {
+    items[ITEM] = transform_op(block_itr[linear_tid + ITEM * BLOCK_THREADS]);
+  }
+}
+
+} // namespace detail
+
 /**
  * @brief Load a linear segment of items into a striped arrangement across the thread block, guarded
  * by range
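
For intuition about load_transform_direct_striped above: in a striped arrangement, thread t holds elements t, t + BLOCK_THREADS, t + 2 * BLOCK_THREADS, and so on, and here each element is transformed as it is read. A host-side illustration of the index pattern (not library code):

#include <cstdio>

int main()
{
  const int BLOCK_THREADS = 4, ITEMS_PER_THREAD = 2;
  for (int t = 0; t < BLOCK_THREADS; ++t)       // linear_tid
  {
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)  // ITEM
    {
      // Same index math as the device loop: linear_tid + ITEM * BLOCK_THREADS.
      std::printf("thread %d, item %d -> element %d\n", t, i, t + i * BLOCK_THREADS);
    }
  }
  return 0;
}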
