Skip to content

Commit

Permalink
Evenly spread elements (small problem sizes)
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Sep 10, 2024
1 parent 9f0d63b commit 074ad49
Showing 1 changed file with 62 additions and 7 deletions.
69 changes: 62 additions & 7 deletions cub/cub/device/dispatch/dispatch_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -746,13 +746,38 @@ _CCCL_HOST_DEVICE inline PoorExpected<int> get_max_shared_memory()
return max_smem;
}

_CCCL_HOST_DEVICE inline PoorExpected<int> get_sm_count()
{
int device = 0;
auto error = CubDebug(cudaGetDevice(&device));
if (error != cudaSuccess)
{
return error;
}

int sm_count = 0;
error = CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device));
if (error != cudaSuccess)
{
return error;
}

return sm_count;
}

struct elem_counts
{
int elem_per_thread;
int tile_size;
int smem_size;
};

struct prefetch_config
{
int max_occupancy;
int sm_count;
};

template <bool RequiresStableAddress,
typename Offset,
typename RandomAccessIteratorTupleIn,
Expand Down Expand Up @@ -933,19 +958,49 @@ struct dispatch_t<RequiresStableAddress,
{
using policy_t = typename ActivePolicy::algo_policy;
constexpr int block_dim = policy_t::block_threads;
int max_occupancy = 0;
const auto error = CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, 0));
if (error != cudaSuccess)

auto determine_config = [&]() -> PoorExpected<prefetch_config> {
int max_occupancy = 0;
const auto error = CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, 0));
if (error != cudaSuccess)
{
return error;
}
const auto sm_count = get_sm_count();
if (!sm_count)
{
return sm_count.error;
}
return prefetch_config{max_occupancy, *sm_count};
};

PoorExpected<prefetch_config> config = [&]() {
NV_IF_TARGET(
NV_IS_HOST,
(
// this static variable exists for each template instantiation of the surrounding function and class, on which
// the chosen element count solely depends (assuming max SMEM is constant during a program execution)
static auto cached_config = determine_config(); return cached_config;),
(
// we cannot cache the determined element count in device code
return determine_config();));
}();
if (!config)
{
return error;
return config.error;
}

const int items_per_thread =
loaded_bytes_per_iter == 0
? +policy_t::items_per_thread_no_input
: ::cuda::ceil_div(ActivePolicy::min_bif, max_occupancy * block_dim * loaded_bytes_per_iter);
const int items_per_thread_clamped =
::cuda::std::clamp(items_per_thread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread);
: ::cuda::ceil_div(ActivePolicy::min_bif, config->max_occupancy * block_dim * loaded_bytes_per_iter);

// Generate at least one block per SM. This improves tiny problem sizes (e.g. 2^16 elements).
const int items_per_thread_evenly_spread =
static_cast<int>(::cuda::std::min(Offset{items_per_thread}, num_items / (config->sm_count * block_dim)));

const int items_per_thread_clamped = ::cuda::std::clamp(
items_per_thread_evenly_spread, +policy_t::min_items_per_thread, +policy_t::max_items_per_thread);
const int tile_size = block_dim * items_per_thread_clamped;
const auto grid_dim = static_cast<unsigned int>(::cuda::ceil_div(num_items, Offset{tile_size}));
return CubDebug(
Expand Down

0 comments on commit 074ad49

Please sign in to comment.