Skip to content

Commit

Permalink
Fix issues from bad rebase
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelVarvarin committed Aug 12, 2024
1 parent c6c12fd commit 4ad8bae
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 2 deletions.
4 changes: 2 additions & 2 deletions include/alpaka/kernel/TaskKernelGpuCudaRt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@

namespace alpaka
{
template<typename TAcc, typename TDev, typename TDim, typename TIdx, typename TKernelFnObj, bool TCooperative, typename... TArgs>
template<typename TAcc, typename TDim, typename TIdx, typename TKernelFnObj, bool TCooperative, typename... TArgs>
using TaskKernelGpuCudaRt
= TaskKernelGpuUniformCudaHipRt<ApiCudaRt, TAcc, TDev, TDim, TIdx, TKernelFnObj, TCooperative, TArgs...>;
= TaskKernelGpuUniformCudaHipRt<ApiCudaRt, TAcc, TDim, TIdx, TKernelFnObj, TCooperative, TArgs...>;
} // namespace alpaka

#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
28 changes: 28 additions & 0 deletions include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,34 @@ namespace alpaka
return kernelFunctionAttributes;
}
};

//! MaxActiveBlocks trait specialization for CUDA/HIP devices (used for cooperative kernels).
//!
//! Combines the per-multiprocessor block count reported by
//! TApi::occupancyMaxActiveBlocksPerMultiprocessor with the multiprocessor count taken from the
//! accelerator device properties.
template<typename TAcc, typename TKernelFnObj, typename TApi, typename TDim, typename TIdx, typename... TArgs>
struct MaxActiveBlocks<TAcc, DevUniformCudaHipRt<TApi>, TKernelFnObj, TDim, TIdx, TArgs...>
{
    //! \param kernelFnObj The kernel function object; used to query the dynamic shared memory size.
    //! \param device The device whose properties supply the multiprocessor count.
    //! \param blockThreadExtent The block thread extent; its product is passed to the occupancy query.
    //! \param threadElemExtent The thread element extent; forwarded to the shared memory size query.
    //! \param args The kernel arguments; forwarded to the shared memory size query.
    //! \return The queried blocks-per-multiprocessor value multiplied by the multiprocessor count.
    ALPAKA_FN_HOST static auto getMaxActiveBlocks(
        TKernelFnObj const& kernelFnObj,
        DevUniformCudaHipRt<TApi> const& device,
        alpaka::Vec<TDim, TIdx> const& blockThreadExtent,
        alpaka::Vec<TDim, TIdx> const& threadElemExtent,
        TArgs const&... args) -> int
    {
        // The dynamic shared memory requirement is an input of the occupancy query below.
        auto const dynSharedMemBytes
            = getBlockSharedMemDynSizeBytes<TAcc>(kernelFnObj, blockThreadExtent, threadElemExtent, args...);

        // The kernel entry point instantiated for this accelerator and argument list.
        auto const kernelEntry = alpaka::detail::
            gpuKernel<TKernelFnObj, TApi, TAcc, TDim, TIdx, remove_restrict_t<std::decay_t<TArgs>>...>;

        // NOTE(review): the result of this call is discarded. If it is a status code, a failing query
        // would leave the count at 0 and go unnoticed - confirm whether it should be checked.
        int blocksPerMultiprocessor = 0;
        TApi::occupancyMaxActiveBlocksPerMultiprocessor(
            &blocksPerMultiprocessor,
            kernelEntry,
            blockThreadExtent.prod(),
            static_cast<std::size_t>(dynSharedMemBytes));

        auto const devProps = trait::GetAccDevProps<TAcc>::getAccDevProps(device);
        auto const multiprocessors = devProps.m_multiProcessorCount;

        return blocksPerMultiprocessor * multiprocessors;
    }
};
} // namespace trait
} // namespace alpaka

Expand Down

0 comments on commit 4ad8bae

Please sign in to comment.