Skip to content

Commit

Permalink
Add m_cooperativeLaunch device prop and runtime check for CG support …
Browse files Browse the repository at this point in the history
…for CUDA/HIP
  • Loading branch information
MichaelVarvarin committed Jul 29, 2024
1 parent 9614d9c commit 26c4803
Show file tree
Hide file tree
Showing 12 changed files with 62 additions and 10 deletions.
4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuOmp2Blocks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(AccCpuOmp2Blocks<TDim, TIdx>::staticAllocBytes()),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
//m_cooperativeLaunch
std::false_type()};
}
};

Expand Down
4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuOmp2Threads.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,9 @@ namespace alpaka
// m_sharedMemSizeBytes
memBytes,
// m_globalMemSizeBytes
memBytes};
memBytes,
//m_cooperativeLaunch
std::false_type()};
}
};

Expand Down
4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuSerial.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(AccCpuSerial<TDim, TIdx>::staticAllocBytes()),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
//m_cooperativeLaunch
std::false_type()};
}
};

Expand Down
4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuTbbBlocks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(AccCpuTbbBlocks<TDim, TIdx>::staticAllocBytes()),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
//m_cooperativeLaunch
std::false_type()};
}
};

Expand Down
4 changes: 3 additions & 1 deletion include/alpaka/acc/AccCpuThreads.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,9 @@ namespace alpaka
// m_sharedMemSizeBytes
memBytes,
// m_globalMemSizeBytes
memBytes};
memBytes,
//m_cooperativeLaunch
std::false_type()};
}
};

Expand Down
1 change: 1 addition & 0 deletions include/alpaka/acc/AccDevProps.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,6 @@ namespace alpaka
TIdx m_threadElemCountMax; //!< The maximum number of elements in a threads.
size_t m_sharedMemSizeBytes; //!< The size of shared memory per block
size_t m_globalMemSizeBytes; //!< The size of global memory
bool m_cooperativeLaunch; //!< The support for launch of cooperative kernels
};
} // namespace alpaka
4 changes: 3 additions & 1 deletion include/alpaka/acc/AccGenericSycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,9 @@ namespace alpaka::trait
// m_sharedMemSizeBytes
device.template get_info<sycl::info::device::local_mem_size>(),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
//m_cooperativeLaunch
std::false_type()};
}
};

Expand Down
14 changes: 12 additions & 2 deletions include/alpaka/acc/AccGpuUniformCudaHipRt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,12 @@ namespace alpaka
TApi::deviceAttributeMaxSharedMemoryPerBlock,
dev.getNativeHandle()));

int cooperativeLaunch = {};
ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(TApi::deviceGetAttribute(
&cooperativeLaunch,
TApi::deviceAttributeCooperativeLaunch,
dev.getNativeHandle()));

return {// m_multiProcessorCount
alpaka::core::clipCast<TIdx>(multiProcessorCount),
// m_gridBlockExtentMax
Expand All @@ -186,7 +192,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(sharedMemSizeBytes),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev),
//m_cooperativeLaunch
static_cast<bool>(cooperativeLaunch)};

# else
typename TApi::DeviceProp_t properties;
Expand Down Expand Up @@ -215,7 +223,9 @@ namespace alpaka
// m_sharedMemSizeBytes
static_cast<size_t>(properties.sharedMemPerBlock),
// m_globalMemSizeBytes
getMemBytes(dev)};
getMemBytes(dev)
//m_cooperativeLaunch
static_cast<bool>(properties.cooperativeLaunch)};
# endif
}
};
Expand Down
1 change: 1 addition & 0 deletions include/alpaka/core/ApiCudaRt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ namespace alpaka
static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::cudaDevAttrMaxThreadsPerBlock;
static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::cudaDevAttrMultiProcessorCount;
static constexpr DeviceAttr_t deviceAttributeWarpSize = ::cudaDevAttrWarpSize;
static constexpr DeviceAttr_t deviceAttributeCooperativeLaunch = ::cudaDevAttrCooperativeLaunch;

static constexpr Limit_t limitPrintfFifoSize = ::cudaLimitPrintfFifoSize;
static constexpr Limit_t limitMallocHeapSize = ::cudaLimitMallocHeapSize;
Expand Down
1 change: 1 addition & 0 deletions include/alpaka/core/ApiHipRt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ namespace alpaka
static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::hipDeviceAttributeMaxThreadsPerBlock;
static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::hipDeviceAttributeMultiprocessorCount;
static constexpr DeviceAttr_t deviceAttributeWarpSize = ::hipDeviceAttributeWarpSize;
static constexpr DeviceAttr_t deviceAttributeCooperativeLaunch = ::hipDeviceAttributeCooperativeLaunch;

# if HIP_VERSION >= 40'500'000
static constexpr Limit_t limitPrintfFifoSize = ::hipLimitPrintfFifoSize;
Expand Down
24 changes: 24 additions & 0 deletions include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,18 @@ namespace alpaka
// TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
// std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
# endif

# if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
// This checks if the device supports cooperative kernel launch
if constexpr (TCooperative)
{
if(!trait::GetAccDevProps<TAcc>::getAccDevProps(getDev(queue)).m_cooperativeLaunch)
{
throw std::runtime_error("This accelerator doesn't support cooperative gropus functionality!");
} else std::cout << "This accelerator supports cooperative gropus functionality!";
}
# endif

auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);
Expand Down Expand Up @@ -361,6 +373,18 @@ namespace alpaka
// TApi::deviceGetLimit(&printfFifoSize, TApi::limitPrintfFifoSize);
// std::cout << __func__ << "INFO: printfFifoSize: " << printfFifoSize << std::endl;
# endif

# if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
// This checks if the device supports cooperative kernel launch
if constexpr (TCooperative)
{
if(!trait::GetAccDevProps<TAcc>::getAccDevProps(getDev(queue)).m_cooperativeLaunch)
{
throw std::runtime_error("This accelerator doesn't support cooperative groups functionality!");
}
}
# endif

auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(task);
auto const blockThreadExtent = getWorkDiv<Block, Threads>(task);
auto const threadElemExtent = getWorkDiv<Thread, Elems>(task);
Expand Down
7 changes: 5 additions & 2 deletions test/unit/acc/src/AccDevPropsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ TEMPLATE_LIST_TEST_CASE("getAccDevProps", "[acc]", alpaka::test::TestAccs)

TEST_CASE("AccDevProps.aggregate_init", "[acc]")
{
auto const props = alpaka::AccDevProps<alpaka::DimInt<1>, int>{1, {2}, 3, {4}, 5, {6}, 7, 8, 9};
auto const props = alpaka::AccDevProps<alpaka::DimInt<1>, int>{1, {2}, 3, {4}, 5, {6}, 7, 8, 9, std::true_type()};

CHECK(props.m_multiProcessorCount == 1);
CHECK(props.m_gridBlockExtentMax == alpaka::Vec{2});
Expand All @@ -40,6 +40,7 @@ TEST_CASE("AccDevProps.aggregate_init", "[acc]")
CHECK(props.m_threadElemCountMax == 7);
CHECK(props.m_sharedMemSizeBytes == 8);
CHECK(props.m_globalMemSizeBytes == 9);
CHECK(props.m_cooperativeLaunch == true);
}

#ifdef __cpp_designated_initializers
Expand All @@ -54,7 +55,8 @@ TEST_CASE("AccDevProps.designated_initializers", "[acc]")
.m_threadElemExtentMax = {60},
.m_threadElemCountMax = 70,
.m_sharedMemSizeBytes = 80,
.m_globalMemSizeBytes = 90};
.m_globalMemSizeBytes = 90
.m_cooperativeLaunch = std::false_type()};

CHECK(props.m_multiProcessorCount == 10);
CHECK(props.m_gridBlockExtentMax == alpaka::Vec{20});
Expand All @@ -65,5 +67,6 @@ TEST_CASE("AccDevProps.designated_initializers", "[acc]")
CHECK(props.m_threadElemCountMax == 70);
CHECK(props.m_sharedMemSizeBytes == 80);
CHECK(props.m_globalMemSizeBytes == 90);
CHECK(props.m_cooperativeLaunch == false);
}
#endif

0 comments on commit 26c4803

Please sign in to comment.