Skip to content

Commit

Permalink
Catch2 Benchmarking
Browse files Browse the repository at this point in the history
  • Loading branch information
sliwowitz committed May 12, 2022
1 parent 2fc0f04 commit d386357
Show file tree
Hide file tree
Showing 8 changed files with 265 additions and 7 deletions.
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)

# Opt-in switches for the optional parts of the build. Every one of them is declared with OFF as its default.
option(
    alpaka_BUILD_EXAMPLES
    "Build the examples"
    OFF)

# Benchmarks are built from the test tree, independently of BUILD_TESTING.
option(
    alpaka_BUILD_BENCHMARK
    "Build the benchmarks."
    OFF)

option(
    BUILD_TESTING
    "Build the testing tree."
    OFF)

option(
    alpaka_INSTALL_TEST_HEADER
    "Install headers of the namespace alpaka::test. Attention, headers are not designed for production code, see documentation."
    OFF)
Expand Down Expand Up @@ -138,7 +140,8 @@ endif()
# The examples are only added to the build when explicitly requested.
if(alpaka_BUILD_EXAMPLES)
add_subdirectory("example/")
endif()
if(BUILD_TESTING)
# Tests and benchmarks are both reached through test/, so either switch pulls the directory in
# and turns on CTest support.
if(alpaka_BUILD_BENCHMARK OR BUILD_TESTING)
    enable_testing()
    add_subdirectory("test/")
endif()

Expand Down
4 changes: 2 additions & 2 deletions cmake/alpakaCommon.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ if(alpaka_ACC_SYCL_ENABLE)

# Enable device-side printing to stdout
cmake_dependent_option(alpaka_SYCL_ENABLE_IOSTREAM "Enable device-side printing to stdout" OFF "alpaka_ACC_SYCL_ENABLE" OFF)
if(BUILD_TESTING)
# Test and benchmark builds always switch device-side printing on.
# NOTE: FORCE overwrites whatever value is already stored in the cache for this option,
# including one chosen by the user.
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
set(alpaka_SYCL_ENABLE_IOSTREAM ON CACHE BOOL "Enable device-side printing to stdout" FORCE)
endif()

Expand Down Expand Up @@ -851,7 +851,7 @@ if(TARGET alpaka)

# the alpaka library itself
# SYSTEM avoids showing warnings produced by alpaka when used in user applications.
if(BUILD_TESTING)
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
target_include_directories(alpaka INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
else()
target_include_directories(alpaka SYSTEM INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
Expand Down
120 changes: 120 additions & 0 deletions include/alpaka/test/KernelExecutionBenchmarkFixture.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber
*
* This file is part of alpaka.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#pragma once

#include <alpaka/alpaka.hpp>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
#endif

#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
#endif

#include <alpaka/test/Check.hpp>
#include <alpaka/test/queue/Queue.hpp>

#include <catch2/catch.hpp>

#include <string>
#include <utility>

namespace alpaka::test
{
//! Fixture for benchmarking the execution of a kernel on a given accelerator with Catch2.
//!
//! The fixture owns the host device, the accelerator device, a queue on the accelerator and the
//! work division used for every kernel launch. Device 0 is used for both host and accelerator.
//!
//! \tparam TAcc The accelerator type the kernel is executed on.
template<typename TAcc>
class KernelExecutionBenchmarkFixture
{
public:
    using Acc = TAcc;
    using Dim = alpaka::Dim<Acc>;
    using Idx = alpaka::Idx<Acc>;
    using DevAcc = Dev<Acc>;
    using PltfAcc = Pltf<DevAcc>;
    using QueueAcc = test::DefaultQueue<DevAcc>;
    using WorkDiv = WorkDivMembers<Dim, Idx>;

    //! Creates the fixture with an explicitly given work division.
    KernelExecutionBenchmarkFixture(WorkDiv workDiv)
        : m_devHost(getDevByIdx<PltfCpu>(0u))
        , m_devAcc(getDevByIdx<PltfAcc>(0u))
        , m_queue(m_devAcc)
        , m_workDiv(std::move(workDiv))
    {
    }

    //! Creates the fixture with a work division derived from \p extent via getValidWorkDiv,
    //! using one element per thread and unrestricted block extent subdivision.
    template<typename TExtent>
    KernelExecutionBenchmarkFixture(TExtent const& extent)
        : KernelExecutionBenchmarkFixture(getValidWorkDiv<Acc>(
            getDevByIdx<PltfAcc>(0u),
            extent,
            Vec<Dim, Idx>::ones(),
            false,
            GridBlockExtentSubDivRestrictions::Unrestricted))
    {
    }

    //! Benchmarks \p kernelFnObj under the name \p benchmarkName.
    //!
    //! The kernel is launched as `kernelFnObj(acc, float* resultOnDevice, args...)`; it is expected to
    //! write a single float into the device result buffer.
    //!
    //! \param kernelFnObj   The kernel function object to measure.
    //! \param benchmarkName The name under which Catch2 reports the benchmark.
    //! \param result        Out: the kernel result averaged over all benchmark runs, or 0 if the
    //!                      benchmark body was never executed.
    //! \param args          Additional kernel arguments. They are passed to every launch as lvalues,
    //!                      because the kernel is launched more than once.
    //! \return Always true.
    template<typename TKernelFnObj, typename... TArgs>
    auto operator()(
        TKernelFnObj const& kernelFnObj,
        std::string const& benchmarkName,
        float& result,
        TArgs&&... args) -> bool
    {
        // Allocate single-element result buffers on the accelerator and on the host.
        auto bufAccResult = allocBuf<float, Idx>(m_devAcc, static_cast<Idx>(1u));
        auto bufHostResult = allocBuf<float, Idx>(m_devHost, static_cast<Idx>(1u));

        int numRuns = 0;
        result = 0.0f;

        // The following block is executed an unknown number of times during the estimation phase,
        // then once per benchmark sample.
        BENCHMARK_ADVANCED(std::string(benchmarkName))(Catch::Benchmark::Chronometer meter)
        {
            numRuns++;
            // Reset the device-side result before every run.
            // NOTE(review): 0.0f is passed as the fill value; presumably memset expects a byte value and
            // this relies on an implicit conversion - confirm.
            memset(m_queue, bufAccResult, 0.0f);
            wait(m_queue);

            // Only the following part is measured as the benchmark part.
            meter.measure(
                [&]
                {
                    // The arguments are deliberately NOT std::forward'ed: this lambda runs repeatedly and
                    // forwarding an rvalue would leave it moved-from after the first launch.
                    exec<Acc>(m_queue, m_workDiv, kernelFnObj, getPtrNative(bufAccResult), args...);
                    wait(m_queue); // wait for the kernel to actually run
                });

            // Copy the result value to the host.
            memcpy(m_queue, bufHostResult, bufAccResult);
            wait(m_queue);

            auto const resultLocal = *getPtrNative(bufHostResult);
            result += resultLocal;
            return resultLocal; // make sure the benchmark call is not optimized away
        };

        // The benchmark body may not have been executed at all; avoid computing 0/0 (NaN) in that case.
        if(numRuns > 0)
        {
            result /= static_cast<float>(numRuns);
        }

        return true;
        // TODO: Can we return the result here and read it from Catch2's REQUIRE or something similar? Or are the
        // returns limited to bools?
        // return result;
    }

protected:
    DevCpu m_devHost; //!< Host device holding the host-side result buffer.
    DevAcc m_devAcc; //!< Accelerator device the kernel runs on.
    QueueAcc m_queue; //!< Queue on m_devAcc used for all memory operations and kernel launches.
    WorkDiv m_workDiv; //!< Work division used for every kernel launch.
};
} // namespace alpaka::test
13 changes: 9 additions & 4 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright 2015-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
# Copyright 2015-2022 Benjamin Worpitz, Axel Huebl, Jan Stephan, Jiri Vyskocil
#
# This file is part of alpaka.
#
Expand All @@ -21,6 +21,11 @@ add_subdirectory(common)

list(APPEND _alpaka_TEST_OPTIONS --use-colour yes)

add_subdirectory(analysis)
add_subdirectory(integ)
add_subdirectory(unit)
# The regular test suites are only built when testing is enabled ...
if(BUILD_TESTING)
    foreach(_alpaka_TEST_SUITE IN ITEMS analysis integ unit)
        add_subdirectory(${_alpaka_TEST_SUITE})
    endforeach()
endif()

# ... while the benchmarks have their own, independent switch.
if(alpaka_BUILD_BENCHMARK)
    add_subdirectory(benchmark)
endif()
13 changes: 13 additions & 0 deletions test/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#
# Copyright 2022 Jiri Vyskocil
#
# This file is part of alpaka.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# The benchmarks require CMake 3.18 or newer.
cmake_minimum_required(VERSION 3.18)

# Each benchmark suite lives in its own subdirectory; rand/ is the only one added here.
add_subdirectory("rand/")
29 changes: 29 additions & 0 deletions test/benchmark/rand/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#
# Copyright 2022 Jiri Vyskocil
#
# This file is part of alpaka.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# Builds the random number generator benchmark and registers it with CTest.
set(_TARGET_NAME "randBenchmark")

# Collect all .cpp files below src/ (and add them to the "src/" source group).
append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)

alpaka_add_executable(
    ${_TARGET_NAME}
    ${_FILES_SOURCE})
target_link_libraries(
    ${_TARGET_NAME}
    PRIVATE common)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/benchmark")
# Enable Catch2's benchmarking macros for this target.
# PRIVATE: an executable has no consumers, so there is nothing to propagate the definition to.
target_compile_definitions(${_TARGET_NAME} PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)

# Command line of the registered test: the common test options, plus a single benchmark
# sample on CI to keep the run time down.
set(_BENCHMARK_TEST_OPTIONS ${_alpaka_TEST_OPTIONS})
if(alpaka_CI)
    list(APPEND _BENCHMARK_TEST_OPTIONS --benchmark-samples 1)
endif()
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_BENCHMARK_TEST_OPTIONS})
84 changes: 84 additions & 0 deletions test/benchmark/rand/src/randBenchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/* Copyright 2022 Jiri Vyskocil
*
* This file is part of alpaka.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#include <alpaka/example/ExampleDefaultAcc.hpp>
#include <alpaka/rand/Traits.hpp>
#include <alpaka/test/KernelExecutionBenchmarkFixture.hpp>
#include <alpaka/test/acc/TestAccs.hpp>

#include <catch2/catch.hpp>

#include <cstddef>
#include <iostream>
#include <string>
#include <thread>

//! Benchmark kernel: every thread draws uniformly distributed random floats, sums them up locally
//! and atomically adds its partial sum to a single result value.
class RandBenchmarkKernel
{
public:
    //! \param acc       The accelerator the kernel runs on.
    //! \param result    Pointer to the value all threads accumulate their partial sums into.
    //! \param numPoints Total number of random numbers drawn by all threads together.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TAcc, typename TIdx>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, TIdx numPoints) const
    {
        // Position of this thread within the grid, and the size of the grid in threads.
        auto const gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
        auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

        // Linearized index of this thread: the first point it handles.
        auto const firstPoint = static_cast<TIdx>(alpaka::mapIdx<1u>(gridThreadIdx, gridThreadExtent)[0]);
        // Total number of threads in the grid: the distance between two points handled by one thread.
        auto const stride = static_cast<TIdx>(gridThreadExtent.prod());

        // Generator engine (seed 42, subsequence selected by the thread index) and distribution.
        auto engine = alpaka::rand::engine::createDefault(acc, 42, firstPoint);
        auto dist = alpaka::rand::distribution::createUniformReal<float>(acc);

        float partialSum = 0;
        TIdx point = firstPoint;
        while(point < numPoints)
        {
            partialSum += dist(engine);
            point += stride;
        }

        // TODO: we're measuring the atomicAdd time too, this is not what we want
        alpaka::atomicAdd(acc, result, partialSum);
    }
};

// TODO: This takes an enormous time to finish and is probably useless anyway:
// TEMPLATE_LIST_TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]", alpaka::test::TestAccs)
// Running the benchmark on a single default accelerator instead
// Benchmarks the default random number generator on a single default accelerator for several
// sequence lengths.
TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]")
{
    // using Acc = TestType;
    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, std::size_t>;
    using Dim = alpaka::Dim<Acc>;
    using Idx = alpaka::Idx<Acc>;
    using Vec = alpaka::Vec<Dim, Idx>;
    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);

    // hardware_concurrency() is allowed to return 0 if the value cannot be determined. Fall back to a
    // single thread in that case, otherwise the work division below would have a zero extent.
    auto const hardwareThreads = std::thread::hardware_concurrency();
    const Idx numThreads = hardwareThreads != 0u ? hardwareThreads : 1u; // TODO: GPU?
    std::cout << "Hardware threads: " << numThreads << std::endl;

    // The test case body is run once for every generated sequence length.
    const unsigned numPoints = GENERATE(100'000u, 1'000'000u, 10'000'000u, 100'000'000u, 1'000'000'000u);

    WorkDiv workdiv{alpaka::getValidWorkDiv<Acc>(
        devAcc,
        Vec::all(numThreads * numThreads),
        Vec::all(numThreads),
        false,
        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};

    alpaka::test::KernelExecutionBenchmarkFixture<Acc> fixture(workdiv);

    RandBenchmarkKernel kernel;

    // Receives the kernel result averaged over all benchmark runs.
    float result = 0.0f;

    REQUIRE(fixture(kernel, "Random sequence N=" + std::to_string(numPoints), result, numPoints));
    // TODO: Actually check the result
    std::cout << "\ntemp debug normalized result = " << result / static_cast<float>(numPoints)
              << " should probably converge to 0.5." << std::flush;
}
4 changes: 4 additions & 0 deletions test/catch_main/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ set_target_properties(CatchMain PROPERTIES
WINDOWS_EXPORT_ALL_SYMBOLS ON
)

# Benchmark builds enable Catch2's benchmarking support in CatchMain and, through PUBLIC,
# in everything that links against it.
target_compile_definitions(
    CatchMain
    PUBLIC $<$<BOOL:${alpaka_BUILD_BENCHMARK}>:CATCH_CONFIG_ENABLE_BENCHMARKING>)

target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_FAST_COMPILE")
if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
# Workaround for STL atomic issue: https://forums.developer.nvidia.com/t/support-for-atomic-in-libstdc-missing/135403/2
Expand Down

0 comments on commit d386357

Please sign in to comment.