From d386357b07857b2478a9949921175407fa6d6274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Vysko=C4=8Dil?= Date: Thu, 12 May 2022 17:34:51 +0200 Subject: [PATCH] Catch2 Benchmarking --- CMakeLists.txt | 5 +- cmake/alpakaCommon.cmake | 4 +- .../test/KernelExecutionBenchmarkFixture.hpp | 120 ++++++++++++++++++ test/CMakeLists.txt | 13 +- test/benchmark/CMakeLists.txt | 13 ++ test/benchmark/rand/CMakeLists.txt | 29 +++++ test/benchmark/rand/src/randBenchmark.cpp | 84 ++++++++++++ test/catch_main/CMakeLists.txt | 4 + 8 files changed, 265 insertions(+), 7 deletions(-) create mode 100644 include/alpaka/test/KernelExecutionBenchmarkFixture.hpp create mode 100644 test/benchmark/CMakeLists.txt create mode 100644 test/benchmark/rand/CMakeLists.txt create mode 100644 test/benchmark/rand/src/randBenchmark.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b48eaeaa63b..cbe2d3d6092a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,8 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON) option(alpaka_BUILD_EXAMPLES "Build the examples" OFF) +option(alpaka_BUILD_BENCHMARK "Build the benchmarks." OFF) + option(BUILD_TESTING "Build the testing tree." OFF) option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test. Attention, headers are not designed for production code, see documentation." OFF) @@ -138,7 +140,8 @@ endif() if(alpaka_BUILD_EXAMPLES) add_subdirectory("example/") endif() -if(BUILD_TESTING) +if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK) + enable_testing() add_subdirectory("test/") endif() diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake index ad939eaf361b..a0805072f772 100644 --- a/cmake/alpakaCommon.cmake +++ b/cmake/alpakaCommon.cmake @@ -617,7 +617,7 @@ if(alpaka_ACC_SYCL_ENABLE) # Enable device-side printing to stdout cmake_dependent_option(alpaka_SYCL_ENABLE_IOSTREAM "Enable device-side printing to stdout" OFF "alpaka_ACC_SYCL_ENABLE" OFF) - if(BUILD_TESTING) + if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK) set(alpaka_SYCL_ENABLE_IOSTREAM ON CACHE BOOL "Enable device-side printing to stdout" FORCE) endif() @@ -851,7 +851,7 @@ if(TARGET alpaka) # the alpaka library itself # SYSTEM voids showing warnings produced by alpaka when used in user applications. - if(BUILD_TESTING) + if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK) target_include_directories(alpaka INTERFACE ${_alpaka_INCLUDE_DIRECTORY}) else() target_include_directories(alpaka SYSTEM INTERFACE ${_alpaka_INCLUDE_DIRECTORY}) diff --git a/include/alpaka/test/KernelExecutionBenchmarkFixture.hpp b/include/alpaka/test/KernelExecutionBenchmarkFixture.hpp new file mode 100644 index 000000000000..b81fc900ab98 --- /dev/null +++ b/include/alpaka/test/KernelExecutionBenchmarkFixture.hpp @@ -0,0 +1,120 @@ +/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include + +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA +# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA! +#endif + +#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP +# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP! +#endif + +#include +#include + +#include + +#include +#include + +namespace alpaka::test +{ + //! The fixture for executing a kernel on a given accelerator. + template + class KernelExecutionBenchmarkFixture + { + public: + using Acc = TAcc; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + using DevAcc = Dev; + using PltfAcc = Pltf; + using QueueAcc = test::DefaultQueue; + using WorkDiv = WorkDivMembers; + + KernelExecutionBenchmarkFixture(WorkDiv workDiv) + : m_devHost(getDevByIdx(0u)) + , m_devAcc(getDevByIdx(0u)) + , m_queue(m_devAcc) + , m_workDiv(std::move(workDiv)) + { + } + + template + KernelExecutionBenchmarkFixture(TExtent const& extent) + : KernelExecutionBenchmarkFixture(getValidWorkDiv( + getDevByIdx(0u), + extent, + Vec::ones(), + false, + GridBlockExtentSubDivRestrictions::Unrestricted)) + { + } + + template + auto operator()( + TKernelFnObj const& kernelFnObj, + std::string const& benchmarkName, + float& result, + TArgs&&... args) -> bool + { + // Allocate result buffers + auto bufAccResult = allocBuf(m_devAcc, static_cast(1u)); + auto bufHostResult = allocBuf(m_devHost, static_cast(1u)); + + int numRuns = 0; + result = 0.0f; + + // The following block is executed unknown times during estimation phase, then once per benchmark sample + BENCHMARK_ADVANCED(std::string(benchmarkName))(Catch::Benchmark::Chronometer meter) + { + numRuns++; + memset(m_queue, bufAccResult, 0.0f); + wait(m_queue); + + // Only the following part is measured as the benchmark part + meter.measure( + [&] + { + exec( + m_queue, + m_workDiv, + kernelFnObj, + getPtrNative(bufAccResult), + std::forward(args)...); // run the measured kernel + wait(m_queue); // wait for the kernel to actually run + }); + + // Copy the result value to the host + memcpy(m_queue, bufHostResult, bufAccResult); + wait(m_queue); + + auto const resultLocal = *getPtrNative(bufHostResult); + result += resultLocal; + return resultLocal; // make sure the benchmark call is not optimized away + }; + result /= static_cast(numRuns); + + return true; + // TODO: Can we return the result here and read it from Catch2's REQUIRE or something similar? Or are the + // returns limited to bools? + // return result; + } + + protected: + DevCpu m_devHost; + DevAcc m_devAcc; + QueueAcc m_queue; + WorkDiv m_workDiv; + }; +} // namespace alpaka::test diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e69fc4799334..6cabb7db6b68 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright 2015-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan +# Copyright 2015-2022 Benjamin Worpitz, Axel Huebl, Jan Stephan, Jiri Vyskocil # # This file is part of alpaka. # @@ -21,6 +21,11 @@ add_subdirectory(common) list(APPEND _alpaka_TEST_OPTIONS --use-colour yes) -add_subdirectory(analysis) -add_subdirectory(integ) -add_subdirectory(unit) +if(BUILD_TESTING) + add_subdirectory(analysis) + add_subdirectory(integ) + add_subdirectory(unit) +endif() +if(alpaka_BUILD_BENCHMARK) + add_subdirectory(benchmark) +endif() diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt new file mode 100644 index 000000000000..55f8ad0d409f --- /dev/null +++ b/test/benchmark/CMakeLists.txt @@ -0,0 +1,13 @@ +# +# Copyright 2022 Jiri Vyskocil +# +# This file is part of alpaka. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +cmake_minimum_required(VERSION 3.18) + +add_subdirectory("rand/") diff --git a/test/benchmark/rand/CMakeLists.txt b/test/benchmark/rand/CMakeLists.txt new file mode 100644 index 000000000000..6a2b0963eb66 --- /dev/null +++ b/test/benchmark/rand/CMakeLists.txt @@ -0,0 +1,29 @@ +# +# Copyright 2022 Jiri Vyskocil +# +# This file is part of alpaka. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +set(_TARGET_NAME "randBenchmark") + +append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE) + +alpaka_add_executable( + ${_TARGET_NAME} + ${_FILES_SOURCE}) +target_link_libraries( + ${_TARGET_NAME} + PRIVATE common) + +set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/benchmark") +target_compile_definitions(${_TARGET_NAME} PUBLIC CATCH_CONFIG_ENABLE_BENCHMARKING) + +if(alpaka_CI) + add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS} --benchmark-samples 1) +else() + add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS}) +endif() diff --git a/test/benchmark/rand/src/randBenchmark.cpp b/test/benchmark/rand/src/randBenchmark.cpp new file mode 100644 index 000000000000..a99f880f6c5b --- /dev/null +++ b/test/benchmark/rand/src/randBenchmark.cpp @@ -0,0 +1,84 @@ +/* Copyright 2022 Jiri Vyskocil + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include +#include +#include + +#include + +class RandBenchmarkKernel +{ +public: + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, TIdx numPoints) const + { + // Get the global linearized thread idx. + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const globalThreadExtent = alpaka::getWorkDiv(acc); + + auto const linearizedGlobalThreadIdx + = static_cast(alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent)[0]); + + // Setup generator engine and distribution. + auto engine = alpaka::rand::engine::createDefault(acc, 42, linearizedGlobalThreadIdx); + auto dist(alpaka::rand::distribution::createUniformReal(acc)); + + float number = 0; + for(TIdx i = linearizedGlobalThreadIdx; i < numPoints; i += static_cast(globalThreadExtent.prod())) + { + number += dist(engine); + } + + alpaka::atomicAdd( + acc, + result, + number); // TODO: we're measuring the atomicAdd time too, this is not what we want + } +}; + +// TODO: This takes an enormous time to finish and is probably useless anyway: +// TEMPLATE_LIST_TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]", alpaka::test::TestAccs) +// Running the benchmark on a single default accelerator instead +TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]") +{ + // using Acc = TestType; + using Acc = alpaka::ExampleDefaultAcc, std::size_t>; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + using Vec = alpaka::Vec; + using WorkDiv = alpaka::WorkDivMembers; + + auto const devAcc = alpaka::getDevByIdx(0u); + + const Idx numThreads = std::thread::hardware_concurrency(); // TODO: GPU? + std::cout << "Hardware threads: " << numThreads << std::endl; + + const unsigned numPoints = GENERATE(100'000u, 1'000'000u, 10'000'000u, 100'000'000u, 1'000'000'000u); + + WorkDiv workdiv{alpaka::getValidWorkDiv( + devAcc, + Vec::all(numThreads * numThreads), + Vec::all(numThreads), + false, + alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)}; + + alpaka::test::KernelExecutionBenchmarkFixture fixture(workdiv); + + RandBenchmarkKernel kernel; + + float result = 0.0f; + + REQUIRE(fixture(kernel, "Random sequence N=" + std::to_string(numPoints), result, numPoints)); + // TODO: Actually check the result + std::cout << "\ntemp debug normalized result = " << result / static_cast(numPoints) + << " should probably converge to 0.5." << std::flush; +} diff --git a/test/catch_main/CMakeLists.txt b/test/catch_main/CMakeLists.txt index cd14b7c063fd..843235b167e7 100644 --- a/test/catch_main/CMakeLists.txt +++ b/test/catch_main/CMakeLists.txt @@ -27,6 +27,10 @@ set_target_properties(CatchMain PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON ) +if(alpaka_BUILD_BENCHMARK) + target_compile_definitions(CatchMain PUBLIC CATCH_CONFIG_ENABLE_BENCHMARKING) +endif() + target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_FAST_COMPILE") if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI") # Workaround for STL atomic issue: https://forums.developer.nvidia.com/t/support-for-atomic-in-libstdc-missing/135403/2