From 37351aa80dbf212ecd1dc6022cdd6f8afb888d66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Vysko=C4=8Dil?= Date: Tue, 15 Nov 2022 13:37:20 +0100 Subject: [PATCH] Catch2 Benchmarking --- CMakeLists.txt | 5 +- cmake/alpakaCommon.cmake | 12 +- .../test/KernelExecutionBenchmarkFixture.hpp | 118 ++++++++++++++++++ test/CMakeLists.txt | 11 +- test/analysis/headerCheck/CMakeLists.txt | 2 + test/benchmark/CMakeLists.txt | 13 ++ test/benchmark/rand/CMakeLists.txt | 33 +++++ test/benchmark/rand/src/randBenchmark.cpp | 91 ++++++++++++++ thirdParty/CMakeLists.txt | 2 +- 9 files changed, 275 insertions(+), 12 deletions(-) create mode 100644 include/alpaka/test/KernelExecutionBenchmarkFixture.hpp create mode 100644 test/benchmark/CMakeLists.txt create mode 100644 test/benchmark/rand/CMakeLists.txt create mode 100644 test/benchmark/rand/src/randBenchmark.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e0531010054a..e1bc7d0c8598 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,7 @@ option(alpaka_BUILD_BENCHMARKS "Build the benchmarks" OFF) if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) option(alpaka_ENABLE_WERROR "Treat all warnings as errors." OFF) option(BUILD_TESTING "Build the testing tree." OFF) + option(alpaka_BUILD_BENCHMARK "Build the benchmarks." OFF) include(CTest) endif() @@ -48,7 +49,7 @@ option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test include(CMakeDependentOption) cmake_dependent_option(alpaka_CHECK_HEADERS "Check all alpaka headers as part of the tests whether they can be compiled standalone." OFF BUILD_TESTING OFF) -cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON BUILD_TESTING OFF) +cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON "BUILD_TESTING OR alpaka_BUILD_BENCHMARK" OFF) ################################################################################ # Internal variables. 
@@ -154,7 +155,7 @@ if(alpaka_BUILD_BENCHMARKS) endif() # Only build the tests if alpaka is the top-level project and BUILD_TESTING is ON -if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND BUILD_TESTING) +if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND (BUILD_TESTING OR alpaka_BUILD_BENCHMARK)) add_subdirectory("test/") endif() diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake index b17fefa46539..137ba00322f3 100644 --- a/cmake/alpakaCommon.cmake +++ b/cmake/alpakaCommon.cmake @@ -200,7 +200,7 @@ else() "$<$,$,$>:SHELL:-Xcompiler -Og>" "$<$,$>:SHELL:-O0>" "$<$,$>:SHELL:/Od>") - + target_link_options(alpaka INTERFACE "$<$,$>:SHELL:-Og>" "$<$,$>:SHELL:-O0>") endif() @@ -358,7 +358,7 @@ endif() if(alpaka_ACC_GPU_CUDA_ENABLE) # Save the user-defined host compiler (if any) set(_alpaka_CUDA_HOST_COMPILER ${CMAKE_CUDA_HOST_COMPILER}) - + check_language(CUDA) if(CMAKE_CUDA_COMPILER) @@ -619,9 +619,9 @@ if(alpaka_ACC_SYCL_ENABLE) list(JOIN alpaka_SYCL_TARGETS "," alpaka_SYCL_TARGETS_CONCAT) alpaka_set_compiler_options(HOST_DEVICE target alpaka "-fsycl-targets=${alpaka_SYCL_TARGETS_CONCAT}") target_link_options(alpaka INTERFACE "-fsycl-targets=${alpaka_SYCL_TARGETS_CONCAT}") - + #----------------------------------------------------------------------------------------------------------------- - # Determine actual hardware to compile for + # Determine actual hardware to compile for if(alpaka_SYCL_ONEAPI_CPU) set(alpaka_SYCL_ONEAPI_CPU_ISA "avx2" CACHE STRING "Intel ISA to compile for") set_property(CACHE alpaka_SYCL_ONEAPI_CPU_ISA PROPERTY STRINGS "sse4.2;avx;avx2;avx512") @@ -663,7 +663,7 @@ if(alpaka_ACC_SYCL_ENABLE) PROPERTY STRINGS "intel_gpu_pvc;intel_gpu_acm_g12;intel_gpu_acm_g11;intel_gpu_acm_g10;intel_gpu_dg1;intel_gpu_adl_n;intel_gpu_adl_p;intel_gpu_rpl_s;intel_gpu_adl_s;intel_gpu_rkl;intel_gpu_tgllp;intel_gpu_icllp;intel_gpu_cml;intel_gpu_aml;intel_gpu_whl;intel_gpu_glk;intel_gpu_apl;intel_gpu_cfl;intel_gpu_kbl;intel_gpu_skl;intel_gpu_bdw") # If the user 
has given us a list turn all ';' into ',' to pacify the Intel OpenCL compiler. string(REPLACE ";" "," alpaka_SYCL_ONEAPI_GPU_DEVICES "${alpaka_SYCL_ONEAPI_GPU_DEVICES}") - + target_compile_definitions(alpaka INTERFACE "ALPAKA_SYCL_ONEAPI_GPU") endif() @@ -781,7 +781,7 @@ if(TARGET alpaka) # the alpaka library itself # SYSTEM voids showing warnings produced by alpaka when used in user applications. - if(BUILD_TESTING) + if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK) target_include_directories(alpaka INTERFACE ${_alpaka_INCLUDE_DIRECTORY}) else() target_include_directories(alpaka SYSTEM INTERFACE ${_alpaka_INCLUDE_DIRECTORY}) diff --git a/include/alpaka/test/KernelExecutionBenchmarkFixture.hpp b/include/alpaka/test/KernelExecutionBenchmarkFixture.hpp new file mode 100644 index 000000000000..ab159354168f --- /dev/null +++ b/include/alpaka/test/KernelExecutionBenchmarkFixture.hpp @@ -0,0 +1,118 @@ +/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include + +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA +# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA! +#endif + +#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP +# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP! +#endif + +#include +#include + +#include + +#include +#include + +namespace alpaka::test +{ + //! The fixture for executing a kernel on a given accelerator. 
+ template + class KernelExecutionBenchmarkFixture + { + public: + using Acc = TAcc; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + using Device = Dev; + using Platform = alpaka::Platform; + using Queue = test::DefaultQueue; + using WorkDiv = WorkDivMembers; + + KernelExecutionBenchmarkFixture(WorkDiv workDiv) : m_workDiv(std::move(workDiv)) + { + } + + template + KernelExecutionBenchmarkFixture(TExtent const& extent) + : KernelExecutionBenchmarkFixture(getValidWorkDiv( + getDevByIdx(0u), + extent, + Vec::ones(), + false, + GridBlockExtentSubDivRestrictions::Unrestricted)) + { + } + + template + auto operator()( + TKernelFnObj const& kernelFnObj, + std::string const& benchmarkName, + float& result, + TArgs&&... args) -> bool + { + // Allocate result buffers + auto bufAccResult = allocBuf(m_device, static_cast(1u)); + auto bufHostResult = allocBuf(m_devHost, static_cast(1u)); + + int numRuns = 0; + result = 0.0f; + + // The following block is executed an unknown number of times during the estimation phase, then once per benchmark sample + BENCHMARK_ADVANCED(std::string(benchmarkName))(Catch::Benchmark::Chronometer meter) + { + numRuns++; + memset(m_queue, bufAccResult, 0); + wait(m_queue); + + // Only the following part is measured as the benchmark part + meter.measure( + [&] + { + exec( + m_queue, + m_workDiv, + kernelFnObj, + getPtrNative(bufAccResult), + std::forward(args)...); // run the measured kernel + wait(m_queue); // wait for the kernel to actually run + }); + + // Copy the result value to the host + memcpy(m_queue, bufHostResult, bufAccResult); + wait(m_queue); + + auto const resultLocal = *getPtrNative(bufHostResult); + result += resultLocal; + return resultLocal; // make sure the benchmark call is not optimized away + }; + result /= static_cast(numRuns); + + return true; + // TODO: Can we return the result here and read it from Catch2's REQUIRE or something similar? Or are the + // returns limited to bools? 
+ // return result; + } + + protected: + PlatformCpu m_platformHost{}; + DevCpu m_devHost{getDevByIdx(m_platformHost, 0)}; + Platform m_platform{}; + Device m_device{getDevByIdx(m_platform, 0)}; + Queue m_queue{m_device}; + WorkDiv m_workDiv; + }; +} // namespace alpaka::test diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index be08194b4234..93ce25e9f551 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -14,6 +14,11 @@ add_subdirectory(common) list(APPEND _alpaka_TEST_OPTIONS --colour-mode default) -add_subdirectory(analysis) -add_subdirectory(integ) -add_subdirectory(unit) +if(BUILD_TESTING) + add_subdirectory(analysis) + add_subdirectory(integ) + add_subdirectory(unit) +endif() +if(alpaka_BUILD_BENCHMARK) + add_subdirectory(benchmark) +endif() diff --git a/test/analysis/headerCheck/CMakeLists.txt b/test/analysis/headerCheck/CMakeLists.txt index 9a2aade3b1de..ee6bcc4bafdf 100644 --- a/test/analysis/headerCheck/CMakeLists.txt +++ b/test/analysis/headerCheck/CMakeLists.txt @@ -44,6 +44,8 @@ if(alpaka_CHECK_HEADERS) PRIVATE common) set_target_properties(headerCheckTest PROPERTIES FOLDER "test/analysis") + # Catch2 benchmark macros must be defined, otherwise the benchmarking headers will not pass the check. + target_compile_definitions(${_TARGET_NAME} PUBLIC CATCH_CONFIG_ENABLE_BENCHMARKING) add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS}) diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt new file mode 100644 index 000000000000..55f8ad0d409f --- /dev/null +++ b/test/benchmark/CMakeLists.txt @@ -0,0 +1,13 @@ +# +# Copyright 2022 Jiri Vyskocil +# +# This file is part of alpaka. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+# + +cmake_minimum_required(VERSION 3.18) + +add_subdirectory("rand/") diff --git a/test/benchmark/rand/CMakeLists.txt b/test/benchmark/rand/CMakeLists.txt new file mode 100644 index 000000000000..81a43740dd6c --- /dev/null +++ b/test/benchmark/rand/CMakeLists.txt @@ -0,0 +1,33 @@ +# +# Copyright 2022 Jiri Vyskocil +# +# This file is part of alpaka. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# + +set(_TARGET_NAME "randBenchmark") + +append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE) + +alpaka_add_executable( + ${_TARGET_NAME} + ${_FILES_SOURCE}) +target_link_libraries( + ${_TARGET_NAME} + PRIVATE common) + +set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/benchmark") +target_compile_definitions(${_TARGET_NAME} PUBLIC CATCH_CONFIG_ENABLE_BENCHMARKING) + +if(alpaka_CI) + # For non-benchmarking CI test runs - It will only run the benchmark once to see if it works at all. + add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS} --benchmark-samples 1) + # Real automated benchmark runs will need to collect more samples (the default 100 is fine). The CI will then + # have to set another variable to indicate if it is only testing, or if it wants to do a full benchmark. +else() + # For full benchmark run - will collect 100 samples for good benchmark statistics. + add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS}) +endif() diff --git a/test/benchmark/rand/src/randBenchmark.cpp b/test/benchmark/rand/src/randBenchmark.cpp new file mode 100644 index 000000000000..45b2bcaa8338 --- /dev/null +++ b/test/benchmark/rand/src/randBenchmark.cpp @@ -0,0 +1,91 @@ +/* Copyright 2022 Jiri Vyskocil + * + * This file is part of alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include +#include +#include + +#include +#include +#include + +class RandBenchmarkKernel +{ +public: + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, TIdx numPoints) const + { + // Get the global linearized thread idx. + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const globalThreadExtent = alpaka::getWorkDiv(acc); + + auto const linearizedGlobalThreadIdx + = static_cast(alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent)[0]); + + // Set up the generator engine and distribution. + auto engine = alpaka::rand::engine::createDefault(acc, 42, linearizedGlobalThreadIdx); + auto dist(alpaka::rand::distribution::createUniformReal(acc)); + + float number = 0; + for(TIdx i = linearizedGlobalThreadIdx; i < numPoints; i += static_cast(globalThreadExtent.prod())) + { + number += dist(engine); + } + + alpaka::atomicAdd( + acc, + result, + number); // TODO: we're measuring the atomicAdd time too, this is not what we want + } +}; + +// TODO: This takes an enormous amount of time to finish and is probably useless anyway: +// TEMPLATE_LIST_TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]", alpaka::test::TestAccs) +// Running the benchmark on a single default accelerator instead. +TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]") +{ + // using Acc = TestType; + using Acc = alpaka::ExampleDefaultAcc, std::size_t>; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + using Vec = alpaka::Vec; + using WorkDiv = alpaka::WorkDivMembers; + + auto const platform = alpaka::Platform{}; + auto const dev = alpaka::getDevByIdx(platform, 0); + + Idx const numThreads = std::thread::hardware_concurrency(); // TODO: GPU? + std::cout << "Hardware threads: " << numThreads << std::endl; + +#ifdef ALPAKA_CI // Reduced benchmark set for automated test runs. 
+ unsigned const numPoints = GENERATE(10u, 1'000'000u); +#else + unsigned const numPoints = GENERATE(10u, 100000u, 1'000'000u, 10'000'000u, 100'000'000u, 1'000'000'000u); +#endif + + WorkDiv workdiv{alpaka::getValidWorkDiv( + dev, + Vec::all(numThreads * numThreads), + Vec::all(numThreads), + false, + alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)}; + + alpaka::test::KernelExecutionBenchmarkFixture fixture(workdiv); + + RandBenchmarkKernel kernel; + + float result = 0.0f; + + REQUIRE(fixture(kernel, "Random sequence N=" + std::to_string(numPoints), result, numPoints)); + // TODO: Actually check the result + std::cout << "\ntemp debug normalized result = " << result / static_cast(numPoints) + << " should probably converge to 0.5." << std::flush; +} diff --git a/thirdParty/CMakeLists.txt b/thirdParty/CMakeLists.txt index e8f03d283455..dfffc0ce78b8 100644 --- a/thirdParty/CMakeLists.txt +++ b/thirdParty/CMakeLists.txt @@ -3,7 +3,7 @@ # SPDX-License-Identifier: MPL-2.0 # -if(BUILD_TESTING) +if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK) if(alpaka_USE_INTERNAL_CATCH2) message(STATUS "Catch2: Using INTERNAL version 3.3.2") # Force Catch2's CMake to pick up the variables we set below