From 37351aa80dbf212ecd1dc6022cdd6f8afb888d66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20Vysko=C4=8Dil?= <jiri@vyskocil.com>
Date: Tue, 15 Nov 2022 13:37:20 +0100
Subject: [PATCH] Catch2 Benchmarking

---
 CMakeLists.txt                                |   5 +-
 cmake/alpakaCommon.cmake                      |  12 +-
 .../test/KernelExecutionBenchmarkFixture.hpp  | 118 ++++++++++++++++++
 test/CMakeLists.txt                           |  11 +-
 test/analysis/headerCheck/CMakeLists.txt      |   2 +
 test/benchmark/CMakeLists.txt                 |  13 ++
 test/benchmark/rand/CMakeLists.txt            |  33 +++++
 test/benchmark/rand/src/randBenchmark.cpp     |  91 ++++++++++++++
 thirdParty/CMakeLists.txt                     |   2 +-
 9 files changed, 275 insertions(+), 12 deletions(-)
 create mode 100644 include/alpaka/test/KernelExecutionBenchmarkFixture.hpp
 create mode 100644 test/benchmark/CMakeLists.txt
 create mode 100644 test/benchmark/rand/CMakeLists.txt
 create mode 100644 test/benchmark/rand/src/randBenchmark.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e0531010054a..e1bc7d0c8598 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,6 +41,7 @@ option(alpaka_BUILD_BENCHMARKS "Build the benchmarks" OFF)
 if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
     option(alpaka_ENABLE_WERROR "Treat all warnings as errors." OFF)
     option(BUILD_TESTING "Build the testing tree." OFF)
+    option(alpaka_BUILD_BENCHMARK "Build the Catch2 benchmarks located in test/benchmark." OFF)
     include(CTest)
 endif()
 
@@ -48,7 +49,7 @@ option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test
 
 include(CMakeDependentOption)
 cmake_dependent_option(alpaka_CHECK_HEADERS "Check all alpaka headers as part of the tests whether they can be compiled standalone." OFF BUILD_TESTING OFF)
-cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON BUILD_TESTING OFF)
+cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON "BUILD_TESTING OR alpaka_BUILD_BENCHMARK" OFF)
 
 ################################################################################
 # Internal variables.
@@ -154,7 +155,7 @@ if(alpaka_BUILD_BENCHMARKS)
 endif()
 
 # Only build the tests if alpaka is the top-level project and BUILD_TESTING is ON
-if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND BUILD_TESTING)
+if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND (BUILD_TESTING OR alpaka_BUILD_BENCHMARK))
     add_subdirectory("test/")
 endif()
 
diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake
index b17fefa46539..137ba00322f3 100644
--- a/cmake/alpakaCommon.cmake
+++ b/cmake/alpakaCommon.cmake
@@ -200,7 +200,7 @@ else()
                                                           "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:GNU>,$<COMPILE_LANGUAGE:CUDA>>:SHELL:-Xcompiler -Og>"
                                                           "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>>:SHELL:-O0>"
                                                           "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:MSVC>>:SHELL:/Od>")
-    
+
     target_link_options(alpaka INTERFACE "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:GNU>>:SHELL:-Og>"
                                          "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>>:SHELL:-O0>")
 endif()
@@ -358,7 +358,7 @@ endif()
 if(alpaka_ACC_GPU_CUDA_ENABLE)
     # Save the user-defined host compiler (if any)
     set(_alpaka_CUDA_HOST_COMPILER ${CMAKE_CUDA_HOST_COMPILER})
-    
+
     check_language(CUDA)
 
     if(CMAKE_CUDA_COMPILER)
@@ -619,9 +619,9 @@ if(alpaka_ACC_SYCL_ENABLE)
         list(JOIN alpaka_SYCL_TARGETS "," alpaka_SYCL_TARGETS_CONCAT)
         alpaka_set_compiler_options(HOST_DEVICE target alpaka "-fsycl-targets=${alpaka_SYCL_TARGETS_CONCAT}")
         target_link_options(alpaka INTERFACE "-fsycl-targets=${alpaka_SYCL_TARGETS_CONCAT}")
-        
+
         #-----------------------------------------------------------------------------------------------------------------
-        # Determine actual hardware to compile for 
+        # Determine actual hardware to compile for
         if(alpaka_SYCL_ONEAPI_CPU)
             set(alpaka_SYCL_ONEAPI_CPU_ISA "avx2" CACHE STRING "Intel ISA to compile for")
             set_property(CACHE alpaka_SYCL_ONEAPI_CPU_ISA PROPERTY STRINGS "sse4.2;avx;avx2;avx512")
@@ -663,7 +663,7 @@ if(alpaka_ACC_SYCL_ENABLE)
                         PROPERTY STRINGS "intel_gpu_pvc;intel_gpu_acm_g12;intel_gpu_acm_g11;intel_gpu_acm_g10;intel_gpu_dg1;intel_gpu_adl_n;intel_gpu_adl_p;intel_gpu_rpl_s;intel_gpu_adl_s;intel_gpu_rkl;intel_gpu_tgllp;intel_gpu_icllp;intel_gpu_cml;intel_gpu_aml;intel_gpu_whl;intel_gpu_glk;intel_gpu_apl;intel_gpu_cfl;intel_gpu_kbl;intel_gpu_skl;intel_gpu_bdw")
             # If the user has given us a list turn all ';' into ',' to pacify the Intel OpenCL compiler.
             string(REPLACE ";" "," alpaka_SYCL_ONEAPI_GPU_DEVICES "${alpaka_SYCL_ONEAPI_GPU_DEVICES}")
-            
+
             target_compile_definitions(alpaka INTERFACE "ALPAKA_SYCL_ONEAPI_GPU")
         endif()
 
@@ -781,7 +781,7 @@ if(TARGET alpaka)
 
     # the alpaka library itself
     # SYSTEM voids showing warnings produced by alpaka when used in user applications.
-    if(BUILD_TESTING)
+    if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
         target_include_directories(alpaka INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
     else()
         target_include_directories(alpaka SYSTEM INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
diff --git a/include/alpaka/test/KernelExecutionBenchmarkFixture.hpp b/include/alpaka/test/KernelExecutionBenchmarkFixture.hpp
new file mode 100644
index 000000000000..ab159354168f
--- /dev/null
+++ b/include/alpaka/test/KernelExecutionBenchmarkFixture.hpp
@@ -0,0 +1,134 @@
+/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#    error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#endif
+
+#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#    error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#endif
+
+#include <alpaka/test/Check.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+
+#include <catch2/benchmark/catch_benchmark.hpp>
+
+#include <string>
+#include <utility>
+
+namespace alpaka::test
+{
+    //! The fixture for benchmarking a kernel on a given accelerator with Catch2's BENCHMARK_ADVANCED macro.
+    //!
+    //! The kernel is launched as kernelFnObj(acc, float* result, args...). The float behind the result pointer
+    //! is zeroed before every measurement and copied back to the host after it.
+    template<typename TAcc>
+    class KernelExecutionBenchmarkFixture
+    {
+    public:
+        using Acc = TAcc;
+        using Dim = alpaka::Dim<Acc>;
+        using Idx = alpaka::Idx<Acc>;
+        using Device = Dev<Acc>;
+        using Platform = alpaka::Platform<Acc>;
+        using Queue = test::DefaultQueue<Device>;
+        using WorkDiv = WorkDivMembers<Dim, Idx>;
+
+        //! Uses the given work division for all benchmarked kernel launches.
+        KernelExecutionBenchmarkFixture(WorkDiv workDiv) : m_workDiv(std::move(workDiv))
+        {
+        }
+
+        //! Derives a valid work division for the given extent on the device owned by this fixture.
+        template<typename TExtent>
+        KernelExecutionBenchmarkFixture(TExtent const& extent)
+            : m_workDiv(getValidWorkDiv<Acc>(
+                m_device, // declared before m_workDiv, hence already initialized here
+                extent,
+                Vec<Dim, Idx>::ones(),
+                false,
+                GridBlockExtentSubDivRestrictions::Unrestricted))
+        {
+        }
+
+        //! Benchmarks the kernel and reports the mean of the result values it produced.
+        //!
+        //! \param kernelFnObj The kernel function object to benchmark.
+        //! \param benchmarkName The name handed to Catch2 for this benchmark.
+        //! \param result Receives the mean result value per kernel run, or 0 if the kernel was never run.
+        //! \param args Additional kernel arguments, passed to the kernel after the result pointer.
+        //! \return Always true.
+        template<typename TKernelFnObj, typename... TArgs>
+        auto operator()(
+            TKernelFnObj const& kernelFnObj,
+            std::string const& benchmarkName,
+            float& result,
+            TArgs&&... args) -> bool
+        {
+            // Allocate result buffers
+            auto bufAccResult = allocBuf<float, Idx>(m_device, static_cast<Idx>(1u));
+            auto bufHostResult = allocBuf<float, Idx>(m_devHost, static_cast<Idx>(1u));
+
+            int numRuns = 0;
+            result = 0.0f;
+
+            // The following block is executed an unknown number of times during the estimation phase, then once
+            // per benchmark sample
+            BENCHMARK_ADVANCED(std::string(benchmarkName))(Catch::Benchmark::Chronometer meter)
+            {
+                // meter.measure() below invokes the kernel meter.runs() times on the same, once-zeroed buffer
+                numRuns += meter.runs();
+                memset(m_queue, bufAccResult, 0);
+                wait(m_queue);
+
+                // Only the following part is measured as the benchmark part
+                meter.measure(
+                    [&]
+                    {
+                        // The arguments are passed as lvalues on purpose: this lambda is invoked repeatedly, so
+                        // forwarding an rvalue would hand a moved-from object to every run after the first one.
+                        exec<Acc>(m_queue, m_workDiv, kernelFnObj, getPtrNative(bufAccResult), args...);
+                        wait(m_queue); // wait for the kernel to actually run
+                    });
+
+                // Copy the result value to the host
+                memcpy(m_queue, bufHostResult, bufAccResult);
+                wait(m_queue);
+
+                auto const resultLocal = *getPtrNative(bufHostResult);
+                result += resultLocal;
+                return resultLocal; // make sure the benchmark call is not optimized away
+            };
+
+            // The block above may not have run at all (e.g. if benchmarks are skipped); do not divide 0 by 0 then.
+            if(numRuns > 0)
+            {
+                result /= static_cast<float>(numRuns);
+            }
+
+            return true;
+            // TODO: Can we return the result here and read it from Catch2's REQUIRE or something similar? Or are the
+            // returns limited to bools?
+            //            return result;
+        }
+
+    protected:
+        // The declaration order matters: m_device is used to initialize m_queue and (in one constructor) m_workDiv.
+        PlatformCpu m_platformHost{};
+        DevCpu m_devHost{getDevByIdx(m_platformHost, 0)};
+        Platform m_platform{};
+        Device m_device{getDevByIdx(m_platform, 0)};
+        Queue m_queue{m_device};
+        WorkDiv m_workDiv;
+    };
+} // namespace alpaka::test
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index be08194b4234..93ce25e9f551 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -14,6 +14,11 @@ add_subdirectory(common)
 
 list(APPEND _alpaka_TEST_OPTIONS --colour-mode default)
 
-add_subdirectory(analysis)
-add_subdirectory(integ)
-add_subdirectory(unit)
+if(BUILD_TESTING)
+    add_subdirectory(analysis)
+    add_subdirectory(integ)
+    add_subdirectory(unit)
+endif()
+if(alpaka_BUILD_BENCHMARK)
+    add_subdirectory(benchmark)
+endif()
diff --git a/test/analysis/headerCheck/CMakeLists.txt b/test/analysis/headerCheck/CMakeLists.txt
index 9a2aade3b1de..ee6bcc4bafdf 100644
--- a/test/analysis/headerCheck/CMakeLists.txt
+++ b/test/analysis/headerCheck/CMakeLists.txt
@@ -44,6 +44,8 @@ if(alpaka_CHECK_HEADERS)
         PRIVATE common)
 
     set_target_properties(headerCheckTest PROPERTIES FOLDER "test/analysis")
+    # Catch2 benchmark macros must be defined, otherwise the benchmarking headers will not pass the check.
+    target_compile_definitions(${_TARGET_NAME} PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)
 
     add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS})
 
diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt
new file mode 100644
index 000000000000..55f8ad0d409f
--- /dev/null
+++ b/test/benchmark/CMakeLists.txt
@@ -0,0 +1,13 @@
+#
+# Copyright 2022 Jiri Vyskocil
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+cmake_minimum_required(VERSION 3.18)
+
+add_subdirectory("rand/")
diff --git a/test/benchmark/rand/CMakeLists.txt b/test/benchmark/rand/CMakeLists.txt
new file mode 100644
index 000000000000..81a43740dd6c
--- /dev/null
+++ b/test/benchmark/rand/CMakeLists.txt
@@ -0,0 +1,33 @@
+#
+# Copyright 2022 Jiri Vyskocil
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+set(_TARGET_NAME "randBenchmark")
+
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    ${_FILES_SOURCE})
+target_link_libraries(
+    ${_TARGET_NAME}
+    PRIVATE common)
+
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/benchmark")
+target_compile_definitions(${_TARGET_NAME} PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)
+
+if(alpaka_CI)
+    # For non-benchmarking CI test runs - It will only run the benchmark once to see if it works at all.
+    add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS} --benchmark-samples 1)
+    # Real automated benchmark runs will need to collect more samples (the default 100 is fine). The CI will then
+    # have to set another variable to indicate if it is only testing, or if it wants to do a full benchmark.
+else()
+    # For full benchmark run - will collect 100 samples for good benchmark statistics.
+    add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS})
+endif()
diff --git a/test/benchmark/rand/src/randBenchmark.cpp b/test/benchmark/rand/src/randBenchmark.cpp
new file mode 100644
index 000000000000..45b2bcaa8338
--- /dev/null
+++ b/test/benchmark/rand/src/randBenchmark.cpp
@@ -0,0 +1,96 @@
+/* Copyright 2022 Jiri Vyskocil
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/rand/Traits.hpp>
+#include <alpaka/test/KernelExecutionBenchmarkFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/generators/catch_generators.hpp>
+
+#include <algorithm>
+#include <iostream>
+#include <string>
+#include <thread>
+
+//! Kernel in which every thread sums up uniformly distributed random floats and atomically adds the sum to *result.
+class RandBenchmarkKernel
+{
+public:
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc, typename TIdx>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, TIdx numPoints) const
+    {
+        // Linearize the position of this thread within the whole grid.
+        auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+        auto const gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const linearIdx = static_cast<TIdx>(alpaka::mapIdx<1u>(gridThreadIdx, gridThreadExtent)[0]);
+        // Stepping by the total number of threads makes all threads together cover [0, numPoints).
+        auto const stride = static_cast<TIdx>(gridThreadExtent.prod());
+
+        // Generator engine (fixed seed 42, linear thread index as third argument) and distribution.
+        auto engine = alpaka::rand::engine::createDefault(acc, 42, linearIdx);
+        auto dist = alpaka::rand::distribution::createUniformReal<float>(acc);
+
+        // Draw one random number for each point handled by this thread.
+        float sum = 0;
+        for(TIdx i = linearIdx; i < numPoints; i += stride)
+        {
+            sum += dist(engine);
+        }
+
+        // TODO: we're measuring the atomicAdd time too, this is not what we want
+        alpaka::atomicAdd(acc, result, sum);
+    }
+};
+
+// TODO: This takes an enormous time to finish and is probably useless anyway:
+//   TEMPLATE_LIST_TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]", alpaka::test::TestAccs)
+// Running the benchmark on a single default accelerator instead
+TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]")
+{
+    //    using Acc = TestType;
+    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, std::size_t>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+
+    auto const platform = alpaka::Platform<Acc>{};
+    auto const dev = alpaka::getDevByIdx(platform, 0);
+
+    // hardware_concurrency() may return 0 when the value is not computable; fall back to a single thread then.
+    Idx const numThreads = std::max<Idx>(1, std::thread::hardware_concurrency()); // TODO: GPU?
+    std::cout << "Hardware threads: " << numThreads << std::endl;
+
+#ifdef ALPAKA_CI // Reduced benchmark set for automated test runs.
+    unsigned const numPoints = GENERATE(10u, 1'000'000u);
+#else
+    unsigned const numPoints = GENERATE(10u, 100000u, 1'000'000u, 10'000'000u, 100'000'000u, 1'000'000'000u);
+#endif
+
+    WorkDiv workdiv{alpaka::getValidWorkDiv<Acc>(
+        dev,
+        Vec::all(numThreads * numThreads),
+        Vec::all(numThreads),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};
+
+    alpaka::test::KernelExecutionBenchmarkFixture<Acc> fixture(workdiv);
+    RandBenchmarkKernel kernel;
+
+    float result = 0.0f;
+
+    REQUIRE(fixture(kernel, "Random sequence N=" + std::to_string(numPoints), result, numPoints));
+    // TODO: Actually check the result
+    std::cout << "\ntemp debug normalized result = " << result / static_cast<float>(numPoints)
+              << " should probably converge to 0.5." << std::flush;
+}
diff --git a/thirdParty/CMakeLists.txt b/thirdParty/CMakeLists.txt
index e8f03d283455..dfffc0ce78b8 100644
--- a/thirdParty/CMakeLists.txt
+++ b/thirdParty/CMakeLists.txt
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: MPL-2.0
 #
 
-if(BUILD_TESTING)
+if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
     if(alpaka_USE_INTERNAL_CATCH2)
         message(STATUS "Catch2: Using INTERNAL version 3.3.2")
         # Force Catch2's CMake to pick up the variables we set below