Catch2 Benchmarking

alpaka-group · May 12, 2022 · 6e4fd58 · 6e4fd58
1 parent 2fc0f04
commit 6e4fd58
Show file tree

Hide file tree

Showing 9 changed files with 257 additions and 8 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -41,6 +41,8 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 option(alpaka_BUILD_EXAMPLES "Build the examples" OFF)
 
+option(alpaka_BUILD_BENCHMARK "Build the benchmarks." OFF)
+
 option(BUILD_TESTING "Build the testing tree." OFF)
 
 option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test. Attention, headers are not designed for production code, see documentation." OFF)
@@ -138,7 +140,8 @@ endif()
 if(alpaka_BUILD_EXAMPLES)
     add_subdirectory("example/")
 endif()
-if(BUILD_TESTING)
+if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
+    enable_testing()
     add_subdirectory("test/")
 endif()
 

diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake
@@ -617,7 +617,7 @@ if(alpaka_ACC_SYCL_ENABLE)
 
     # Enable device-side printing to stdout
     cmake_dependent_option(alpaka_SYCL_ENABLE_IOSTREAM "Enable device-side printing to stdout" OFF "alpaka_ACC_SYCL_ENABLE" OFF)
-    if(BUILD_TESTING)
+    if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
         set(alpaka_SYCL_ENABLE_IOSTREAM ON CACHE BOOL "Enable device-side printing to stdout" FORCE)
     endif()
 
@@ -851,7 +851,7 @@ if(TARGET alpaka)
 
     # the alpaka library itself
     # SYSTEM voids showing warnings produced by alpaka when used in user applications.
-    if(BUILD_TESTING)
+    if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
         target_include_directories(alpaka INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
     else()
         target_include_directories(alpaka SYSTEM INTERFACE ${_alpaka_INCLUDE_DIRECTORY})

diff --git a/include/alpaka/test/KernelExecutionBenchmarkFixture.hpp b/include/alpaka/test/KernelExecutionBenchmarkFixture.hpp
@@ -0,0 +1,115 @@
+/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
+#    error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
+#endif
+
+#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
+#    error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
+#endif
+
+#include <alpaka/test/Check.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+
+#include <catch2/catch.hpp>
+
+#include <string>
+#include <utility>
+
+namespace alpaka::test
+{
+    //! Fixture that runs a kernel on the given accelerator as a Catch2 benchmark.
+    template<typename TAcc>
+    class KernelExecutionBenchmarkFixture
+    {
+    public:
+        using Acc = TAcc;
+        using Dim = alpaka::Dim<Acc>;
+        using Idx = alpaka::Idx<Acc>;
+        using DevAcc = Dev<Acc>;
+        using PltfAcc = Pltf<DevAcc>;
+        using QueueAcc = test::DefaultQueue<DevAcc>;
+        using WorkDiv = WorkDivMembers<Dim, Idx>;
+
+        //! Uses host device 0, accelerator device 0 and the given work division.
+        KernelExecutionBenchmarkFixture(WorkDiv workDiv)
+            : m_devHost(getDevByIdx<PltfCpu>(0u))
+            , m_devAcc(getDevByIdx<PltfAcc>(0u))
+            , m_queue(m_devAcc)
+            , m_workDiv(std::move(workDiv))
+        {
+        }
+
+        //! Obtains the work division from getValidWorkDiv for the given extent on accelerator device 0.
+        template<typename TExtent>
+        KernelExecutionBenchmarkFixture(TExtent const& extent)
+            : KernelExecutionBenchmarkFixture(getValidWorkDiv<Acc>(
+                getDevByIdx<PltfAcc>(0u),
+                extent,
+                Vec<Dim, Idx>::ones(),
+                false,
+                GridBlockExtentSubDivRestrictions::Unrestricted))
+        {
+        }
+
+        //! Runs the kernel as a Catch2 benchmark named benchmarkName and stores in result the mean of the
+        //! values read back from the device result slot (left at 0 if the benchmark body never ran).
+        //! The kernel is called with a float* to that slot followed by args; args are passed as lvalues
+        //! because the kernel is launched repeatedly. Always returns true so the call can sit in a REQUIRE.
+        template<typename TKernelFnObj, typename... TArgs>
+        auto operator()(TKernelFnObj const& kernelFnObj, std::string const& benchmarkName, float& result, TArgs&&... args) -> bool
+        {
+            auto bufAccResult = allocBuf<float, Idx>(m_devAcc, static_cast<Idx>(1u));
+            auto bufHostResult = allocBuf<float, Idx>(m_devHost, static_cast<Idx>(1u));
+            int numRuns = 0;
+            result = 0.0f;
+
+            // The following block is executed unknown times during estimation phase, then once per benchmark sample
+            BENCHMARK_ADVANCED(std::string(benchmarkName))(Catch::Benchmark::Chronometer meter)
+            {
+                numRuns++;
+                memset(m_queue, bufAccResult, 0u); // byte-wise fill; all-zero bytes read back as 0.0f
+                wait(m_queue);
+
+                // Only the callable handed to meter.measure() is timed.
+                meter.measure(
+                    [&]
+                    {
+                        // No std::forward here: the lambda may run many times, so nothing may be moved from.
+                        exec<Acc>(m_queue, m_workDiv, kernelFnObj, getPtrNative(bufAccResult), args...);
+                        wait(m_queue); // wait for the kernel to actually run
+                    });
+
+                // Copy the result value to the host
+                memcpy(m_queue, bufHostResult, bufAccResult);
+                wait(m_queue);
+
+                auto const resultLocal = *getPtrNative(bufHostResult);
+                result += resultLocal;
+                return resultLocal; // make sure the benchmark call is not optimized away
+            };
+            if(numRuns > 0) // the body may never have run; avoid 0/0 = NaN
+                result /= static_cast<float>(numRuns);
+
+            // TODO: Can the averaged result be returned and checked via REQUIRE instead of always returning true?
+            return true;
+        }
+
+    protected:
+        DevCpu m_devHost;
+        DevAcc m_devAcc;
+        QueueAcc m_queue;
+        WorkDiv m_workDiv;
+    };
+} // namespace alpaka::test
diff --git a/include/alpaka/test/KernelExecutionFixture.hpp b/include/alpaka/test/KernelExecutionFixture.hpp
@@ -77,7 +77,7 @@ namespace alpaka::test
             return result;
         }
 
-    private:
+    protected:
         DevCpu m_devHost;
         DevAcc m_devAcc;
         QueueAcc m_queue;

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright 2015-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
+# Copyright 2015-2022 Benjamin Worpitz, Axel Huebl, Jan Stephan, Jiri Vyskocil
 #
 # This file is part of alpaka.
 #
@@ -21,6 +21,11 @@ add_subdirectory(common)
 
 list(APPEND _alpaka_TEST_OPTIONS --use-colour yes)
 
-add_subdirectory(analysis)
-add_subdirectory(integ)
-add_subdirectory(unit)
+if(BUILD_TESTING)
+    add_subdirectory(analysis)
+    add_subdirectory(integ)
+    add_subdirectory(unit)
+endif()
+if(alpaka_BUILD_BENCHMARK)
+    add_subdirectory(benchmark)
+endif()
diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt
@@ -0,0 +1,13 @@
+#
+# Copyright 2022 Jiri Vyskocil
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+cmake_minimum_required(VERSION 3.18)
+
+add_subdirectory("rand/")
diff --git a/test/benchmark/rand/CMakeLists.txt b/test/benchmark/rand/CMakeLists.txt
@@ -0,0 +1,29 @@
+#
+# Copyright 2022 Jiri Vyskocil
+#
+# This file is part of alpaka.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+set(_TARGET_NAME "randBenchmark")
+
+append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)
+
+alpaka_add_executable(
+    ${_TARGET_NAME}
+    ${_FILES_SOURCE})
+target_link_libraries(
+    ${_TARGET_NAME}
+    PRIVATE common)
+set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/benchmark")
+target_compile_definitions(${_TARGET_NAME} PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)
+
+# Register with CTest; when alpaka_CI is set, limit Catch2 to a single sample per benchmark.
+set(_BENCHMARK_OPTIONS ${_alpaka_TEST_OPTIONS})
+if(alpaka_CI)
+    list(APPEND _BENCHMARK_OPTIONS --benchmark-samples 1)
+endif()
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_BENCHMARK_OPTIONS})
diff --git a/test/benchmark/rand/src/randBenchmark.cpp b/test/benchmark/rand/src/randBenchmark.cpp
@@ -0,0 +1,80 @@
+/* Copyright 2022 Jiri Vyskocil
+ *
+ * This file is part of alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+#include <alpaka/rand/Traits.hpp>
+#include <alpaka/test/KernelExecutionBenchmarkFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+
+#include <catch2/catch.hpp>
+
+//! Every thread sums its share of numPoints uniform random floats and atomically adds it to *result.
+class RandBenchmarkKernel
+{
+public:
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc, typename TIdx>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, TIdx numPoints) const
+    {
+        // Linear index of this thread within the grid and the total number of grid threads.
+        auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
+        auto const gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        auto const linearIdx = static_cast<TIdx>(alpaka::mapIdx<1u>(gridThreadIdx, gridThreadExtent)[0]);
+        auto const stride = static_cast<TIdx>(gridThreadExtent.prod());
+
+        // Fixed seed 42; the linear thread index is passed as the third engine argument.
+        auto engine = alpaka::rand::engine::createDefault(acc, 42, linearIdx);
+        auto dist = alpaka::rand::distribution::createUniformReal<float>(acc);
+
+        // Threads walk the points with a stride of the total thread count.
+        float partialSum = 0;
+        for(TIdx i = linearIdx; i < numPoints; i += stride)
+            partialSum += dist(engine);
+
+        // TODO: we're measuring the atomicAdd time too, this is not what we want
+        alpaka::atomicAdd(acc, result, partialSum);
+    }
+};
+
+// TODO: This takes an enormous time to finish and is probably useless anyway:
+//   TEMPLATE_LIST_TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]", alpaka::test::TestAccs)
+// Running the benchmark on a single default accelerator instead
+TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]")
+{
+    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, std::size_t>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+    using Vec = alpaka::Vec<Dim, Idx>;
+    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+
+    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);
+
+    // hardware_concurrency() returns 0 when the value is not computable; fall back to one thread so
+    // that the extents handed to getValidWorkDiv below are never zero.
+    Idx const hwThreads = std::thread::hardware_concurrency(); // TODO: GPU?
+    Idx const numThreads = hwThreads == 0 ? Idx{1} : hwThreads;
+    std::cout << "Hardware threads: " << numThreads << std::endl;
+
+    unsigned const numPoints = GENERATE(100'000u, 1'000'000u, 10'000'000u, 100'000'000u, 1'000'000'000u);
+
+    WorkDiv workdiv{alpaka::getValidWorkDiv<Acc>(
+        devAcc,
+        Vec::all(numThreads * numThreads),
+        Vec::all(numThreads),
+        false,
+        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};
+
+    alpaka::test::KernelExecutionBenchmarkFixture<Acc> fixture(workdiv);
+    RandBenchmarkKernel kernel;
+    float result = 0.0f;
+
+    REQUIRE(fixture(kernel, "Random sequence N=" + std::to_string(numPoints), result, numPoints));
+    // TODO: Actually check the result
+    std::cout << "\ntemp debug normalized result = " << result / static_cast<float>(numPoints) << " should probably converge to 0.5." << std::flush;
+}
diff --git a/test/catch_main/CMakeLists.txt b/test/catch_main/CMakeLists.txt
@@ -27,6 +27,10 @@ set_target_properties(CatchMain PROPERTIES
     WINDOWS_EXPORT_ALL_SYMBOLS ON
 )
 
+if(alpaka_BUILD_BENCHMARK)
+    target_compile_definitions(CatchMain PUBLIC CATCH_CONFIG_ENABLE_BENCHMARKING)
+endif()
+
 target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_FAST_COMPILE")
 if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
     # Workaround for STL atomic issue: https://forums.developer.nvidia.com/t/support-for-atomic-in-libstdc-missing/135403/2