Skip to content

Commit

Permalink
Catch2 Benchmarking
Browse files Browse the repository at this point in the history
  • Loading branch information
sliwowitz committed Mar 22, 2024
1 parent 7a8b205 commit 37351aa
Show file tree
Hide file tree
Showing 9 changed files with 275 additions and 12 deletions.
5 changes: 3 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,15 @@ option(alpaka_BUILD_BENCHMARKS "Build the benchmarks" OFF)
# The options below are only declared when this project's name equals the name of the top-level project.
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
option(alpaka_ENABLE_WERROR "Treat all warnings as errors." OFF)
option(BUILD_TESTING "Build the testing tree." OFF)
# NOTE(review): an option named alpaka_BUILD_BENCHMARKS (plural) is already declared earlier in this file.
# Confirm that two separate switches are intended, the near-identical names are easy to mix up.
option(alpaka_BUILD_BENCHMARK "Build the benchmarks." OFF)
include(CTest)
endif()

option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test. Attention, headers are not designed for production code, see documentation." OFF)

include(CMakeDependentOption)
cmake_dependent_option(alpaka_CHECK_HEADERS "Check all alpaka headers as part of the tests whether they can be compiled standalone." OFF BUILD_TESTING OFF)
cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON BUILD_TESTING OFF)
cmake_dependent_option(alpaka_USE_INTERNAL_CATCH2 "Use internally shipped Catch2" ON "BUILD_TESTING OR alpaka_BUILD_BENCHMARK" OFF)

################################################################################
# Internal variables.
Expand Down Expand Up @@ -154,7 +155,7 @@ if(alpaka_BUILD_BENCHMARKS)
endif()

# Only build the test tree if alpaka is the top-level project and BUILD_TESTING or alpaka_BUILD_BENCHMARK is ON
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND BUILD_TESTING)
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND (BUILD_TESTING OR alpaka_BUILD_BENCHMARK))
add_subdirectory("test/")
endif()

Expand Down
12 changes: 6 additions & 6 deletions cmake/alpakaCommon.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ else()
"$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:GNU>,$<COMPILE_LANGUAGE:CUDA>>:SHELL:-Xcompiler -Og>"
"$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>>:SHELL:-O0>"
"$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:MSVC>>:SHELL:/Od>")

target_link_options(alpaka INTERFACE "$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:GNU>>:SHELL:-Og>"
"$<$<AND:$<CONFIG:Debug>,$<CXX_COMPILER_ID:Clang,AppleClang,IntelLLVM>>:SHELL:-O0>")
endif()
Expand Down Expand Up @@ -358,7 +358,7 @@ endif()
if(alpaka_ACC_GPU_CUDA_ENABLE)
# Save the user-defined host compiler (if any)
set(_alpaka_CUDA_HOST_COMPILER ${CMAKE_CUDA_HOST_COMPILER})

check_language(CUDA)

if(CMAKE_CUDA_COMPILER)
Expand Down Expand Up @@ -619,9 +619,9 @@ if(alpaka_ACC_SYCL_ENABLE)
list(JOIN alpaka_SYCL_TARGETS "," alpaka_SYCL_TARGETS_CONCAT)
alpaka_set_compiler_options(HOST_DEVICE target alpaka "-fsycl-targets=${alpaka_SYCL_TARGETS_CONCAT}")
target_link_options(alpaka INTERFACE "-fsycl-targets=${alpaka_SYCL_TARGETS_CONCAT}")

#-----------------------------------------------------------------------------------------------------------------
# Determine actual hardware to compile for
# Determine actual hardware to compile for
if(alpaka_SYCL_ONEAPI_CPU)
set(alpaka_SYCL_ONEAPI_CPU_ISA "avx2" CACHE STRING "Intel ISA to compile for")
set_property(CACHE alpaka_SYCL_ONEAPI_CPU_ISA PROPERTY STRINGS "sse4.2;avx;avx2;avx512")
Expand Down Expand Up @@ -663,7 +663,7 @@ if(alpaka_ACC_SYCL_ENABLE)
PROPERTY STRINGS "intel_gpu_pvc;intel_gpu_acm_g12;intel_gpu_acm_g11;intel_gpu_acm_g10;intel_gpu_dg1;intel_gpu_adl_n;intel_gpu_adl_p;intel_gpu_rpl_s;intel_gpu_adl_s;intel_gpu_rkl;intel_gpu_tgllp;intel_gpu_icllp;intel_gpu_cml;intel_gpu_aml;intel_gpu_whl;intel_gpu_glk;intel_gpu_apl;intel_gpu_cfl;intel_gpu_kbl;intel_gpu_skl;intel_gpu_bdw")
# If the user has given us a list turn all ';' into ',' to pacify the Intel OpenCL compiler.
string(REPLACE ";" "," alpaka_SYCL_ONEAPI_GPU_DEVICES "${alpaka_SYCL_ONEAPI_GPU_DEVICES}")

target_compile_definitions(alpaka INTERFACE "ALPAKA_SYCL_ONEAPI_GPU")
endif()

Expand Down Expand Up @@ -781,7 +781,7 @@ if(TARGET alpaka)

# the alpaka library itself
# SYSTEM avoids showing warnings produced by alpaka when used in user applications.
if(BUILD_TESTING)
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
target_include_directories(alpaka INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
else()
target_include_directories(alpaka SYSTEM INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
Expand Down
118 changes: 118 additions & 0 deletions include/alpaka/test/KernelExecutionBenchmarkFixture.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber
*
* This file is part of alpaka.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#pragma once

#include <alpaka/alpaka.hpp>

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
#endif

#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
#endif

#include <alpaka/test/Check.hpp>
#include <alpaka/test/queue/Queue.hpp>

#include <catch2/benchmark/catch_benchmark.hpp>

#include <string>
#include <utility>

namespace alpaka::test
{
//! The fixture for executing a kernel on a given accelerator.
template<typename TAcc>
class KernelExecutionBenchmarkFixture
{
public:
using Acc = TAcc;
using Dim = alpaka::Dim<Acc>;
using Idx = alpaka::Idx<Acc>;
using Device = Dev<Acc>;
using Platform = alpaka::Platform<Acc>;
using Queue = test::DefaultQueue<Device>;
using WorkDiv = WorkDivMembers<Dim, Idx>;

KernelExecutionBenchmarkFixture(WorkDiv workDiv) : m_workDiv(std::move(workDiv))
{
}

template<typename TExtent>
KernelExecutionBenchmarkFixture(TExtent const& extent)
: KernelExecutionBenchmarkFixture(getValidWorkDiv<Acc>(
getDevByIdx<Acc>(0u),
extent,
Vec<Dim, Idx>::ones(),
false,
GridBlockExtentSubDivRestrictions::Unrestricted))
{
}

template<typename TKernelFnObj, typename... TArgs>
auto operator()(
TKernelFnObj const& kernelFnObj,
std::string const& benchmarkName,
float& result,
TArgs&&... args) -> bool
{
// Allocate result buffers
auto bufAccResult = allocBuf<float, Idx>(m_device, static_cast<Idx>(1u));
auto bufHostResult = allocBuf<float, Idx>(m_devHost, static_cast<Idx>(1u));

int numRuns = 0;
result = 0.0f;

// The following block is executed unknown times during estimation phase, then once per benchmark sample
BENCHMARK_ADVANCED(std::string(benchmarkName))(Catch::Benchmark::Chronometer meter)
{
numRuns++;
memset(m_queue, bufAccResult, 0);
wait(m_queue);

// Only the following part is measured as the benchmark part
meter.measure(
[&]
{
exec<Acc>(
m_queue,
m_workDiv,
kernelFnObj,
getPtrNative(bufAccResult),
std::forward<TArgs>(args)...); // run the measured kernel
wait(m_queue); // wait for the kernel to actually run
});

// Copy the result value to the host
memcpy(m_queue, bufHostResult, bufAccResult);
wait(m_queue);

auto const resultLocal = *getPtrNative(bufHostResult);
result += resultLocal;
return resultLocal; // make sure the benchmark call is not optimized away
};
result /= static_cast<float>(numRuns);

return true;
// TODO: Can we return the result here and read it from Catch2's REQUIRE or something similar? Or are the
// returns limited to bools?
// return result;
}

protected:
PlatformCpu m_platformHost{};
DevCpu m_devHost{getDevByIdx(m_platformHost, 0)};
Platform m_platform{};
Device m_device{getDevByIdx(m_platform, 0)};
Queue m_queue{m_device};
WorkDiv m_workDiv;
};
} // namespace alpaka::test
11 changes: 8 additions & 3 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ add_subdirectory(common)

list(APPEND _alpaka_TEST_OPTIONS --colour-mode default)

add_subdirectory(analysis)
add_subdirectory(integ)
add_subdirectory(unit)
# The analysis, integration and unit test directories are only added when BUILD_TESTING is enabled.
if(BUILD_TESTING)
add_subdirectory(analysis)
add_subdirectory(integ)
add_subdirectory(unit)
endif()
# The benchmark directory is controlled separately through alpaka_BUILD_BENCHMARK.
if(alpaka_BUILD_BENCHMARK)
add_subdirectory(benchmark)
endif()
2 changes: 2 additions & 0 deletions test/analysis/headerCheck/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ if(alpaka_CHECK_HEADERS)
PRIVATE common)

set_target_properties(headerCheckTest PROPERTIES FOLDER "test/analysis")
# Catch2 benchmark macros must be defined, otherwise the benchmarking headers will not pass the check.
target_compile_definitions(${_TARGET_NAME} PUBLIC CATCH_CONFIG_ENABLE_BENCHMARKING)

add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_alpaka_TEST_OPTIONS})

Expand Down
13 changes: 13 additions & 0 deletions test/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#
# Copyright 2022 Jiri Vyskocil
#
# This file is part of alpaka.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# NOTE(review): calling cmake_minimum_required() here sets the policy defaults for this directory and below.
# Confirm that the version is meant to differ from, or match, the one of the top-level CMakeLists.txt.
cmake_minimum_required(VERSION 3.18)

# Every benchmark lives in its own subdirectory.
add_subdirectory(rand)
33 changes: 33 additions & 0 deletions test/benchmark/rand/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#
# Copyright 2022 Jiri Vyskocil
#
# This file is part of alpaka.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# Name of the benchmark executable. The same name is used for the registered test.
set(_TARGET_NAME "randBenchmark")

# Presumably collects the *.cpp files below src/ into _FILES_SOURCE and assigns them to a source group;
# the helper is defined elsewhere in the project - verify against its definition.
append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)

alpaka_add_executable(
    ${_TARGET_NAME}
    ${_FILES_SOURCE})
target_link_libraries(
    ${_TARGET_NAME}
    PRIVATE common)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/benchmark")
# The definition is only needed to compile this executable, hence PRIVATE.
target_compile_definitions(${_TARGET_NAME} PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)

# Command line options handed to the benchmark executable when it is run as a test.
set(_BENCHMARK_OPTIONS ${_alpaka_TEST_OPTIONS})
if(alpaka_CI)
    # For non-benchmarking CI test runs - It will only run the benchmark once to see if it works at all.
    # Real automated benchmark runs will need to collect more samples (the default 100 is fine). The CI will then
    # have to set another variable to indicate if it is only testing, or if it wants to do a full benchmark.
    list(APPEND _BENCHMARK_OPTIONS --benchmark-samples 1)
endif()

# Without the CI restriction the full benchmark is run and collects 100 samples for good benchmark statistics.
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_BENCHMARK_OPTIONS})
91 changes: 91 additions & 0 deletions test/benchmark/rand/src/randBenchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/* Copyright 2022 Jiri Vyskocil
*
* This file is part of alpaka.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#include <alpaka/example/ExampleDefaultAcc.hpp>
#include <alpaka/rand/Traits.hpp>
#include <alpaka/test/KernelExecutionBenchmarkFixture.hpp>
#include <alpaka/test/acc/TestAccs.hpp>

#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>
#include <catch2/generators/catch_generators.hpp>

//! Kernel that draws uniformly distributed random floats and adds their sum to a shared result value.
class RandBenchmarkKernel
{
public:
    //! \param acc The accelerator the kernel runs on.
    //! \param result Pointer to the value each thread atomically adds its partial sum to.
    //! \param numPoints The total number of random numbers to draw across all threads.
    ALPAKA_NO_HOST_ACC_WARNING
    template<typename TAcc, typename TIdx>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, TIdx numPoints) const
    {
        // Position of this thread within the grid and the extent of the grid, both counted in threads.
        auto const gridThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
        auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

        // Linear index of this thread. It is the first point handled by the thread and the generator subsequence.
        auto const firstPoint = static_cast<TIdx>(alpaka::mapIdx<1u>(gridThreadIdx, gridThreadExtent)[0]);

        // Distance between two consecutive points handled by the same thread.
        auto const stride = static_cast<TIdx>(gridThreadExtent.prod());

        // Every thread uses the same seed, but its own subsequence.
        auto engine = alpaka::rand::engine::createDefault(acc, 42, firstPoint);
        auto dist = alpaka::rand::distribution::createUniformReal<float>(acc);

        float partialSum = 0.0f;
        for(TIdx point = firstPoint; point < numPoints; point += stride)
        {
            partialSum += dist(engine);
        }

        // TODO: we're measuring the atomicAdd time too, this is not what we want
        alpaka::atomicAdd(acc, result, partialSum);
    }
};

// TODO: This takes an enormous time to finish and is probably useless anyway:
// TEMPLATE_LIST_TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]", alpaka::test::TestAccs)
// Running the benchmark on a single default accelerator instead
TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]")
{
    // using Acc = TestType;
    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, std::size_t>;
    using Dim = alpaka::Dim<Acc>;
    using Idx = alpaka::Idx<Acc>;
    using Vec = alpaka::Vec<Dim, Idx>;
    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

    auto const platform = alpaka::Platform<Acc>{};
    auto const dev = alpaka::getDevByIdx(platform, 0);

    // std::thread::hardware_concurrency() returns 0 if the value is not computable. The extents below are derived
    // from the thread count and must not be zero, so fall back to a single thread in that case.
    auto const hardwareThreads = std::thread::hardware_concurrency(); // TODO: GPU?
    Idx const numThreads = hardwareThreads != 0u ? static_cast<Idx>(hardwareThreads) : Idx{1};
    std::cout << "Hardware threads: " << numThreads << std::endl;

    // One benchmark is run for every generated number of points.
#ifdef ALPAKA_CI // Reduced benchmark set for automated test runs.
    unsigned const numPoints = GENERATE(10u, 1'000'000u);
#else
    unsigned const numPoints = GENERATE(10u, 100'000u, 1'000'000u, 10'000'000u, 100'000'000u, 1'000'000'000u);
#endif

    WorkDiv workdiv{alpaka::getValidWorkDiv<Acc>(
        dev,
        Vec::all(numThreads * numThreads),
        Vec::all(numThreads),
        false,
        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};

    alpaka::test::KernelExecutionBenchmarkFixture<Acc> fixture(workdiv);

    RandBenchmarkKernel kernel;

    // Receives the mean of the values the fixture read back from the device.
    float result = 0.0f;

    REQUIRE(fixture(kernel, "Random sequence N=" + std::to_string(numPoints), result, numPoints));
    // TODO: Actually check the result
    std::cout << "\ntemp debug normalized result = " << result / static_cast<float>(numPoints)
              << " should probably converge to 0.5." << std::flush;
}
2 changes: 1 addition & 1 deletion thirdParty/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: MPL-2.0
#

if(BUILD_TESTING)
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
if(alpaka_USE_INTERNAL_CATCH2)
message(STATUS "Catch2: Using INTERNAL version 3.3.2")
# Force Catch2's CMake to pick up the variables we set below
Expand Down

0 comments on commit 37351aa

Please sign in to comment.