From 792224f57867af16ef1c770b73dfe43631bcb004 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Mon, 27 Jan 2020 12:25:51 -0700 Subject: [PATCH 001/124] First pass at Kokkos in the build system --- .gitmodules | 3 +++ CMakeLists.txt | 12 ++++++++++++ tpl/kokkos | 1 + 3 files changed, 16 insertions(+) create mode 160000 tpl/kokkos diff --git a/.gitmodules b/.gitmodules index e6a012fbe..9aec11a63 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "tpl/RAJA"] path = tpl/RAJA url = https://github.com/LLNL/RAJA.git +[submodule "tpl/kokkos"] + path = tpl/kokkos + url = https://github.com/kokkos/kokkos diff --git a/CMakeLists.txt b/CMakeLists.txt index 9eb50a8d6..967c0a592 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,7 @@ cmake_minimum_required(VERSION 3.9) option(ENABLE_RAJA_SEQUENTIAL "Run sequential variants of RAJA kernels. Disable this, and all other variants, to run _only_ raw C loops." On) +option(ENABLE_KOKKOS "Include Kokkos implementations of the kernels in the RAJA Perfsuite" Off) # # Initialize the BLT build system @@ -105,6 +106,17 @@ configure_file(${CMAKE_SOURCE_DIR}/src/rajaperf_config.hpp.in # Make sure RAJA flag propagate set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) +if(ENABLE_KOKKOS) + add_definitions(-DRUN_KOKKOS) + + add_subdirectory(tpl/kokkos) + + get_property(KOKKOS_INCLUDE_DIRS DIRECTORY tpl/kokkos PROPERTY INCLUDE_DIRECTORIES) + include_directories(${KOKKOS_INCLUDE_DIRS}) + list(APPEND RAJA_PERFSUITE_DEPENDS kokkos) +endif() + + # # Each directory in the perf suite has its own CMakeLists.txt file. 
# diff --git a/tpl/kokkos b/tpl/kokkos new file mode 160000 index 000000000..2983b80d9 --- /dev/null +++ b/tpl/kokkos @@ -0,0 +1 @@ +Subproject commit 2983b80d9aeafabb81f2c8c1c5a49b40cc0856cb From 9456d76fd7e5d63f0aa15dc71de1b4346c690f36 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Mon, 27 Jan 2020 12:57:00 -0700 Subject: [PATCH 002/124] First example --- src/basic/DAXPY-Seq.cpp | 32 ++++++++++++++++++++++++++++++++ src/common/Executor.cpp | 7 ++++++- src/common/RAJAPerfSuite.cpp | 23 +++++++++++++++++++++++ src/common/RAJAPerfSuite.hpp | 25 +++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 1 deletion(-) diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index 32a6442f7..0e745e22a 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -17,6 +17,13 @@ namespace rajaperf namespace basic { +struct DaxpyFunctor { + Real_ptr y; + Real_ptr x; + Real_type a; + DaxpyFunctor(Real_ptr m_x, Real_ptr m_y, Real_type m_a) { DAXPY_DATA_SETUP; } + void operator()(Index_type i) const { DAXPY_BODY; } +}; void DAXPY::runSeqVariant(VariantID vid) { @@ -78,6 +85,31 @@ void DAXPY::runSeqVariant(VariantID vid) } #endif +#if defined(RUN_KOKKOS) +#if defined(RUN_RAJA_SEQ) + case Kokkos_Lambda_Seq: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("perfsuite.kokkos.seq.lambda", Kokkos::RangePolicy(ibegin, iend), + [=](Index_type i) { DAXPY_BODY; }); + } + stopTimer(); + + break; + } + case Kokkos_Functor_Seq: { + DaxpyFunctor daxpy_functor_instance(y,x,a); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("perfsuite.kokkos.seq.lambda", Kokkos::RangePolicy(ibegin, iend), + daxpy_functor_instance); + } + stopTimer(); + + break; + } +#endif // RUN_KOKKOS +#endif // RUN_RAJA_SEQ default : { std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; } diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index fd1963470..4c1edef36 100644 
--- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -259,7 +259,9 @@ void Executor::setupSuite() } } // kernel and variant input both look good - +#if defined(RUN_KOKKOS) + Kokkos::initialize(); +#endif } // if kernel input looks good } @@ -422,6 +424,9 @@ void Executor::outputRunData() filename = out_fprefix + "-fom.csv"; writeFOMReport(filename); +#if defined(RUN_KOKKOS) + Kokkos::finalize(); // TODO DZP: should this be here? +#endif } diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 97f670ca6..2c0c7a22e 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -232,6 +232,29 @@ static const std::string VariantNames [] = std::string("RAJA_CUDA"), #endif +#if defined(RUN_KOKKOS) +#if defined(RUN_RAJA_SEQ) + std::string("Kokkos_Lambda_Seq"), + std::string("Kokkos_Functor_Seq"), +#endif + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + std::string("Kokkos_Lambda_OpenMP"), + std::string("Kokkos_Functor_OpenMP"), +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + std::string("Kokkos_Lambda_OMPTarget"), + std::string("Kokkos_Functor_OMPTarget"), +#endif + +#if defined(RAJA_ENABLE_CUDA) + std::string("Kokkos_Lambda_CUDA"), + std::string("Kokkos_Functor_CUDA"), +#endif + +#endif // RUN_KOKKOS + std::string("Unknown Variant") // Keep this at the end and DO NOT remove.... 
}; // END VariantNames diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 38ccaf042..cc2099ff3 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -15,6 +15,10 @@ #include "RAJA/config.hpp" +#if defined(RUN_KOKKOS) +#include "Kokkos_Core.hpp" +#endif + #include namespace rajaperf @@ -203,6 +207,27 @@ enum VariantID { RAJA_CUDA, #endif +#if defined(RUN_KOKKOS) +#if defined(RUN_RAJA_SEQ) + Kokkos_Lambda_Seq, + Kokkos_Functor_Seq, +#endif + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + Kokkos_Lambda_OpenMP, + Kokkos_Functor_OpenMP, +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + Kokkos_Lambda_OMPTarget, + Kokkos_Functor_OMPTarget, +#endif + +#if defined(RAJA_ENABLE_CUDA) + Kokkos_Lambda_CUDA, + Kokkos_Functor_CUDA, +#endif +#endif // RUN_KOKKOS NumVariants // Keep this one last and NEVER comment out (!!) }; From 19afb975af72de8d8aec908827fc6abe745638d4 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 2 Apr 2020 08:51:19 -0600 Subject: [PATCH 003/124] Reworking to split out the Kokkos implementations --- src/basic/CMakeLists.txt | 1 + src/basic/DAXPY-Seq.cpp | 25 ------------------------- src/basic/DAXPY.hpp | 4 ++++ src/common/KernelBase.cpp | 35 +++++++++++++++++++++++++++++++++++ src/common/KernelBase.hpp | 13 +++++++++++++ src/common/RAJAPerfSuite.cpp | 4 ++-- src/common/RAJAPerfSuite.hpp | 4 ++-- 7 files changed, 57 insertions(+), 29 deletions(-) diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index b218e7e94..62e6f18fb 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -15,6 +15,7 @@ blt_add_library( ATOMIC_PI-OMPTarget.cpp DAXPY.cpp DAXPY-Seq.cpp + DAXPY-KokkosSeq.cpp DAXPY-Cuda.cpp DAXPY-OMP.cpp DAXPY-OMPTarget.cpp diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index a35b982c0..a69385974 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -85,31 +85,6 @@ void DAXPY::runSeqVariant(VariantID vid) } #endif -#if 
defined(RUN_KOKKOS) -#if defined(RUN_RAJA_SEQ) - case Kokkos_Lambda_Seq: { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("perfsuite.kokkos.seq.lambda", Kokkos::RangePolicy(ibegin, iend), - [=](Index_type i) { DAXPY_BODY; }); - } - stopTimer(); - - break; - } - case Kokkos_Functor_Seq: { - DaxpyFunctor daxpy_functor_instance(y,x,a); - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("perfsuite.kokkos.seq.lambda", Kokkos::RangePolicy(ibegin, iend), - daxpy_functor_instance); - } - stopTimer(); - - break; - } -#endif // RUN_KOKKOS -#endif // RUN_RAJA_SEQ default : { std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; } diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index c420759c4..d3a43135c 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -52,6 +52,10 @@ class DAXPY : public KernelBase void runCudaVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void runKokkosOpenMPTargetVariant(VariantID vid); private: Real_ptr m_x; Real_ptr m_y; diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 938abb501..c7769e81a 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -122,6 +122,41 @@ void KernelBase::runKernel(VariantID vid) } #endif +#if defined(RUN_KOKKOS) + case Kokkos_Lambda_Seq : + case Kokkos_Functor_Seq : + { + runKokkosSeqVariant(vid); + break; + } + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + case Kokkos_Lambda_OpenMP : + case Kokkos_Functor_OpenMP : + { + runKokkosOpenMPVariant(vid); + break; + } +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Kokkos_Lambda_OpenMPTarget : + case Kokkos_Functor_OpenMPTarget : + { + runKokkosOpenMPTargetVariant(vid); + break; + } +#endif + +#if defined(RAJA_ENABLE_CUDA) + case 
Kokkos_Lambda_CUDA : + case Kokkos_Functor_CUDA : + { + runKokkosCudaVariant(vid); + break; + } +#endif +#endif // RUN_KOKKOS default : { std::cout << "\n " << getName() << " : Unknown variant id = " << vid << std::endl; diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 103072f94..63a882066 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -108,6 +108,19 @@ class KernelBase virtual void runOpenMPTargetVariant(VariantID vid) = 0; #endif +#if defined(RUN_KOKKOS) + virtual void runKokkosSeqVariant(VariantID vid) = 0; +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + virtual void runKokkosOpenMPVariant(VariantID vid) = 0; +#endif +#if defined(RAJA_ENABLE_CUDA) + virtual void runKokkosCudaVariant(VariantID vid) = 0; +#endif +#if defined(RAJA_ENABLE_TARGET_OPENMP) + virtual void runKokkosOpenMPTargetVariant(VariantID vid) = 0; +#endif +#endif // RUN_KOKKOS + protected: int num_exec[NumVariants]; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index e46590224..b302c65b9 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -244,8 +244,8 @@ static const std::string VariantNames [] = #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - std::string("Kokkos_Lambda_OMPTarget"), - std::string("Kokkos_Functor_OMPTarget"), + std::string("Kokkos_Lambda_OpenMPTarget"), + std::string("Kokkos_Functor_OpenMPTarget"), #endif #if defined(RAJA_ENABLE_CUDA) diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index bfbc44435..283ae3138 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -219,8 +219,8 @@ enum VariantID { #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - Kokkos_Lambda_OMPTarget, - Kokkos_Functor_OMPTarget, + Kokkos_Lambda_OpenMPTarget, + Kokkos_Functor_OpenMPTarget, #endif #if defined(RAJA_ENABLE_CUDA) From d658a4d1e83eb35f2068d4873788a600ee65e25c Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 14 May 2020 07:53:41 
-0600 Subject: [PATCH 004/124] Added in the Kokkos implementation file (whoops) --- src/basic/DAXPY-KokkosSeq.cpp | 76 +++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 src/basic/DAXPY-KokkosSeq.cpp diff --git a/src/basic/DAXPY-KokkosSeq.cpp b/src/basic/DAXPY-KokkosSeq.cpp new file mode 100644 index 000000000..7f2cd92b3 --- /dev/null +++ b/src/basic/DAXPY-KokkosSeq.cpp @@ -0,0 +1,76 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +struct DaxpyFunctor { + Real_ptr y; + Real_ptr x; + Real_type a; + DaxpyFunctor(Real_ptr m_x, Real_ptr m_y, Real_type m_a) { DAXPY_DATA_SETUP; } + void operator()(Index_type i) const { DAXPY_BODY; } +}; + +void DAXPY::runKokkosSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + DAXPY_DATA_SETUP; + + auto daxpy_lam = [=](Index_type i) { + DAXPY_BODY; + }; + + switch ( vid ) { + +#if defined(RUN_KOKKOS) +#if defined(RUN_RAJA_SEQ) + case Kokkos_Lambda_Seq: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("perfsuite.kokkos.seq.lambda", Kokkos::RangePolicy(ibegin, iend), + [=](Index_type i) { DAXPY_BODY; }); + } + stopTimer(); + + break; + } + case Kokkos_Functor_Seq: { + DaxpyFunctor daxpy_functor_instance(y,x,a); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("perfsuite.kokkos.seq.lambda", Kokkos::RangePolicy(ibegin, iend), + daxpy_functor_instance); + } + stopTimer(); + + 
break; + } +#endif // RUN_KOKKOS +#endif // RUN_RAJA_SEQ + default : { + std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf From f7631d1f0854346341f6f08c0b6a2444a61a708f Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 14 May 2020 07:00:04 -0700 Subject: [PATCH 005/124] Moved Kokkos files into their own directory --- src/basic-kokkos/CMakeLists.txt | 15 +++++++++++++++ src/{basic => basic-kokkos}/DAXPY-KokkosSeq.cpp | 0 src/basic/CMakeLists.txt | 1 - 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 src/basic-kokkos/CMakeLists.txt rename src/{basic => basic-kokkos}/DAXPY-KokkosSeq.cpp (100%) diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt new file mode 100644 index 000000000..ec7916f76 --- /dev/null +++ b/src/basic-kokkos/CMakeLists.txt @@ -0,0 +1,15 @@ +############################################################################### +# Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +blt_add_library( + NAME basic-kokkos + SOURCES + DAXPY-KokkosSeq.cpp + DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} + ) + diff --git a/src/basic/DAXPY-KokkosSeq.cpp b/src/basic-kokkos/DAXPY-KokkosSeq.cpp similarity index 100% rename from src/basic/DAXPY-KokkosSeq.cpp rename to src/basic-kokkos/DAXPY-KokkosSeq.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 62e6f18fb..b218e7e94 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -15,7 +15,6 @@ blt_add_library( ATOMIC_PI-OMPTarget.cpp DAXPY.cpp DAXPY-Seq.cpp - DAXPY-KokkosSeq.cpp DAXPY-Cuda.cpp DAXPY-OMP.cpp DAXPY-OMPTarget.cpp From 0f748314cf01d622bc4dcd3a04cdafc72cfe302b Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 14 May 2020 07:10:50 -0700 Subject: [PATCH 006/124] Working state, can now just stamp out kernels --- src/CMakeLists.txt | 19 ++++--- src/basic-kokkos/CMakeLists.txt | 2 + src/basic/CMakeLists.txt | 91 +++++++++++++++++---------------- src/common/RAJAPerfSuite.cpp | 8 ++- 4 files changed, 65 insertions(+), 55 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0bbae5b88..75c5e646c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,19 +9,22 @@ include_directories(.) 
add_subdirectory(common) -add_subdirectory(apps) +#add_subdirectory(apps) add_subdirectory(basic) -add_subdirectory(lcals) -add_subdirectory(polybench) -add_subdirectory(stream) +add_subdirectory(basic-kokkos) +#add_subdirectory(lcals) +#add_subdirectory(polybench) +#add_subdirectory(stream) set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS common - apps + #apps basic - lcals - polybench - stream) + basic-kokkos + #lcals + #polybench + #stream + ) list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) if(ENABLE_TARGET_OPENMP) diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index ec7916f76..a0a7d990d 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -6,6 +6,8 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### +include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../basic) + blt_add_library( NAME basic-kokkos SOURCES diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index b218e7e94..25f45e953 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -8,55 +8,56 @@ blt_add_library( NAME basic - SOURCES ATOMIC_PI.cpp - ATOMIC_PI-Seq.cpp - ATOMIC_PI-Cuda.cpp - ATOMIC_PI-OMP.cpp - ATOMIC_PI-OMPTarget.cpp + SOURCES + # ATOMIC_PI.cpp + # ATOMIC_PI-Seq.cpp + # ATOMIC_PI-Cuda.cpp + # ATOMIC_PI-OMP.cpp + # ATOMIC_PI-OMPTarget.cpp DAXPY.cpp DAXPY-Seq.cpp DAXPY-Cuda.cpp DAXPY-OMP.cpp DAXPY-OMPTarget.cpp - IF_QUAD.cpp - IF_QUAD-Seq.cpp - IF_QUAD-Cuda.cpp - IF_QUAD-OMP.cpp - IF_QUAD-OMPTarget.cpp - INIT3.cpp - INIT3-Seq.cpp - INIT3-Cuda.cpp - INIT3-OMP.cpp - INIT3-OMPTarget.cpp - INIT_VIEW1D.cpp - INIT_VIEW1D-Seq.cpp - INIT_VIEW1D-Cuda.cpp - INIT_VIEW1D-OMP.cpp - INIT_VIEW1D-OMPTarget.cpp - INIT_VIEW1D_OFFSET.cpp - INIT_VIEW1D_OFFSET-Seq.cpp - INIT_VIEW1D_OFFSET-Cuda.cpp - INIT_VIEW1D_OFFSET-OMP.cpp - INIT_VIEW1D_OFFSET-OMPTarget.cpp - MULADDSUB.cpp - MULADDSUB-Seq.cpp - MULADDSUB-Cuda.cpp - 
MULADDSUB-OMP.cpp - MULADDSUB-OMPTarget.cpp - NESTED_INIT.cpp - NESTED_INIT-Seq.cpp - NESTED_INIT-Cuda.cpp - NESTED_INIT-OMP.cpp - NESTED_INIT-OMPTarget.cpp - REDUCE3_INT.cpp - REDUCE3_INT-Seq.cpp - REDUCE3_INT-Cuda.cpp - REDUCE3_INT-OMP.cpp - REDUCE3_INT-OMPTarget.cpp - TRAP_INT.cpp - TRAP_INT-Seq.cpp - TRAP_INT-Cuda.cpp - TRAP_INT-OMPTarget.cpp - TRAP_INT-OMP.cpp + # IF_QUAD.cpp + # IF_QUAD-Seq.cpp + # IF_QUAD-Cuda.cpp + # IF_QUAD-OMP.cpp + # IF_QUAD-OMPTarget.cpp + # INIT3.cpp + # INIT3-Seq.cpp + # INIT3-Cuda.cpp + # INIT3-OMP.cpp + # INIT3-OMPTarget.cpp + # INIT_VIEW1D.cpp + # INIT_VIEW1D-Seq.cpp + # INIT_VIEW1D-Cuda.cpp + # INIT_VIEW1D-OMP.cpp + # INIT_VIEW1D-OMPTarget.cpp + # INIT_VIEW1D_OFFSET.cpp + # INIT_VIEW1D_OFFSET-Seq.cpp + # INIT_VIEW1D_OFFSET-Cuda.cpp + # INIT_VIEW1D_OFFSET-OMP.cpp + # INIT_VIEW1D_OFFSET-OMPTarget.cpp + # MULADDSUB.cpp + # MULADDSUB-Seq.cpp + # MULADDSUB-Cuda.cpp + # MULADDSUB-OMP.cpp + # MULADDSUB-OMPTarget.cpp + # NESTED_INIT.cpp + # NESTED_INIT-Seq.cpp + # NESTED_INIT-Cuda.cpp + # NESTED_INIT-OMP.cpp + # NESTED_INIT-OMPTarget.cpp + # REDUCE3_INT.cpp + # REDUCE3_INT-Seq.cpp + # REDUCE3_INT-Cuda.cpp + # REDUCE3_INT-OMP.cpp + # REDUCE3_INT-OMPTarget.cpp + # TRAP_INT.cpp + # TRAP_INT-Seq.cpp + # TRAP_INT-Cuda.cpp + # TRAP_INT-OMPTarget.cpp + # TRAP_INT-OMP.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index b302c65b9..b805e5d7f 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -330,14 +330,17 @@ KernelBase* getKernelObject(KernelID kid, // // Basic kernels... 
// + /** case Basic_ATOMIC_PI : { kernel = new basic::ATOMIC_PI(run_params); break; } + */ case Basic_DAXPY : { kernel = new basic::DAXPY(run_params); break; } + /** case Basic_IF_QUAD : { kernel = new basic::IF_QUAD(run_params); break; @@ -370,7 +373,8 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::TRAP_INT(run_params); break; } - + */ +/** DZP: big comment block for unimplemented // // Lcals kernels... // @@ -534,7 +538,7 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::VOL3D(run_params); break; } - +*/ default: { std::cout << "\n Unknown Kernel ID = " << kid << std::endl; } From c70a7ca6b5bee2531a530337d4656e23fda433a3 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 14 May 2020 07:22:27 -0700 Subject: [PATCH 007/124] Fixed up the sequential Kokkos version --- src/basic-kokkos/DAXPY-KokkosSeq.cpp | 4 ++-- src/basic/DAXPY.hpp | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/basic-kokkos/DAXPY-KokkosSeq.cpp b/src/basic-kokkos/DAXPY-KokkosSeq.cpp index 7f2cd92b3..9216e7950 100644 --- a/src/basic-kokkos/DAXPY-KokkosSeq.cpp +++ b/src/basic-kokkos/DAXPY-KokkosSeq.cpp @@ -18,10 +18,10 @@ namespace basic { struct DaxpyFunctor { - Real_ptr y; Real_ptr x; + Real_ptr y; Real_type a; - DaxpyFunctor(Real_ptr m_x, Real_ptr m_y, Real_type m_a) { DAXPY_DATA_SETUP; } + DaxpyFunctor(Real_ptr m_x, Real_ptr m_y, Real_type m_a) : DAXPY_FUNCTOR_CONSTRUCT { } void operator()(Index_type i) const { DAXPY_BODY; } }; diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index d3a43135c..61f56f133 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -22,6 +22,11 @@ Real_ptr y = m_y; \ Real_type a = m_a; +#define DAXPY_FUNCTOR_CONSTRUCT \ + x(m_x),\ + y(m_y), \ + a(m_a) + #define DAXPY_BODY \ y[i] += a * x[i] ; From b52d974748a35d632e390c5356077046bf5578d5 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 15 Oct 2020 07:28:49 -0700 Subject: [PATCH 008/124] WIP: CUDA --- CMakeLists.txt | 29 ++++++- 
src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp | 85 ++++++++++++++++++++ src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp | 85 ++++++++++++++++++++ src/basic-kokkos/CMakeLists.txt | 3 + src/basic-kokkos/DAXPY-KokkosCuda.cpp | 99 ++++++++++++++++++++++++ src/basic-kokkos/DAXPY-KokkosOMP.cpp | 76 ++++++++++++++++++ src/basic-kokkos/DAXPY-KokkosOpenMP.cpp | 76 ++++++++++++++++++ src/basic/ATOMIC_PI.hpp | 9 +++ src/basic/CMakeLists.txt | 10 +-- 9 files changed, 464 insertions(+), 8 deletions(-) create mode 100644 src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp create mode 100644 src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp create mode 100644 src/basic-kokkos/DAXPY-KokkosCuda.cpp create mode 100644 src/basic-kokkos/DAXPY-KokkosOMP.cpp create mode 100644 src/basic-kokkos/DAXPY-KokkosOpenMP.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ee6e36e77..a1bdd5df5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,6 @@ ############################################################################### project(RAJAPerfSuite CXX) - cmake_minimum_required(VERSION 3.9) option(ENABLE_RAJA_SEQUENTIAL "Run sequential variants of RAJA kernels. 
Disable @@ -25,7 +24,8 @@ endif() set(ENABLE_TESTS Off CACHE BOOL "Enable BLT and RAJA tests") set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") set(ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises") - +set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) +set(CMAKE_CUDA_HOST_LINK_LAUNCHER ${CMAKE_CUDA_COMPILER}) include(blt/SetupBLT.cmake) set(CMAKE_CXX_STANDARD 11) @@ -109,10 +109,33 @@ set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) if(ENABLE_KOKKOS) add_definitions(-DRUN_KOKKOS) - add_subdirectory(tpl/kokkos) + if(ENABLE_CUDA) + set(Kokkos_ENABLE_CUDA ON) + set(Kokkos_ARCH_VOLTA70 ON) #TODO: better + enable_language(CUDA) + endif() + if(ENABLE_OPENMP) + set(Kokkos_ENABLE_OPENMP ON) + endif() + add_subdirectory(tpl/kokkos) + if(ENABLE_CUDA) + #get_target_property(kokkos_core_files kokkoscore SOURCES) + #get_target_property(kokkos_container_files kokkoscontainers SOURCES) + ##message(STATUS "KOKKOS FILES: ${kokkos_core_files}") + #foreach(kokkos_core_file IN LISTS kokkos_core_files) + # set_source_files_properties(${kokkos_core_file} PROPERTIES COMPILE_LANGUAGE CUDA) + # get_source_file_property(local ${kokkos_core_file} LANGUAGE) + # set(remote "") + # #get_source_file_property(remote ${kokkos_core_file} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/tpl/kokkos/core/src LANGUAGE) + # message(STATUS "Kokkos file: ${kokkos_core_file} ${local} ${remote}") + # + #endforeach() + #set_target_properties(kokkoscore kokkoscontainers PROPERTIES LANGUAGE CUDA) + endif() get_property(KOKKOS_INCLUDE_DIRS DIRECTORY tpl/kokkos PROPERTY INCLUDE_DIRECTORIES) include_directories(${KOKKOS_INCLUDE_DIRS}) + list(APPEND RAJA_PERFSUITE_DEPENDS kokkos) endif() diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp new file mode 100644 index 000000000..d6cd9a1e6 --- /dev/null +++ b/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp @@ -0,0 +1,85 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, 
Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC_PI.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ +struct AtomicPIFunctor { + Real_type dx; + Real_ptr pi; + + AtomicPIFunctor(Real_type m_dx, Real_ptr m_pi) : ATOMIC_PI_FUNCTOR_CONSTRUCT {} +}; + + +void ATOMIC_PI::runKokkosOpenMPVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + ATOMIC_PI_DATA_SETUP; + +#if defined(RUN_KOKKOS) && defined(RUN_OPENMP) + switch ( vid ) { + + case Kokkos_Functor_OpenMP : { + + startTimer(); + //for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // *pi = m_pi_init; + // RAJA::forall( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // double x = (double(i) + 0.5) * dx; + // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + // }); + // *pi *= 4.0; + + //} + stopTimer(); + + break; + } + case Kokkos_Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + *pi = m_pi_init; + + Kokkos::parallel_for("name",Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(Index_type i){ + double x = ((double(i) + 0.5) * dx); + Kokkos::atomic_add(pi, dx / (1.0 + x * x)); + }); + *pi *= 4.0; + } + stopTimer(); + + break; + } + + + default : { + std::cout << "\n ATOMIC_PI : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp new file mode 100644 index 000000000..aa10e4c9a --- /dev/null +++ b/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp @@ -0,0 +1,85 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC_PI.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ +struct AtomicPIFunctor { + Real_type dx; + Real_ptr pi; + + AtomicPIFunctor(Real_type m_dx, Real_ptr m_pi) : ATOMIC_PI_FUNCTOR_CONSTRUCT {} +}; + + +void ATOMIC_PI::runKokkosSeqVariant(VariantID vid) +{ + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + ATOMIC_PI_DATA_SETUP; + +#if defined(RUN_KOKKOS) && defined(RUN_OPENMP) + switch ( vid ) { + + case Kokkos_Functor_OpenMP : { + + startTimer(); + //for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // *pi = m_pi_init; + // RAJA::forall( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // double x = (double(i) + 0.5) * dx; + // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + // }); + // *pi *= 4.0; + + //} + stopTimer(); + + break; + } + case Kokkos_Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + *pi = m_pi_init; + + Kokkos::parallel_for("name",Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(Index_type i){ + double x = ((double(i) + 0.5) * dx); + Kokkos::atomic_add(pi, dx / (1.0 + x * x)); + }); + *pi *= 4.0; + } + stopTimer(); + + break; + } + + + default : { + std::cout << "\n ATOMIC_PI : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index a0a7d990d..4241d583f 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt 
@@ -12,6 +12,9 @@ blt_add_library( NAME basic-kokkos SOURCES DAXPY-KokkosSeq.cpp + DAXPY-KokkosOMP.cpp + ATOMIC_PI-KokkosOMP.cpp + ATOMIC_PI-KokkosSeq.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/basic-kokkos/DAXPY-KokkosCuda.cpp b/src/basic-kokkos/DAXPY-KokkosCuda.cpp new file mode 100644 index 000000000..026e62a70 --- /dev/null +++ b/src/basic-kokkos/DAXPY-KokkosCuda.cpp @@ -0,0 +1,99 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define DAXPY_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(x, m_x, iend); \ + allocAndInitCudaDeviceData(y, m_y, iend); + +#define DAXPY_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_y, y, iend); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); + +__global__ void daxpy(Real_ptr y, Real_ptr x, + Real_type a, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + DAXPY_BODY; + } +} + + +void DAXPY::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + DAXPY_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + DAXPY_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + daxpy<<>>( y, x, a, + iend ); + + } + stopTimer(); + + DAXPY_DATA_TEARDOWN_CUDA; + + } else 
if ( vid == RAJA_CUDA ) { + + DAXPY_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DAXPY_BODY; + }); + + } + stopTimer(); + + DAXPY_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n DAXPY : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/DAXPY-KokkosOMP.cpp b/src/basic-kokkos/DAXPY-KokkosOMP.cpp new file mode 100644 index 000000000..2b59c8012 --- /dev/null +++ b/src/basic-kokkos/DAXPY-KokkosOMP.cpp @@ -0,0 +1,76 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +struct DaxpyFunctor { + Real_ptr x; + Real_ptr y; + Real_type a; + DaxpyFunctor(Real_ptr m_x, Real_ptr m_y, Real_type m_a) : DAXPY_FUNCTOR_CONSTRUCT { } + void operator()(Index_type i) const { DAXPY_BODY; } +}; + +void DAXPY::runKokkosOpenMPVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + DAXPY_DATA_SETUP; + + auto daxpy_lam = [=](Index_type i) { + DAXPY_BODY; + }; + + switch ( vid ) { + +#if defined(RUN_KOKKOS) +#if defined(RUN_OPENMP) + case Kokkos_Lambda_OpenMP: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("perfsuite.kokkos.openmp.lambda", Kokkos::RangePolicy(ibegin, iend), + [=](Index_type i) { DAXPY_BODY; }); + } + stopTimer(); + + break; + } + 
case Kokkos_Functor_OpenMP: { + DaxpyFunctor daxpy_functor_instance(y,x,a); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("perfsuite.kokkos.openmp.lambda", Kokkos::RangePolicy(ibegin, iend), + daxpy_functor_instance); + } + stopTimer(); + + break; + } +#endif // RUN_KOKKOS +#endif // RUN_RAJA_SEQ + default : { + std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/DAXPY-KokkosOpenMP.cpp b/src/basic-kokkos/DAXPY-KokkosOpenMP.cpp new file mode 100644 index 000000000..2b59c8012 --- /dev/null +++ b/src/basic-kokkos/DAXPY-KokkosOpenMP.cpp @@ -0,0 +1,76 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +struct DaxpyFunctor { + Real_ptr x; + Real_ptr y; + Real_type a; + DaxpyFunctor(Real_ptr m_x, Real_ptr m_y, Real_type m_a) : DAXPY_FUNCTOR_CONSTRUCT { } + void operator()(Index_type i) const { DAXPY_BODY; } +}; + +void DAXPY::runKokkosOpenMPVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + DAXPY_DATA_SETUP; + + auto daxpy_lam = [=](Index_type i) { + DAXPY_BODY; + }; + + switch ( vid ) { + +#if defined(RUN_KOKKOS) +#if defined(RUN_OPENMP) + case Kokkos_Lambda_OpenMP: { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("perfsuite.kokkos.openmp.lambda", Kokkos::RangePolicy(ibegin, iend), + [=](Index_type i) { DAXPY_BODY; }); + } + 
stopTimer(); + + break; + } + case Kokkos_Functor_OpenMP: { + DaxpyFunctor daxpy_functor_instance(y,x,a); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("perfsuite.kokkos.openmp.lambda", Kokkos::RangePolicy(ibegin, iend), + daxpy_functor_instance); + } + stopTimer(); + + break; + } +#endif // RUN_KOKKOS +#endif // RUN_RAJA_SEQ + default : { + std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/ATOMIC_PI.hpp b/src/basic/ATOMIC_PI.hpp index 7dafbcae6..9680b2a6f 100644 --- a/src/basic/ATOMIC_PI.hpp +++ b/src/basic/ATOMIC_PI.hpp @@ -27,6 +27,9 @@ Real_type dx = m_dx; \ Real_ptr pi = m_pi; +#define ATOMIC_PI_FUNCTOR_CONSTRUCT \ + dx(m_dx), \ + pi(m_pi) #include "common/KernelBase.hpp" @@ -53,6 +56,12 @@ class ATOMIC_PI : public KernelBase void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); +#ifdef RUN_KOKKOS + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void runKokkosOpenMPTargetVariant(VariantID vid); +#endif private: Real_type m_dx; diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 25f45e953..45c7daa2c 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -9,11 +9,11 @@ blt_add_library( NAME basic SOURCES - # ATOMIC_PI.cpp - # ATOMIC_PI-Seq.cpp - # ATOMIC_PI-Cuda.cpp - # ATOMIC_PI-OMP.cpp - # ATOMIC_PI-OMPTarget.cpp + ATOMIC_PI.cpp + ATOMIC_PI-Seq.cpp + ATOMIC_PI-Cuda.cpp + ATOMIC_PI-OMP.cpp + ATOMIC_PI-OMPTarget.cpp DAXPY.cpp DAXPY-Seq.cpp DAXPY-Cuda.cpp From 37c28972049579d90bb6dafe8d8d35388533ed6e Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 15 Oct 2020 07:35:46 -0700 Subject: [PATCH 009/124] Pulled latest Kokkos --- tpl/kokkos | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/kokkos 
b/tpl/kokkos index 2983b80d9..c4f78ff3a 160000 --- a/tpl/kokkos +++ b/tpl/kokkos @@ -1 +1 @@ -Subproject commit 2983b80d9aeafabb81f2c8c1c5a49b40cc0856cb +Subproject commit c4f78ff3ad12bf6d74b7f325617c27fde73d2ab8 From ae5007abb16d36d2361f50cce72f172effd6e759 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 15 Oct 2020 08:54:05 -0700 Subject: [PATCH 010/124] Functional DAXPY --- src/basic-kokkos/CMakeLists.txt | 1 + src/basic-kokkos/DAXPY-KokkosCuda.cpp | 42 +++++++++++++-------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index 4241d583f..06ced59d1 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -13,6 +13,7 @@ blt_add_library( SOURCES DAXPY-KokkosSeq.cpp DAXPY-KokkosOMP.cpp + DAXPY-KokkosCuda.cpp ATOMIC_PI-KokkosOMP.cpp ATOMIC_PI-KokkosSeq.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} diff --git a/src/basic-kokkos/DAXPY-KokkosCuda.cpp b/src/basic-kokkos/DAXPY-KokkosCuda.cpp index 026e62a70..2fc6fab35 100644 --- a/src/basic-kokkos/DAXPY-KokkosCuda.cpp +++ b/src/basic-kokkos/DAXPY-KokkosCuda.cpp @@ -21,6 +21,14 @@ namespace rajaperf namespace basic { +struct DaxpyCudaFunctor { + Real_ptr x; + Real_ptr y; + Real_type a; + DaxpyCudaFunctor(Real_ptr m_x, Real_ptr m_y, Real_type m_a) : DAXPY_FUNCTOR_CONSTRUCT { } + KOKKOS_FUNCTION void operator()(Index_type i) const { DAXPY_BODY; } +}; + // // Define thread block size for CUDA execution // @@ -36,50 +44,40 @@ namespace basic deallocCudaDeviceData(x); \ deallocCudaDeviceData(y); -__global__ void daxpy(Real_ptr y, Real_ptr x, - Real_type a, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - DAXPY_BODY; - } -} - -void DAXPY::runCudaVariant(VariantID vid) +void DAXPY::runKokkosCudaVariant(VariantID vid) { +#if defined(RUN_KOKKOS) +#if defined(RAJA_ENABLE_CUDA) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const 
Index_type iend = getRunSize(); DAXPY_DATA_SETUP; - - if ( vid == Base_CUDA ) { - + if ( vid == Kokkos_Functor_CUDA) { DAXPY_DATA_SETUP_CUDA; + DaxpyCudaFunctor daxpy_functor_instance(y,x,a); startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - daxpy<<>>( y, x, a, - iend ); + Kokkos::parallel_for("perfsuite.kokkos.seq.functor", Kokkos::RangePolicy(ibegin, iend), + daxpy_functor_instance); } stopTimer(); DAXPY_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { + } else if ( vid == Kokkos_Lambda_CUDA ) { DAXPY_DATA_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + Kokkos::parallel_for("perfsuite.kokkos.cuda.lambda", + Kokkos::RangePolicy(ibegin, iend), [=] __device__ (Index_type i) { DAXPY_BODY; }); @@ -91,6 +89,8 @@ void DAXPY::runCudaVariant(VariantID vid) } else { std::cout << "\n DAXPY : Unknown Cuda variant id = " << vid << std::endl; } +#endif // RAJA_ENABLE_CUDA +#endif // RUN_KOKKOS } } // end namespace basic From 7f707f9b02ee766f1a3de63e5cdafe455f2aef50 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 15 Oct 2020 09:16:33 -0700 Subject: [PATCH 011/124] Corrected name --- src/basic-kokkos/DAXPY-KokkosCuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic-kokkos/DAXPY-KokkosCuda.cpp b/src/basic-kokkos/DAXPY-KokkosCuda.cpp index 2fc6fab35..414d556f5 100644 --- a/src/basic-kokkos/DAXPY-KokkosCuda.cpp +++ b/src/basic-kokkos/DAXPY-KokkosCuda.cpp @@ -61,7 +61,7 @@ void DAXPY::runKokkosCudaVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("perfsuite.kokkos.seq.functor", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("perfsuite.kokkos.cuda.functor", 
Kokkos::RangePolicy(ibegin, iend), daxpy_functor_instance); } From 88f98f9767c7bf17af12268db00d0030303ba857 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 15 Oct 2020 11:08:35 -0700 Subject: [PATCH 012/124] Update C++ standard --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a1bdd5df5..865f10beb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,8 +28,8 @@ set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) set(CMAKE_CUDA_HOST_LINK_LAUNCHER ${CMAKE_CUDA_COMPILER}) include(blt/SetupBLT.cmake) -set(CMAKE_CXX_STANDARD 11) -set(BLT_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) +set(BLT_CXX_STANDARD 14) # # Define RAJA settings... From 8b12325c207e8b5a495847c40a85e08fce803864 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 15 Oct 2020 12:04:47 -0700 Subject: [PATCH 013/124] Changes for a minimal build --- src/common/Executor.cpp | 2 +- src/common/RAJAPerfSuite.cpp | 4 +- src/common/RAJAPerfSuite.hpp | 90 ++++++++++++++++++------------------ src/common/RunParams.cpp | 4 +- 4 files changed, 50 insertions(+), 50 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 4419a078e..3e866d8cd 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -234,7 +234,7 @@ void Executor::setupSuite() kid != run_kern.end(); ++kid) { /// RDH DISABLE COUPLE KERNEL until we find a reasonable way to do /// complex numbers in GPU code - if ( *kid != Apps_COUPLE ) { + if ( /** *kid != Apps_COUPLE */ true ) { kernels.push_back( getKernelObject(*kid, run_params) ); } } diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index b805e5d7f..1c33de3e6 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -330,12 +330,12 @@ KernelBase* getKernelObject(KernelID kid, // // Basic kernels... 
// - /** + case Basic_ATOMIC_PI : { kernel = new basic::ATOMIC_PI(run_params); break; } - */ + case Basic_DAXPY : { kernel = new basic::DAXPY(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 283ae3138..3d4c04aa7 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -104,67 +104,67 @@ enum KernelID { // Basic_ATOMIC_PI = 0, Basic_DAXPY, - Basic_IF_QUAD, - Basic_INIT3, - Basic_INIT_VIEW1D, - Basic_INIT_VIEW1D_OFFSET, - Basic_MULADDSUB, - Basic_NESTED_INIT, - Basic_REDUCE3_INT, - Basic_TRAP_INT, + //Basic_IF_QUAD, + //Basic_INIT3, + //Basic_INIT_VIEW1D, + //Basic_INIT_VIEW1D_OFFSET, + //Basic_MULADDSUB, + //Basic_NESTED_INIT, + //Basic_REDUCE3_INT, + //Basic_TRAP_INT, // // Lcals kernels... // - Lcals_DIFF_PREDICT, - Lcals_EOS, - Lcals_FIRST_DIFF, - Lcals_FIRST_MIN, - Lcals_FIRST_SUM, - Lcals_GEN_LIN_RECUR, - Lcals_HYDRO_1D, - Lcals_HYDRO_2D, - Lcals_INT_PREDICT, - Lcals_PLANCKIAN, - Lcals_TRIDIAG_ELIM, + //Lcals_DIFF_PREDICT, + //Lcals_EOS, + //Lcals_FIRST_DIFF, + //Lcals_FIRST_MIN, + //Lcals_FIRST_SUM, + //Lcals_GEN_LIN_RECUR, + //Lcals_HYDRO_1D, + //Lcals_HYDRO_2D, + //Lcals_INT_PREDICT, + //Lcals_PLANCKIAN, + //Lcals_TRIDIAG_ELIM, // // Polybench kernels... // - Polybench_2MM, - Polybench_3MM, - Polybench_ADI, - Polybench_ATAX, - Polybench_FDTD_2D, - Polybench_FLOYD_WARSHALL, - Polybench_GEMM, - Polybench_GEMVER, - Polybench_GESUMMV, - Polybench_HEAT_3D, - Polybench_JACOBI_1D, - Polybench_JACOBI_2D, - Polybench_MVT, + //Polybench_2MM, + //Polybench_3MM, + //Polybench_ADI, + //Polybench_ATAX, + //Polybench_FDTD_2D, + //Polybench_FLOYD_WARSHALL, + //Polybench_GEMM, + //Polybench_GEMVER, + //Polybench_GESUMMV, + //Polybench_HEAT_3D, + //Polybench_JACOBI_1D, + //Polybench_JACOBI_2D, + //Polybench_MVT, // // Stream kernels... 
// - Stream_ADD, - Stream_COPY, - Stream_DOT, - Stream_MUL, - Stream_TRIAD, + //Stream_ADD, + //Stream_COPY, + //Stream_DOT, + //Stream_MUL, + //Stream_TRIAD, // // Apps kernels... // - Apps_COUPLE, - Apps_DEL_DOT_VEC_2D, - Apps_ENERGY, - Apps_FIR, - Apps_LTIMES, - Apps_LTIMES_NOVIEW, - Apps_PRESSURE, - Apps_VOL3D, + //Apps_COUPLE, + //Apps_DEL_DOT_VEC_2D, + //Apps_ENERGY, + //Apps_FIR, + //Apps_LTIMES, + //Apps_LTIMES_NOVIEW, + //Apps_PRESSURE, + //Apps_VOL3D, NumKernels // Keep this one last and NEVER comment out (!!) diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 156c19b87..4ad3b08b3 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -385,7 +385,7 @@ void RunParams::printKernelNames(std::ostream& str) const str << "\n------------------\n"; for (int ik = 0; ik < NumKernels; ++ik) { /// RDH DISABLE COUPLE KERNEL - if (static_cast(ik) != Apps_COUPLE) { + if ( /** static_cast(ik) != Apps_COUPLE*/ true) { str << getKernelName(static_cast(ik)) << std::endl; } } @@ -399,7 +399,7 @@ void RunParams::printFullKernelNames(std::ostream& str) const str << "\n-----------------------------------------\n"; for (int ik = 0; ik < NumKernels; ++ik) { /// RDH DISABLE COUPLE KERNEL - if (static_cast(ik) != Apps_COUPLE) { + if ( /** static_cast(ik) != Apps_COUPLE */ true) { str << getFullKernelName(static_cast(ik)) << std::endl; } } From 8b828f5214ccedd9e3009c1acb56f70452fad29a Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 16 Nov 2020 13:23:49 -0800 Subject: [PATCH 014/124] Kokkos IF_QUAD basic kernels --- CMakeLists.txt | 4 +++- src/basic-kokkos/CMakeLists.txt | 3 +++ src/basic/CMakeLists.txt | 10 +++++----- src/basic/IF_QUAD.hpp | 5 +++++ src/common/RAJAPerfSuite.cpp | 5 +++-- src/common/RAJAPerfSuite.hpp | 2 +- 6 files changed, 20 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 865f10beb..f104a6524 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,7 +87,7 @@ set(RAJAPERF_BUILD_SYSTYPE 
$ENV{SYS_TYPE}) set(RAJAPERF_BUILD_HOST $ENV{HOSTNAME}) if (ENABLE_CUDA) - set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict -arch ${CUDA_ARCH} --expt-extended-lambda --expt-relaxed-constexpr") set(RAJAPERF_COMPILER "${CUDA_NVCC_EXECUTABLE}") @@ -106,9 +106,11 @@ configure_file(${CMAKE_SOURCE_DIR}/src/rajaperf_config.hpp.in # Make sure RAJA flag propagate set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) +# ENABLE KOKKOS IS A RAJA PERFSUITE OPTION if(ENABLE_KOKKOS) add_definitions(-DRUN_KOKKOS) +# ENABLE_CUDA IS A RAJA PERFSUITE OPTION if(ENABLE_CUDA) set(Kokkos_ENABLE_CUDA ON) set(Kokkos_ARCH_VOLTA70 ON) #TODO: better diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index 06ced59d1..1ff3726f7 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -16,6 +16,9 @@ blt_add_library( DAXPY-KokkosCuda.cpp ATOMIC_PI-KokkosOMP.cpp ATOMIC_PI-KokkosSeq.cpp + IF_QUAD-KokkosSeq.cpp + IF_QUAD-KokkosOMP.cpp + IF_QUAD-KokkosCuda.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 45c7daa2c..37ad73d0f 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -19,11 +19,11 @@ blt_add_library( DAXPY-Cuda.cpp DAXPY-OMP.cpp DAXPY-OMPTarget.cpp - # IF_QUAD.cpp - # IF_QUAD-Seq.cpp - # IF_QUAD-Cuda.cpp - # IF_QUAD-OMP.cpp - # IF_QUAD-OMPTarget.cpp + IF_QUAD.cpp + IF_QUAD-Seq.cpp + IF_QUAD-Cuda.cpp + IF_QUAD-OMP.cpp + IF_QUAD-OMPTarget.cpp # INIT3.cpp # INIT3-Seq.cpp # INIT3-Cuda.cpp diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index 93e24b6f3..9c8f48697 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -69,6 +69,11 @@ class IF_QUAD : public KernelBase void runCudaVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void 
runKokkosOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_a; Real_ptr m_b; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 1c33de3e6..040ddd9d2 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -340,11 +340,12 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::DAXPY(run_params); break; } - /** + case Basic_IF_QUAD : { kernel = new basic::IF_QUAD(run_params); break; - } +} + /** case Basic_INIT3 : { kernel = new basic::INIT3(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 3d4c04aa7..b9cad0e0a 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -104,7 +104,7 @@ enum KernelID { // Basic_ATOMIC_PI = 0, Basic_DAXPY, - //Basic_IF_QUAD, + Basic_IF_QUAD, //Basic_INIT3, //Basic_INIT_VIEW1D, //Basic_INIT_VIEW1D_OFFSET, From cfda905b32d7682f39cda1a28736286bf6e704dc Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 17 Nov 2020 10:19:19 -0800 Subject: [PATCH 015/124] infrastructure changes and Kokkos_ATOMIC_PI --- src/basic-kokkos/CMakeLists.txt | 1 + src/basic/ATOMIC_PI.cpp | 4 ++ src/basic/IF_QUAD.cpp | 6 +++ src/common/RAJAPerfSuite.hpp | 74 +++++++++++++++------------------ 4 files changed, 44 insertions(+), 41 deletions(-) diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index 1ff3726f7..566b9d8d2 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -16,6 +16,7 @@ blt_add_library( DAXPY-KokkosCuda.cpp ATOMIC_PI-KokkosOMP.cpp ATOMIC_PI-KokkosSeq.cpp + ATOMIC_PI-KokkosCuda.cpp IF_QUAD-KokkosSeq.cpp IF_QUAD-KokkosOMP.cpp IF_QUAD-KokkosCuda.cpp diff --git a/src/basic/ATOMIC_PI.cpp b/src/basic/ATOMIC_PI.cpp index 7d9f8f834..c876147ba 100644 --- a/src/basic/ATOMIC_PI.cpp +++ b/src/basic/ATOMIC_PI.cpp @@ -24,6 +24,10 @@ ATOMIC_PI::ATOMIC_PI(const RunParams& params) setDefaultSize(3000); setDefaultReps(10000); + setVariantDefined( Kokkos_Lambda_Seq ); + 
setVariantDefined( Kokkos_Lambda_OpenMP ); + setVariantDefined( Kokkos_Lambda_CUDA ); + setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index bf8983b64..b9836a175 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -24,6 +24,12 @@ IF_QUAD::IF_QUAD(const RunParams& params) setDefaultSize(100000); setDefaultReps(1800); + + setVariantDefined( Kokkos_Lambda_Seq ); + setVariantDefined( Kokkos_Lambda_OpenMP ); + setVariantDefined( Kokkos_Lambda_CUDA ); + + setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index fd6d992cc..00036a681 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -105,45 +105,45 @@ enum KernelID { Basic_ATOMIC_PI = 0, Basic_DAXPY, Basic_IF_QUAD, - //Basic_INIT3, - //Basic_INIT_VIEW1D, - //Basic_INIT_VIEW1D_OFFSET, - //Basic_MULADDSUB, - //Basic_NESTED_INIT, - //Basic_REDUCE3_INT, - //Basic_TRAP_INT, + Basic_INIT3, + Basic_INIT_VIEW1D, + Basic_INIT_VIEW1D_OFFSET, + Basic_MULADDSUB, + Basic_NESTED_INIT, + Basic_REDUCE3_INT, + Basic_TRAP_INT, // // Lcals kernels... // - //Lcals_DIFF_PREDICT, - //Lcals_EOS, - //Lcals_FIRST_DIFF, - //Lcals_FIRST_MIN, - //Lcals_FIRST_SUM, - //Lcals_GEN_LIN_RECUR, - //Lcals_HYDRO_1D, - //Lcals_HYDRO_2D, - //Lcals_INT_PREDICT, - //Lcals_PLANCKIAN, - //Lcals_TRIDIAG_ELIM, + Lcals_DIFF_PREDICT, + Lcals_EOS, + Lcals_FIRST_DIFF, + Lcals_FIRST_MIN, + Lcals_FIRST_SUM, + Lcals_GEN_LIN_RECUR, + Lcals_HYDRO_1D, + Lcals_HYDRO_2D, + Lcals_INT_PREDICT, + Lcals_PLANCKIAN, + Lcals_TRIDIAG_ELIM, // // Polybench kernels... 
// - //Polybench_2MM, - //Polybench_3MM, - //Polybench_ADI, - //Polybench_ATAX, - //Polybench_FDTD_2D, - //Polybench_FLOYD_WARSHALL, - //Polybench_GEMM, - //Polybench_GEMVER, - //Polybench_GESUMMV, - //Polybench_HEAT_3D, - //Polybench_JACOBI_1D, - //Polybench_JACOBI_2D, - //Polybench_MVT, + Polybench_2MM, + Polybench_3MM, + Polybench_ADI, + Polybench_ATAX, + Polybench_FDTD_2D, + Polybench_FLOYD_WARSHALL, + Polybench_GEMM, + Polybench_GEMVER, + Polybench_GESUMMV, + Polybench_HEAT_3D, + Polybench_JACOBI_1D, + Polybench_JACOBI_2D, + Polybench_MVT, // // Stream kernels... @@ -202,29 +202,21 @@ enum VariantID { Base_HIP, RAJA_HIP, -#if defined(RUN_KOKKOS) -#if defined(RUN_RAJA_SEQ) Kokkos_Lambda_Seq, Kokkos_Functor_Seq, -#endif -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) Kokkos_Lambda_OpenMP, Kokkos_Functor_OpenMP, -#endif -#if defined(RAJA_ENABLE_TARGET_OPENMP) Kokkos_Lambda_OpenMPTarget, Kokkos_Functor_OpenMPTarget, -#endif -#if defined(RAJA_ENABLE_CUDA) Kokkos_Lambda_CUDA, Kokkos_Functor_CUDA, -#endif -#endif // RUN_KOKKOS + NumVariants // Keep this one last and NEVER comment out (!!) 
+ }; From b227e703a30daae6e9ed32ef47de55830c22ea18 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 17 Nov 2020 10:21:12 -0800 Subject: [PATCH 016/124] basic-kokkos IF_QUAD and ATOMIC_PI kernels --- src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp | 125 +++++++++++++++++++ src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp.bck | 85 +++++++++++++ src/basic-kokkos/ATOMIC_PI-OMPTarget.cpp | 103 +++++++++++++++ src/basic-kokkos/IF_QUAD-KokkosCuda.cpp | 120 ++++++++++++++++++ src/basic-kokkos/IF_QUAD-KokkosOMP.cpp | 85 +++++++++++++ src/basic-kokkos/IF_QUAD-KokkosSeq.cpp | 82 ++++++++++++ 6 files changed, 600 insertions(+) create mode 100644 src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp create mode 100644 src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp.bck create mode 100644 src/basic-kokkos/ATOMIC_PI-OMPTarget.cpp create mode 100644 src/basic-kokkos/IF_QUAD-KokkosCuda.cpp create mode 100644 src/basic-kokkos/IF_QUAD-KokkosOMP.cpp create mode 100644 src/basic-kokkos/IF_QUAD-KokkosSeq.cpp diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp new file mode 100644 index 000000000..63a77bfc6 --- /dev/null +++ b/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp @@ -0,0 +1,125 @@ +////~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC_PI.hpp" + +#include "RAJA/RAJA.hpp" + +//#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + + +#define ATOMIC_PI_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(pi, m_pi, 1); + +#define ATOMIC_PI_DATA_TEARDOWN_CUDA \ + deallocCudaDeviceData(pi); + +// AJP COMMENTED THIS DEF OUT; IT IS THE DEF OF A RAJA KERNEL +/*__global__ void atomic_pi(Real_ptr pi, + Real_type dx, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + } +} + +*/ +// AJP Kokkos-ifying here: + +void ATOMIC_PI::runKokkosCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + ATOMIC_PI_DATA_SETUP; + +#if defined(RUN_KOKKOS) + + if ( vid == Base_CUDA ) { + +#if defined (RUN_CUDA) + + ATOMIC_PI_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initCudaDeviceData(pi, &m_pi_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + atomic_pi<<>>( pi, dx, iend ); + + getCudaDeviceData(m_pi, pi, 1); + *m_pi *= 4.0; + + } + stopTimer(); + + ATOMIC_PI_DATA_TEARDOWN_CUDA; + +#endif //RUN_CUDA + + } else if ( vid == Kokkos_Lambda_CUDA ) { + + ATOMIC_PI_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initCudaDeviceData(pi, &m_pi_init, 1); + +// RAJA::forall< RAJA::cuda_exec >( +/* RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + }); +*/ + + Kokkos::parallel_for("Atomic_PI Cuda", 
Kokkos::RangePolicy(ibegin, iend), + // Here, function executes on the device / GPU, and copies by VALUE + // the "[=] __device__" indicates "KOKKOS_LAMBDA"; + // KOKKOS_LAMBDA = #define KOKKOS_LAMBDA[=]__device__ + [=] __device__ (Index_type i) { + double x = (double(i) + 0.5) * dx; + Kokkos::atomic_add(pi, dx / (1.0 + x * x)); +}); + getCudaDeviceData(m_pi, pi, 1); + *m_pi *= 4.0; + + }; + stopTimer(); + + ATOMIC_PI_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n ATOMIC_PI : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RUN_KOKKOS +//#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp.bck b/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp.bck new file mode 100644 index 000000000..d6cd9a1e6 --- /dev/null +++ b/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp.bck @@ -0,0 +1,85 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC_PI.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ +struct AtomicPIFunctor { + Real_type dx; + Real_ptr pi; + + AtomicPIFunctor(Real_type m_dx, Real_ptr m_pi) : ATOMIC_PI_FUNCTOR_CONSTRUCT {} +}; + + +void ATOMIC_PI::runKokkosOpenMPVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + ATOMIC_PI_DATA_SETUP; + +#if defined(RUN_KOKKOS) && defined(RUN_OPENMP) + switch ( vid ) { + + case Kokkos_Functor_OpenMP : { + + startTimer(); + //for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // *pi = m_pi_init; + // RAJA::forall( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // double x = (double(i) + 0.5) * dx; + // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + // }); + // *pi *= 4.0; + + //} + stopTimer(); + + break; + } + case Kokkos_Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + *pi = m_pi_init; + + Kokkos::parallel_for("name",Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(Index_type i){ + double x = ((double(i) + 0.5) * dx); + Kokkos::atomic_add(pi, dx / (1.0 + x * x)); + }); + *pi *= 4.0; + } + stopTimer(); + + break; + } + + + default : { + std::cout << "\n ATOMIC_PI : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/ATOMIC_PI-OMPTarget.cpp b/src/basic-kokkos/ATOMIC_PI-OMPTarget.cpp new file mode 100644 index 000000000..578b5ed99 --- /dev/null +++ b/src/basic-kokkos/ATOMIC_PI-OMPTarget.cpp @@ -0,0 +1,103 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. 
+// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC_PI.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define ATOMIC_PI_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(pi, m_pi, 1, did, hid); + +#define ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET \ + deallocOpenMPDeviceData(pi, did); + + +void ATOMIC_PI::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + ATOMIC_PI_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + ATOMIC_PI_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); + + #pragma omp target is_device_ptr(pi) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + double x = (double(i) + 0.5) * dx; + #pragma omp atomic + *pi += dx / (1.0 + x * x); + } + + getOpenMPDeviceData(m_pi, pi, 1, hid, did); + *m_pi *= 4.0; + + } + stopTimer(); + + ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + ATOMIC_PI_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + }); + + getOpenMPDeviceData(m_pi, pi, 1, hid, did); + 
*m_pi *= 4.0; + + } + stopTimer(); + + ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n ATOMIC_PI : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp b/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp new file mode 100644 index 000000000..658797702 --- /dev/null +++ b/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp @@ -0,0 +1,120 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "IF_QUAD.hpp" + +#include "RAJA/RAJA.hpp" + +//#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define IF_QUAD_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(a, m_a, iend); \ + allocAndInitCudaDeviceData(b, m_b, iend); \ + allocAndInitCudaDeviceData(c, m_c, iend); \ + allocAndInitCudaDeviceData(x1, m_x1, iend); \ + allocAndInitCudaDeviceData(x2, m_x2, iend); + +#define IF_QUAD_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_x1, x1, iend); \ + getCudaDeviceData(m_x2, x2, iend); \ + deallocCudaDeviceData(a); \ + deallocCudaDeviceData(b); \ + deallocCudaDeviceData(c); \ + deallocCudaDeviceData(x1); \ + deallocCudaDeviceData(x2); + +// AJP started Kokkos-ifying here +void IF_QUAD::runKokkosCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + IF_QUAD_DATA_SETUP; + +#if defined(RUN_KOKKOS) + + if ( vid == Base_CUDA ) { + +#if 
defined(RUN_CUDA) + + IF_QUAD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // QUESTION: Should "RAJA_DIVIDE_CEILING_INT be changed? + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + ifquad<<>>( x1, x2, a, b, c, + iend ); + + } + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_CUDA; + +#endif // RUN_CUDA + + } else if ( vid == Kokkos_Lambda_CUDA ) { +// } else if ( vid == RAJA_CUDA ) { + + IF_QUAD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +// RAJA::forall< RAJA::cuda_exec >( +// RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { +// IF_QUAD_BODY; +// }); + + + Kokkos::parallel_for("Quad Cuda", Kokkos::RangePolicy(ibegin, iend), + // Here, the function executes on the device / GPU + [=] __device__ (Index_type i) {IF_QUAD_BODY}); + //KOKKOS_LAMBDA (Index_type i) {IF_QUAD_BODY}); + + +// >( +// RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { +// IF_QUAD_BODY; +// }); + + + } + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n IF_QUAD : Unknown Cuda variant id = " << vid << std::endl; + } + + +#endif // RUN_KOKKOS +} + +} // end namespace basic +} // end namespace rajaperf + +//#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/IF_QUAD-KokkosOMP.cpp b/src/basic-kokkos/IF_QUAD-KokkosOMP.cpp new file mode 100644 index 000000000..c2e3bb006 --- /dev/null +++ b/src/basic-kokkos/IF_QUAD-KokkosOMP.cpp @@ -0,0 +1,85 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "IF_QUAD.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +// Refers to both Kokkos and Raja namespaces; we are defining methods on a class in the +namespace basic +{ + + +// Kokkos-ify here +//void IF_QUAD::runSeqVariant(VariantID vid) + +void IF_QUAD::runKokkosOpenMPVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + IF_QUAD_DATA_SETUP; + + auto ifquad_lam = [=](Index_type i) { + IF_QUAD_BODY; + }; + + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + // AJP added (following DAXPY example) -- + +//#if defined(RUN_KOKKOS) +//#if defined(RUN_OPENMP) + + +#if defined(RUN_OPENMP) + +//#if defined(RUN_RAJA_SEQ) + + case Kokkos_Lambda_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +/* RAJA::forall( + RAJA::RangeSegment(ibegin, iend), ifquad_lam); +*/ + // Translation + Kokkos::parallel_for("Quad", Kokkos::RangePolicy(ibegin, iend), + [=] (Index_type i) {IF_QUAD_BODY}); + + } + stopTimer(); + + break; + } +#endif // RUN_OPENMP + + default : { + std::cout << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + + + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/IF_QUAD-KokkosSeq.cpp b/src/basic-kokkos/IF_QUAD-KokkosSeq.cpp new file mode 100644 index 000000000..0b21faa73 --- /dev/null +++ b/src/basic-kokkos/IF_QUAD-KokkosSeq.cpp @@ -0,0 +1,82 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "IF_QUAD.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +// Kokkos-ify here +//void IF_QUAD::runSeqVariant(VariantID vid) + +void IF_QUAD::runKokkosSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + IF_QUAD_DATA_SETUP; + + auto ifquad_lam = [=](Index_type i) { + IF_QUAD_BODY; + }; + + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + // AJP added (following DAXPY example) -- + +//#if defined(RUN_KOKKOS) +//#if defined(RUN_OPENMP) + + +#if defined(RUN_RAJA_SEQ) + + case Kokkos_Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +/* RAJA::forall( + RAJA::RangeSegment(ibegin, iend), ifquad_lam); +*/ + // Translation + Kokkos::parallel_for("Quad", Kokkos::RangePolicy(ibegin, iend), + [=] (Index_type i) {IF_QUAD_BODY}); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + + + +} + +} // end namespace basic +} // end namespace rajaperf From b3ef7f5830b2284e8344960edc942d98d94da7cf Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 17 Nov 2020 11:16:03 -0800 Subject: [PATCH 017/124] Adding piles o' files --- src/basic-kokkos/ATOMIC_PI-KokkosHip.cpp | 107 +++++++++++ .../ATOMIC_PI-KokkosKokkosOMPTarget.cpp | 103 ++++++++++ src/basic-kokkos/DAXPY-KokkosHip.cpp | 99 ++++++++++ .../DAXPY-KokkosKokkosOMPTarget.cpp | 93 +++++++++ src/basic-kokkos/IF_QUAD-KokkosHip.cpp | 106 +++++++++++ .../IF_QUAD-KokkosKokkosOMPTarget.cpp | 99 ++++++++++ src/basic-kokkos/INIT3-KokkosCuda.cpp | 107 +++++++++++ src/basic-kokkos/INIT3-KokkosHip.cpp | 107 +++++++++++ .../INIT3-KokkosKokkosOMPTarget.cpp | 101 ++++++++++ 
src/basic-kokkos/INIT3-KokkosOMP.cpp | 93 +++++++++ src/basic-kokkos/INIT3-KokkosSeq.cpp | 90 +++++++++ src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp | 100 ++++++++++ src/basic-kokkos/INIT_VIEW1D-KokkosHip.cpp | 100 ++++++++++ .../INIT_VIEW1D-KokkosKokkosOMPTarget.cpp | 93 +++++++++ src/basic-kokkos/INIT_VIEW1D-KokkosOMP.cpp | 99 ++++++++++ src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp | 96 ++++++++++ .../INIT_VIEW1D_OFFSET-KokkosCuda.cpp | 101 ++++++++++ .../INIT_VIEW1D_OFFSET-KokkosHip.cpp | 101 ++++++++++ ...IT_VIEW1D_OFFSET-KokkosKokkosOMPTarget.cpp | 94 ++++++++++ .../INIT_VIEW1D_OFFSET-KokkosOMP.cpp | 99 ++++++++++ .../INIT_VIEW1D_OFFSET-KokkosSeq.cpp | 96 ++++++++++ src/basic-kokkos/MULADDSUB-KokkosCuda.cpp | 107 +++++++++++ src/basic-kokkos/MULADDSUB-KokkosHip.cpp | 107 +++++++++++ .../MULADDSUB-KokkosKokkosOMPTarget.cpp | 101 ++++++++++ src/basic-kokkos/MULADDSUB-KokkosOMP.cpp | 93 +++++++++ src/basic-kokkos/MULADDSUB-KokkosSeq.cpp | 90 +++++++++ src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp | 107 +++++++++++ src/basic-kokkos/NESTED_INIT-KokkosHip.cpp | 106 +++++++++++ .../NESTED_INIT-KokkosKokkosOMPTarget.cpp | 98 ++++++++++ src/basic-kokkos/NESTED_INIT-KokkosOMP.cpp | 134 +++++++++++++ src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp | 110 +++++++++++ src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp | 176 ++++++++++++++++++ src/basic-kokkos/REDUCE3_INT-KokkosHip.cpp | 175 +++++++++++++++++ .../REDUCE3_INT-KokkosKokkosOMPTarget.cpp | 110 +++++++++++ src/basic-kokkos/REDUCE3_INT-KokkosOMP.cpp | 126 +++++++++++++ src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp | 119 ++++++++++++ src/basic-kokkos/TRAP_INT-KokkosCuda.cpp | 158 ++++++++++++++++ src/basic-kokkos/TRAP_INT-KokkosHip.cpp | 157 ++++++++++++++++ .../TRAP_INT-KokkosKokkosOMPTarget.cpp | 111 +++++++++++ src/basic-kokkos/TRAP_INT-KokkosOMP.cpp | 122 ++++++++++++ src/basic-kokkos/TRAP_INT-KokkosSeq.cpp | 119 ++++++++++++ 41 files changed, 4510 insertions(+) create mode 100644 src/basic-kokkos/ATOMIC_PI-KokkosHip.cpp create 
mode 100644 src/basic-kokkos/ATOMIC_PI-KokkosKokkosOMPTarget.cpp create mode 100644 src/basic-kokkos/DAXPY-KokkosHip.cpp create mode 100644 src/basic-kokkos/DAXPY-KokkosKokkosOMPTarget.cpp create mode 100644 src/basic-kokkos/IF_QUAD-KokkosHip.cpp create mode 100644 src/basic-kokkos/IF_QUAD-KokkosKokkosOMPTarget.cpp create mode 100644 src/basic-kokkos/INIT3-KokkosCuda.cpp create mode 100644 src/basic-kokkos/INIT3-KokkosHip.cpp create mode 100644 src/basic-kokkos/INIT3-KokkosKokkosOMPTarget.cpp create mode 100644 src/basic-kokkos/INIT3-KokkosOMP.cpp create mode 100644 src/basic-kokkos/INIT3-KokkosSeq.cpp create mode 100644 src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp create mode 100644 src/basic-kokkos/INIT_VIEW1D-KokkosHip.cpp create mode 100644 src/basic-kokkos/INIT_VIEW1D-KokkosKokkosOMPTarget.cpp create mode 100644 src/basic-kokkos/INIT_VIEW1D-KokkosOMP.cpp create mode 100644 src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp create mode 100644 src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp create mode 100644 src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosHip.cpp create mode 100644 src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosKokkosOMPTarget.cpp create mode 100644 src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMP.cpp create mode 100644 src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp create mode 100644 src/basic-kokkos/MULADDSUB-KokkosCuda.cpp create mode 100644 src/basic-kokkos/MULADDSUB-KokkosHip.cpp create mode 100644 src/basic-kokkos/MULADDSUB-KokkosKokkosOMPTarget.cpp create mode 100644 src/basic-kokkos/MULADDSUB-KokkosOMP.cpp create mode 100644 src/basic-kokkos/MULADDSUB-KokkosSeq.cpp create mode 100644 src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp create mode 100644 src/basic-kokkos/NESTED_INIT-KokkosHip.cpp create mode 100644 src/basic-kokkos/NESTED_INIT-KokkosKokkosOMPTarget.cpp create mode 100644 src/basic-kokkos/NESTED_INIT-KokkosOMP.cpp create mode 100644 src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp create mode 100644 src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp create 
mode 100644 src/basic-kokkos/REDUCE3_INT-KokkosHip.cpp create mode 100644 src/basic-kokkos/REDUCE3_INT-KokkosKokkosOMPTarget.cpp create mode 100644 src/basic-kokkos/REDUCE3_INT-KokkosOMP.cpp create mode 100644 src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp create mode 100644 src/basic-kokkos/TRAP_INT-KokkosCuda.cpp create mode 100644 src/basic-kokkos/TRAP_INT-KokkosHip.cpp create mode 100644 src/basic-kokkos/TRAP_INT-KokkosKokkosOMPTarget.cpp create mode 100644 src/basic-kokkos/TRAP_INT-KokkosOMP.cpp create mode 100644 src/basic-kokkos/TRAP_INT-KokkosSeq.cpp diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosHip.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosHip.cpp new file mode 100644 index 000000000..e429c95ee --- /dev/null +++ b/src/basic-kokkos/ATOMIC_PI-KokkosHip.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC_PI.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define ATOMIC_PI_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(pi, m_pi, 1); + +#define ATOMIC_PI_DATA_TEARDOWN_HIP \ + deallocHipDeviceData(pi); + +__global__ void atomic_pi(Real_ptr pi, + Real_type dx, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + } +} + + +void ATOMIC_PI::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + ATOMIC_PI_DATA_SETUP; + + if ( vid == Base_HIP ) { + + ATOMIC_PI_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initHipDeviceData(pi, &m_pi_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL(atomic_pi,grid_size, block_size, 0, 0, pi, dx, iend ); + + getHipDeviceData(m_pi, pi, 1); + *m_pi *= 4.0; + + } + stopTimer(); + + ATOMIC_PI_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + ATOMIC_PI_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initHipDeviceData(pi, &m_pi_init, 1); + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + }); + + getHipDeviceData(m_pi, pi, 1); + *m_pi *= 4.0; + + } + stopTimer(); + + ATOMIC_PI_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n ATOMIC_PI : Unknown Hip variant id = " << vid << std::endl; + } +} 
+ +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosKokkosOMPTarget.cpp new file mode 100644 index 000000000..934415cd9 --- /dev/null +++ b/src/basic-kokkos/ATOMIC_PI-KokkosKokkosOMPTarget.cpp @@ -0,0 +1,103 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ATOMIC_PI.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define ATOMIC_PI_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(pi, m_pi, 1, did, hid); + +#define ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET \ + deallocOpenMPDeviceData(pi, did); + + +void ATOMIC_PI::runKokkosOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + ATOMIC_PI_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + ATOMIC_PI_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); + + #pragma omp target is_device_ptr(pi) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + double x = (double(i) + 0.5) * dx; + #pragma omp atomic + *pi += dx / (1.0 + x * x); + 
} + + getOpenMPDeviceData(m_pi, pi, 1, hid, did); + *m_pi *= 4.0; + + } + stopTimer(); + + ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + ATOMIC_PI_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + }); + + getOpenMPDeviceData(m_pi, pi, 1, hid, did); + *m_pi *= 4.0; + + } + stopTimer(); + + ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n ATOMIC_PI : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/DAXPY-KokkosHip.cpp b/src/basic-kokkos/DAXPY-KokkosHip.cpp new file mode 100644 index 000000000..8f3fb1ada --- /dev/null +++ b/src/basic-kokkos/DAXPY-KokkosHip.cpp @@ -0,0 +1,99 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define DAXPY_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(x, m_x, iend); \ + allocAndInitHipDeviceData(y, m_y, iend); + +#define DAXPY_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_y, y, iend); \ + deallocHipDeviceData(x); \ + deallocHipDeviceData(y); + +__global__ void daxpy(Real_ptr y, Real_ptr x, + Real_type a, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + DAXPY_BODY; + } +} + + +void DAXPY::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + DAXPY_DATA_SETUP; + + if ( vid == Base_HIP ) { + + DAXPY_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((daxpy),dim3(grid_size), dim3(block_size), 0, 0, y, x, a, + iend ); + + } + stopTimer(); + + DAXPY_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + DAXPY_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DAXPY_BODY; + }); + + } + stopTimer(); + + DAXPY_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n DAXPY : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/DAXPY-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/DAXPY-KokkosKokkosOMPTarget.cpp new file mode 100644 index 
000000000..98783a19f --- /dev/null +++ b/src/basic-kokkos/DAXPY-KokkosKokkosOMPTarget.cpp @@ -0,0 +1,93 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DAXPY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define DAXPY_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); + +#define DAXPY_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_y, y, iend, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); + + +void DAXPY::runKokkosOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + DAXPY_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + DAXPY_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(x, y) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + DAXPY_BODY; + } + + } + stopTimer(); + + DAXPY_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + DAXPY_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + 
RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + DAXPY_BODY; + }); + + } + stopTimer(); + + DAXPY_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n DAXPY : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/IF_QUAD-KokkosHip.cpp b/src/basic-kokkos/IF_QUAD-KokkosHip.cpp new file mode 100644 index 000000000..246cd30fd --- /dev/null +++ b/src/basic-kokkos/IF_QUAD-KokkosHip.cpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "IF_QUAD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define IF_QUAD_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(a, m_a, iend); \ + allocAndInitHipDeviceData(b, m_b, iend); \ + allocAndInitHipDeviceData(c, m_c, iend); \ + allocAndInitHipDeviceData(x1, m_x1, iend); \ + allocAndInitHipDeviceData(x2, m_x2, iend); + +#define IF_QUAD_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_x1, x1, iend); \ + getHipDeviceData(m_x2, x2, iend); \ + deallocHipDeviceData(a); \ + deallocHipDeviceData(b); \ + deallocHipDeviceData(c); \ + deallocHipDeviceData(x1); \ + deallocHipDeviceData(x2); + +__global__ void ifquad(Real_ptr x1, Real_ptr x2, + Real_ptr a, Real_ptr b, Real_ptr c, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + IF_QUAD_BODY; + } +} + + +void IF_QUAD::runHipVariant(VariantID vid) +{ + const 
Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + IF_QUAD_DATA_SETUP; + + if ( vid == Base_HIP ) { + + IF_QUAD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((ifquad), dim3(grid_size), dim3(block_size), 0, 0, x1, x2, a, b, c, + iend ); + + } + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + IF_QUAD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + IF_QUAD_BODY; + }); + + } + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n IF_QUAD : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/IF_QUAD-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/IF_QUAD-KokkosKokkosOMPTarget.cpp new file mode 100644 index 000000000..8a93dcd28 --- /dev/null +++ b/src/basic-kokkos/IF_QUAD-KokkosKokkosOMPTarget.cpp @@ -0,0 +1,99 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "IF_QUAD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define IF_QUAD_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ + allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); \ + allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); \ + allocAndInitOpenMPDeviceData(x1, m_x1, iend, did, hid); \ + allocAndInitOpenMPDeviceData(x2, m_x2, iend, did, hid); + +#define IF_QUAD_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_x1, x1, iend, hid, did); \ + getOpenMPDeviceData(m_x2, x2, iend, hid, did); \ + deallocOpenMPDeviceData(a, did); \ + deallocOpenMPDeviceData(b, did); \ + deallocOpenMPDeviceData(c, did); \ + deallocOpenMPDeviceData(x1, did); \ + deallocOpenMPDeviceData(x2, did); + +void IF_QUAD::runKokkosOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + IF_QUAD_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + IF_QUAD_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(a, b, c, x1, x2) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + IF_QUAD_BODY; + } + + } + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + IF_QUAD_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + 
RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + IF_QUAD_BODY; + }); + + } + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n IF_QUAD : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/INIT3-KokkosCuda.cpp b/src/basic-kokkos/INIT3-KokkosCuda.cpp new file mode 100644 index 000000000..14ee12f81 --- /dev/null +++ b/src/basic-kokkos/INIT3-KokkosCuda.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT3.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define INIT3_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(out1, m_out1, iend); \ + allocAndInitCudaDeviceData(out2, m_out2, iend); \ + allocAndInitCudaDeviceData(out3, m_out3, iend); \ + allocAndInitCudaDeviceData(in1, m_in1, iend); \ + allocAndInitCudaDeviceData(in2, m_in2, iend); + +#define INIT3_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_out1, out1, iend); \ + getCudaDeviceData(m_out2, out2, iend); \ + getCudaDeviceData(m_out3, out3, iend); \ + deallocCudaDeviceData(out1); \ + deallocCudaDeviceData(out2); \ + deallocCudaDeviceData(out3); \ + deallocCudaDeviceData(in1); \ + deallocCudaDeviceData(in2); + +__global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, + Real_ptr in1, Real_ptr in2, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + 
threadIdx.x; + if (i < iend) { + INIT3_BODY; + } +} + + +void INIT3::runKokkosCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INIT3_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + INIT3_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + init3<<>>( out1, out2, out3, in1, in2, + iend ); + + } + stopTimer(); + + INIT3_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + INIT3_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + INIT3_BODY; + }); + + } + stopTimer(); + + INIT3_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n INIT3 : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/INIT3-KokkosHip.cpp b/src/basic-kokkos/INIT3-KokkosHip.cpp new file mode 100644 index 000000000..9013a9c5a --- /dev/null +++ b/src/basic-kokkos/INIT3-KokkosHip.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT3.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define INIT3_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(out1, m_out1, iend); \ + allocAndInitHipDeviceData(out2, m_out2, iend); \ + allocAndInitHipDeviceData(out3, m_out3, iend); \ + allocAndInitHipDeviceData(in1, m_in1, iend); \ + allocAndInitHipDeviceData(in2, m_in2, iend); + +#define INIT3_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_out1, out1, iend); \ + getHipDeviceData(m_out2, out2, iend); \ + getHipDeviceData(m_out3, out3, iend); \ + deallocHipDeviceData(out1); \ + deallocHipDeviceData(out2); \ + deallocHipDeviceData(out3); \ + deallocHipDeviceData(in1); \ + deallocHipDeviceData(in2); + +__global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, + Real_ptr in1, Real_ptr in2, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + INIT3_BODY; + } +} + + +void INIT3::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INIT3_DATA_SETUP; + + if ( vid == Base_HIP ) { + + INIT3_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((init3), dim3(grid_size), dim3(block_size), 0, 0, out1, out2, out3, in1, in2, + iend ); + + } + stopTimer(); + + INIT3_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + INIT3_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ 
(Index_type i) { + INIT3_BODY; + }); + + } + stopTimer(); + + INIT3_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n INIT3 : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/INIT3-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/INIT3-KokkosKokkosOMPTarget.cpp new file mode 100644 index 000000000..c81db46de --- /dev/null +++ b/src/basic-kokkos/INIT3-KokkosKokkosOMPTarget.cpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT3.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define INIT3_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(out1, m_out1, iend, did, hid); \ + allocAndInitOpenMPDeviceData(out2, m_out2, iend, did, hid); \ + allocAndInitOpenMPDeviceData(out3, m_out3, iend, did, hid); \ + allocAndInitOpenMPDeviceData(in1, m_in1, iend, did, hid); \ + allocAndInitOpenMPDeviceData(in2, m_in2, iend, did, hid); + +#define INIT3_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_out1, out1, iend, hid, did); \ + getOpenMPDeviceData(m_out2, out2, iend, hid, did); \ + getOpenMPDeviceData(m_out3, out3, iend, hid, did); \ + deallocOpenMPDeviceData(out1, did); \ + deallocOpenMPDeviceData(out2, did); \ + deallocOpenMPDeviceData(out3, did); \ + deallocOpenMPDeviceData(in1, did); 
\ + deallocOpenMPDeviceData(in2, did); + + +void INIT3::runKokkosOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INIT3_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + INIT3_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(out1, out2, out3, in1, in2) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + INIT3_BODY; + } + + } + stopTimer(); + + INIT3_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + INIT3_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT3_BODY; + }); + + } + stopTimer(); + + INIT3_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n INIT3 : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/INIT3-KokkosOMP.cpp b/src/basic-kokkos/INIT3-KokkosOMP.cpp new file mode 100644 index 000000000..be5316252 --- /dev/null +++ b/src/basic-kokkos/INIT3-KokkosOMP.cpp @@ -0,0 +1,93 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT3.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void INIT3::runKokkosOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INIT3_DATA_SETUP; + + auto init3_lam = [=](Index_type i) { + INIT3_BODY; + }; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + INIT3_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + init3_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), init3_lam); + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n INIT3 : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/INIT3-KokkosSeq.cpp b/src/basic-kokkos/INIT3-KokkosSeq.cpp new file mode 100644 index 000000000..663bb0ae6 --- /dev/null +++ b/src/basic-kokkos/INIT3-KokkosSeq.cpp @@ -0,0 +1,90 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT3.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void INIT3::runKokkosSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INIT3_DATA_SETUP; + + auto init3_lam = [=](Index_type i) { + INIT3_BODY; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + INIT3_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + init3_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), init3_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n INIT3 : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp new file mode 100644 index 000000000..0415fbb17 --- /dev/null +++ b/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp @@ -0,0 +1,100 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define INIT_VIEW1D_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(a, m_a, getRunSize()); + +#define INIT_VIEW1D_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_a, a, getRunSize()); \ + deallocCudaDeviceData(a); + +__global__ void initview1d(Real_ptr a, + Real_type v, + const Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + INIT_VIEW1D_BODY; + } +} + + +void INIT_VIEW1D::runKokkosCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INIT_VIEW1D_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + INIT_VIEW1D_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + initview1d<<>>( a, + v, + iend ); + + } + stopTimer(); + + INIT_VIEW1D_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + INIT_VIEW1D_DATA_SETUP_CUDA; + + INIT_VIEW1D_VIEW_RAJA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + INIT_VIEW1D_BODY_RAJA; + }); + + } + stopTimer(); + + INIT_VIEW1D_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n INIT_VIEW1D : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosHip.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosHip.cpp new file mode 100644 index 
000000000..d7ed15a33 --- /dev/null +++ b/src/basic-kokkos/INIT_VIEW1D-KokkosHip.cpp @@ -0,0 +1,100 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define INIT_VIEW1D_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(a, m_a, iend); + +#define INIT_VIEW1D_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_a, a, iend); \ + deallocHipDeviceData(a); + +__global__ void initview1d(Real_ptr a, + Real_type v, + const Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + INIT_VIEW1D_BODY; + } +} + + +void INIT_VIEW1D::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INIT_VIEW1D_DATA_SETUP; + + if ( vid == Base_HIP ) { + + INIT_VIEW1D_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((initview1d), dim3(grid_size), dim3(block_size), 0, 0, a, + v, + iend ); + + } + stopTimer(); + + INIT_VIEW1D_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + INIT_VIEW1D_DATA_SETUP_HIP; + + INIT_VIEW1D_VIEW_RAJA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + INIT_VIEW1D_BODY_RAJA; + 
}); + + } + stopTimer(); + + INIT_VIEW1D_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n INIT_VIEW1D : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosKokkosOMPTarget.cpp new file mode 100644 index 000000000..ffe170c77 --- /dev/null +++ b/src/basic-kokkos/INIT_VIEW1D-KokkosKokkosOMPTarget.cpp @@ -0,0 +1,93 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define INIT_VIEW1D_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(a, m_a, getRunSize(), did, hid); + +#define INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_a, a, getRunSize(), hid, did); \ + deallocOpenMPDeviceData(a, did); + + +void INIT_VIEW1D::runKokkosOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INIT_VIEW1D_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + INIT_VIEW1D_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(a) device( did ) + #pragma omp teams distribute parallel for 
thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_BODY; + } + + } + stopTimer(); + + INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + INIT_VIEW1D_DATA_SETUP_OMP_TARGET; + + INIT_VIEW1D_VIEW_RAJA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT_VIEW1D_BODY_RAJA; + }); + + } + stopTimer(); + + INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n INIT_VIEW1D : Unknown OMP Targetvariant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosOMP.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosOMP.cpp new file mode 100644 index 000000000..0596813a2 --- /dev/null +++ b/src/basic-kokkos/INIT_VIEW1D-KokkosOMP.cpp @@ -0,0 +1,99 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void INIT_VIEW1D::runKokkosOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INIT_VIEW1D_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto initview1d_base_lam = [=](Index_type i) { + INIT_VIEW1D_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + initview1d_base_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + INIT_VIEW1D_VIEW_RAJA; + + auto initview1d_lam = [=](Index_type i) { + INIT_VIEW1D_BODY_RAJA; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), initview1d_lam); + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp new file mode 100644 index 000000000..fe54d7a18 --- /dev/null +++ b/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp @@ -0,0 +1,96 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project 
contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void INIT_VIEW1D::runKokkosSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INIT_VIEW1D_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto initview1d_base_lam = [=](Index_type i) { + INIT_VIEW1D_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + initview1d_base_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + INIT_VIEW1D_VIEW_RAJA; + + auto initview1d_lam = [=](Index_type i) { + INIT_VIEW1D_BODY_RAJA; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), initview1d_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp new file mode 100644 index 000000000..d71e67e9e --- /dev/null +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project 
contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D_OFFSET.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(a, m_a, getRunSize()); + +#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_a, a, getRunSize()); \ + deallocCudaDeviceData(a); + +__global__ void initview1d_offset(Real_ptr a, + Real_type v, + const Index_type ibegin, + const Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= ibegin && i < iend) { + INIT_VIEW1D_OFFSET_BODY; + } +} + + +void INIT_VIEW1D_OFFSET::runKokkosCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getRunSize()+1; + + INIT_VIEW1D_OFFSET_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + initview1d_offset<<>>( a, v, + ibegin, + iend ); + + } + stopTimer(); + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA; + + INIT_VIEW1D_OFFSET_VIEW_RAJA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + INIT_VIEW1D_OFFSET_BODY_RAJA; + }); + + } + stopTimer(); + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Cuda variant id = " << vid << std::endl; + 
} +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosHip.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosHip.cpp new file mode 100644 index 000000000..e568d1a78 --- /dev/null +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosHip.cpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D_OFFSET.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define INIT_VIEW1D_OFFSET_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(a, m_a, getRunSize()); + +#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_a, a, getRunSize()); \ + deallocHipDeviceData(a); + +__global__ void initview1d_offset(Real_ptr a, + Real_type v, + const Index_type ibegin, + const Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= ibegin && i < iend) { + INIT_VIEW1D_OFFSET_BODY; + } +} + + +void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getRunSize()+1; + + INIT_VIEW1D_OFFSET_DATA_SETUP; + + if ( vid == Base_HIP ) { + + INIT_VIEW1D_OFFSET_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((initview1d_offset), dim3(grid_size), dim3(block_size), 0, 0, a, v, + 
ibegin, + iend ); + + } + stopTimer(); + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + INIT_VIEW1D_OFFSET_DATA_SETUP_HIP; + + INIT_VIEW1D_OFFSET_VIEW_RAJA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + INIT_VIEW1D_OFFSET_BODY_RAJA; + }); + + } + stopTimer(); + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosKokkosOMPTarget.cpp new file mode 100644 index 000000000..285b3b69d --- /dev/null +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosKokkosOMPTarget.cpp @@ -0,0 +1,94 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D_OFFSET.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define INIT_VIEW1D_OFFSET_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(a, m_a, getRunSize(), did, hid); + +#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_a, a, getRunSize(), hid, did); \ + deallocOpenMPDeviceData(a, did); + + +void INIT_VIEW1D_OFFSET::runKokkosOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getRunSize()+1; + + INIT_VIEW1D_OFFSET_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + INIT_VIEW1D_OFFSET_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(a) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_OFFSET_BODY; + } + + } + stopTimer(); + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + INIT_VIEW1D_OFFSET_DATA_SETUP_OMP_TARGET; + + INIT_VIEW1D_OFFSET_VIEW_RAJA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT_VIEW1D_OFFSET_BODY_RAJA; + }); + + } + stopTimer(); + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n INIT_VIEW1D_OFFSET : Unknown OMP Targetvariant id = " << vid << std::endl; + } +} + +} // end namespace 
basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP + diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMP.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMP.cpp new file mode 100644 index 000000000..ac0577b96 --- /dev/null +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMP.cpp @@ -0,0 +1,99 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D_OFFSET.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void INIT_VIEW1D_OFFSET::runKokkosOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getRunSize()+1; + + INIT_VIEW1D_OFFSET_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_OFFSET_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto initview1doffset_base_lam = [=](Index_type i) { + INIT_VIEW1D_OFFSET_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + initview1doffset_base_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + INIT_VIEW1D_OFFSET_VIEW_RAJA; + + auto initview1doffset_lam = [=](Index_type i) { + INIT_VIEW1D_OFFSET_BODY_RAJA; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), 
initview1doffset_lam); + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp new file mode 100644 index 000000000..834d22bad --- /dev/null +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp @@ -0,0 +1,96 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D_OFFSET.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void INIT_VIEW1D_OFFSET::runKokkosSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getRunSize()+1; + + INIT_VIEW1D_OFFSET_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_OFFSET_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto initview1doffset_base_lam = [=](Index_type i) { + INIT_VIEW1D_OFFSET_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + initview1doffset_base_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + INIT_VIEW1D_OFFSET_VIEW_RAJA; + + auto initview1doffset_lam = [=](Index_type i) { + INIT_VIEW1D_OFFSET_BODY_RAJA; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; 
++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), initview1doffset_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp b/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp new file mode 100644 index 000000000..40506073d --- /dev/null +++ b/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULADDSUB.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define MULADDSUB_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(out1, m_out1, iend); \ + allocAndInitCudaDeviceData(out2, m_out2, iend); \ + allocAndInitCudaDeviceData(out3, m_out3, iend); \ + allocAndInitCudaDeviceData(in1, m_in1, iend); \ + allocAndInitCudaDeviceData(in2, m_in2, iend); + +#define MULADDSUB_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_out1, out1, iend); \ + getCudaDeviceData(m_out2, out2, iend); \ + getCudaDeviceData(m_out3, out3, iend); \ + deallocCudaDeviceData(out1); \ + deallocCudaDeviceData(out2); \ + deallocCudaDeviceData(out3); \ + deallocCudaDeviceData(in1); \ + deallocCudaDeviceData(in2); + +__global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, + Real_ptr in1, Real_ptr in2, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x 
+ threadIdx.x; + if (i < iend) { + MULADDSUB_BODY; + } +} + + +void MULADDSUB::runKokkosCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + MULADDSUB_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + MULADDSUB_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + muladdsub<<>>( out1, out2, out3, in1, in2, + iend ); + + } + stopTimer(); + + MULADDSUB_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + MULADDSUB_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + MULADDSUB_BODY; + }); + + } + stopTimer(); + + MULADDSUB_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n MULADDSUB : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/MULADDSUB-KokkosHip.cpp b/src/basic-kokkos/MULADDSUB-KokkosHip.cpp new file mode 100644 index 000000000..f999752ee --- /dev/null +++ b/src/basic-kokkos/MULADDSUB-KokkosHip.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULADDSUB.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define MULADDSUB_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(out1, m_out1, iend); \ + allocAndInitHipDeviceData(out2, m_out2, iend); \ + allocAndInitHipDeviceData(out3, m_out3, iend); \ + allocAndInitHipDeviceData(in1, m_in1, iend); \ + allocAndInitHipDeviceData(in2, m_in2, iend); + +#define MULADDSUB_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_out1, out1, iend); \ + getHipDeviceData(m_out2, out2, iend); \ + getHipDeviceData(m_out3, out3, iend); \ + deallocHipDeviceData(out1); \ + deallocHipDeviceData(out2); \ + deallocHipDeviceData(out3); \ + deallocHipDeviceData(in1); \ + deallocHipDeviceData(in2); + +__global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, + Real_ptr in1, Real_ptr in2, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + MULADDSUB_BODY; + } +} + + +void MULADDSUB::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + MULADDSUB_DATA_SETUP; + + if ( vid == Base_HIP ) { + + MULADDSUB_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((muladdsub), dim3(grid_size), dim3(block_size), 0, 0, out1, out2, out3, in1, in2, + iend ); + + } + stopTimer(); + + MULADDSUB_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + MULADDSUB_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::hip_exec >( + 
RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + MULADDSUB_BODY; + }); + + } + stopTimer(); + + MULADDSUB_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n MULADDSUB : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/MULADDSUB-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/MULADDSUB-KokkosKokkosOMPTarget.cpp new file mode 100644 index 000000000..ca664f0e8 --- /dev/null +++ b/src/basic-kokkos/MULADDSUB-KokkosKokkosOMPTarget.cpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULADDSUB.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define MULADDSUB_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(out1, m_out1, iend, did, hid); \ + allocAndInitOpenMPDeviceData(out2, m_out2, iend, did, hid); \ + allocAndInitOpenMPDeviceData(out3, m_out3, iend, did, hid); \ + allocAndInitOpenMPDeviceData(in1, m_in1, iend, did, hid); \ + allocAndInitOpenMPDeviceData(in2, m_in2, iend, did, hid); + +#define MULADDSUB_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_out1, out1, iend, hid, did); \ + getOpenMPDeviceData(m_out2, out2, iend, hid, did); \ + getOpenMPDeviceData(m_out3, out3, iend, hid, did); \ + deallocOpenMPDeviceData(out1, did); \ + deallocOpenMPDeviceData(out2, 
did); \ + deallocOpenMPDeviceData(out3, did); \ + deallocOpenMPDeviceData(in1, did); \ + deallocOpenMPDeviceData(in2, did); + + +void MULADDSUB::runKokkosOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + MULADDSUB_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + MULADDSUB_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(out1, out2, out3, in1, in2) device( did ) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + MULADDSUB_BODY; + } + + } + stopTimer(); + + MULADDSUB_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + MULADDSUB_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + MULADDSUB_BODY; + }); + + } + stopTimer(); + + MULADDSUB_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n MULADDSUB : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/MULADDSUB-KokkosOMP.cpp b/src/basic-kokkos/MULADDSUB-KokkosOMP.cpp new file mode 100644 index 000000000..9df7b0129 --- /dev/null +++ b/src/basic-kokkos/MULADDSUB-KokkosOMP.cpp @@ -0,0 +1,93 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULADDSUB.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void MULADDSUB::runKokkosOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + MULADDSUB_DATA_SETUP; + + auto mas_lam = [=](Index_type i) { + MULADDSUB_BODY; + }; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + MULADDSUB_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + mas_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), mas_lam); + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/MULADDSUB-KokkosSeq.cpp b/src/basic-kokkos/MULADDSUB-KokkosSeq.cpp new file mode 100644 index 000000000..868e537ae --- /dev/null +++ b/src/basic-kokkos/MULADDSUB-KokkosSeq.cpp @@ -0,0 +1,90 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULADDSUB.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void MULADDSUB::runKokkosSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + MULADDSUB_DATA_SETUP; + + auto mas_lam = [=](Index_type i) { + MULADDSUB_BODY; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + MULADDSUB_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + mas_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), mas_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp new file mode 100644 index 000000000..0d1a9648e --- /dev/null +++ b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NESTED_INIT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define NESTED_INIT_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(array, m_array, m_array_length); + +#define NESTED_INIT_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_array, array, m_array_length); \ + deallocCudaDeviceData(array); + +__global__ void nested_init(Real_ptr array, + Index_type ni, Index_type nj) +{ + Index_type i = threadIdx.x; + Index_type j = blockIdx.y; + Index_type k = blockIdx.z; + + NESTED_INIT_BODY; +} + + +void NESTED_INIT::runKokkosCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + NESTED_INIT_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + NESTED_INIT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + dim3 nthreads_per_block(ni, 1, 1); + dim3 nblocks(1, nj, nk); + + nested_init<<>>(array, + ni, nj); + + } + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + NESTED_INIT_DATA_SETUP_CUDA; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::CudaKernelAsync< + RAJA::statement::For<2, RAJA::cuda_block_z_loop, // k + RAJA::statement::For<1, RAJA::cuda_block_y_loop, // j + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i + RAJA::statement::Lambda<0> + > + > + > + > + >; + + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + [=] __device__ (Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }); + + } + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n NESTED_INIT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} 
// end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/NESTED_INIT-KokkosHip.cpp b/src/basic-kokkos/NESTED_INIT-KokkosHip.cpp new file mode 100644 index 000000000..1ba5b6b17 --- /dev/null +++ b/src/basic-kokkos/NESTED_INIT-KokkosHip.cpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NESTED_INIT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define NESTED_INIT_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(array, m_array, m_array_length); + +#define NESTED_INIT_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_array, array, m_array_length); \ + deallocHipDeviceData(array); + +__global__ void nested_init(Real_ptr array, + Index_type ni, Index_type nj) +{ + Index_type i = threadIdx.x; + Index_type j = blockIdx.y; + Index_type k = blockIdx.z; + + NESTED_INIT_BODY; +} + + +void NESTED_INIT::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + NESTED_INIT_DATA_SETUP; + + if ( vid == Base_HIP ) { + + NESTED_INIT_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + dim3 nthreads_per_block(ni, 1, 1); + dim3 nblocks(1, nj, nk); + + hipLaunchKernelGGL((nested_init), dim3(nblocks), dim3(nthreads_per_block), 0, 0, array, + ni, nj); + + } + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + NESTED_INIT_DATA_SETUP_HIP; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::HipKernelAsync< + RAJA::statement::For<2, RAJA::hip_block_z_loop, // k + 
RAJA::statement::For<1, RAJA::hip_block_y_loop, // j + RAJA::statement::For<0, RAJA::hip_thread_x_loop, // i + RAJA::statement::Lambda<0> + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + [=] __device__ (Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }); + + } + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n NESTED_INIT : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/NESTED_INIT-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/NESTED_INIT-KokkosKokkosOMPTarget.cpp new file mode 100644 index 000000000..d61173a47 --- /dev/null +++ b/src/basic-kokkos/NESTED_INIT-KokkosKokkosOMPTarget.cpp @@ -0,0 +1,98 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NESTED_INIT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define NESTED_INIT_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(array, m_array, m_array_length, did, hid); + +#define NESTED_INIT_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_array, array, m_array_length, hid, did); \ + deallocOpenMPDeviceData(array, did); + + +void NESTED_INIT::runKokkosOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + NESTED_INIT_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + NESTED_INIT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(array) device( did ) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(3) + for (Index_type k = 0; k < nk; ++k ) { + for (Index_type j = 0; j < nj; ++j ) { + for (Index_type i = 0; i < ni; ++i ) { + NESTED_INIT_BODY; + } + } + } + + } + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + NESTED_INIT_DATA_SETUP_OMP_TARGET; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::Collapse, // k, j, i + RAJA::statement::Lambda<0> + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + [=](Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }); + + } + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; + } +} + +} // end namespace 
basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/NESTED_INIT-KokkosOMP.cpp b/src/basic-kokkos/NESTED_INIT-KokkosOMP.cpp new file mode 100644 index 000000000..bfdaca2cd --- /dev/null +++ b/src/basic-kokkos/NESTED_INIT-KokkosOMP.cpp @@ -0,0 +1,134 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NESTED_INIT.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +//#define USE_OMP_COLLAPSE +#undef USE_OMP_COLLAPSE + + +void NESTED_INIT::runKokkosOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + + NESTED_INIT_DATA_SETUP; + + auto nestedinit_lam = [=](Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +#if defined(USE_OMP_COLLAPSE) + #pragma omp parallel for collapse(3) +#else + #pragma omp parallel for +#endif + for (Index_type k = 0; k < nk; ++k ) { + for (Index_type j = 0; j < nj; ++j ) { + for (Index_type i = 0; i < ni; ++i ) { + NESTED_INIT_BODY; + } + } + } + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +#if defined(USE_OMP_COLLAPSE) + #pragma omp parallel for collapse(3) +#else + #pragma omp parallel for +#endif + for (Index_type k = 0; k < nk; ++k ) { + for (Index_type j = 0; j < nj; ++j ) { + for (Index_type i = 0; i < ni; ++i ) { + nestedinit_lam(i, j, k); + } + } + } + + } + stopTimer(); + + break; + } + + case 
RAJA_OpenMP : { + +#if defined(USE_OMP_COLLAPSE) + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::Collapse, // k, j, i + RAJA::statement::Lambda<0> + > + >; +#else + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k + RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::For<0, RAJA::loop_exec, // i + RAJA::statement::Lambda<0> + > + > + > + >; +#endif + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + nestedinit_lam + ); + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp b/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp new file mode 100644 index 000000000..92308a0d8 --- /dev/null +++ b/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp @@ -0,0 +1,110 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NESTED_INIT.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +void NESTED_INIT::runKokkosSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + NESTED_INIT_DATA_SETUP; + + auto nestedinit_lam = [=](Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type k = 0; k < nk; ++k ) { + for (Index_type j = 0; j < nj; ++j ) { + for (Index_type i = 0; i < ni; ++i ) { + NESTED_INIT_BODY; + } + } + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type k = 0; k < nk; ++k ) { + for (Index_type j = 0; j < nj; ++j ) { + for (Index_type i = 0; i < ni; ++i ) { + nestedinit_lam(i, j, k); + } + } + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<2, RAJA::loop_exec, // k + RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::For<0, RAJA::loop_exec,// i + RAJA::statement::Lambda<0> + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + nestedinit_lam + ); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp new file mode 100644 index 000000000..f5b86c961 --- /dev/null +++ 
b/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp @@ -0,0 +1,176 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE3_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define REDUCE3_INT_DATA_SETUP_CUDA \ + allocAndInitCudaDeviceData(vec, m_vec, iend); + +#define REDUCE3_INT_DATA_TEARDOWN_CUDA \ + deallocCudaDeviceData(vec); + + +__global__ void reduce3int(Int_ptr vec, + Int_ptr vsum, Int_type vsum_init, + Int_ptr vmin, Int_type vmin_init, + Int_ptr vmax, Int_type vmax_init, + Index_type iend) +{ + extern __shared__ Int_type psum[ ]; + Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; + Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; + + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + + psum[ threadIdx.x ] = vsum_init; + pmin[ threadIdx.x ] = vmin_init; + pmax[ threadIdx.x ] = vmax_init; + + for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + psum[ threadIdx.x ] += vec[ i ]; + pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); + pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); + } + __syncthreads(); + + for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; + pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); + pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], pmax[ threadIdx.x + i ] ); + } + __syncthreads(); + } + +#if 1 // serialized access to shared data; + if ( 
threadIdx.x == 0 ) { + RAJA::atomicAdd( vsum, psum[ 0 ] ); + RAJA::atomicMin( vmin, pmin[ 0 ] ); + RAJA::atomicMax( vmax, pmax[ 0 ] ); + } +#else // this doesn't work due to data races + if ( threadIdx.x == 0 ) { + *vsum += psum[ 0 ]; + *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); + *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); + } +#endif +} + + +void REDUCE3_INT::runKokkosCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + REDUCE3_INT_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + REDUCE3_INT_DATA_SETUP_CUDA; + + Int_ptr vsum; + allocAndInitCudaDeviceData(vsum, &m_vsum_init, 1); + Int_ptr vmin; + allocAndInitCudaDeviceData(vmin, &m_vmin_init, 1); + Int_ptr vmax; + allocAndInitCudaDeviceData(vmax, &m_vmax_init, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initCudaDeviceData(vsum, &m_vsum_init, 1); + initCudaDeviceData(vmin, &m_vmin_init, 1); + initCudaDeviceData(vmax, &m_vmax_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + reduce3int<<>>(vec, + vsum, m_vsum_init, + vmin, m_vmin_init, + vmax, m_vmax_init, + iend ); + + Int_type lsum; + Int_ptr plsum = &lsum; + getCudaDeviceData(plsum, vsum, 1); + m_vsum += lsum; + + Int_type lmin; + Int_ptr plmin = &lmin; + getCudaDeviceData(plmin, vmin, 1); + m_vmin = RAJA_MIN(m_vmin, lmin); + + Int_type lmax; + Int_ptr plmax = &lmax; + getCudaDeviceData(plmax, vmax, 1); + m_vmax = RAJA_MAX(m_vmax, lmax); + + } + stopTimer(); + + REDUCE3_INT_DATA_TEARDOWN_CUDA; + + deallocCudaDeviceData(vsum); + deallocCudaDeviceData(vmin); + deallocCudaDeviceData(vmax); + + } else if ( vid == RAJA_CUDA ) { + + REDUCE3_INT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, 
iend), [=] __device__ (Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + REDUCE3_INT_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosHip.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosHip.cpp new file mode 100644 index 000000000..2646ff547 --- /dev/null +++ b/src/basic-kokkos/REDUCE3_INT-KokkosHip.cpp @@ -0,0 +1,175 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE3_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define REDUCE3_INT_DATA_SETUP_HIP \ + allocAndInitHipDeviceData(vec, m_vec, iend); + +#define REDUCE3_INT_DATA_TEARDOWN_HIP \ + deallocHipDeviceData(vec); + + +__global__ void reduce3int(Int_ptr vec, + Int_ptr vsum, Int_type vsum_init, + Int_ptr vmin, Int_type vmin_init, + Int_ptr vmax, Int_type vmax_init, + Index_type iend) +{ + HIP_DYNAMIC_SHARED( Int_type, psum) + Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; + Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; + + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + + psum[ threadIdx.x ] = vsum_init; + pmin[ threadIdx.x ] = vmin_init; + pmax[ threadIdx.x ] = 
vmax_init; + + for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + psum[ threadIdx.x ] += vec[ i ]; + pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); + pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); + } + __syncthreads(); + + for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; + pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); + pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], pmax[ threadIdx.x + i ] ); + } + __syncthreads(); + } + +#if 1 // serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::atomicAdd( vsum, psum[ 0 ] ); + RAJA::atomicMin( vmin, pmin[ 0 ] ); + RAJA::atomicMax( vmax, pmax[ 0 ] ); + } +#else // this doesn't work due to data races + if ( threadIdx.x == 0 ) { + *vsum += psum[ 0 ]; + *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); + *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); + } +#endif +} + + +void REDUCE3_INT::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + REDUCE3_INT_DATA_SETUP; + + if ( vid == Base_HIP ) { + + REDUCE3_INT_DATA_SETUP_HIP; + + Int_ptr vsum; + allocAndInitHipDeviceData(vsum, &m_vsum_init, 1); + Int_ptr vmin; + allocAndInitHipDeviceData(vmin, &m_vmin_init, 1); + Int_ptr vmax; + allocAndInitHipDeviceData(vmax, &m_vmax_init, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initHipDeviceData(vsum, &m_vsum_init, 1); + initHipDeviceData(vmin, &m_vmin_init, 1); + initHipDeviceData(vmax, &m_vmax_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), 3*sizeof(Int_type)*block_size, 0, vec, + vsum, m_vsum_init, + vmin, m_vmin_init, + vmax, m_vmax_init, + iend ); + + Int_type lsum; + Int_ptr plsum = &lsum; + getHipDeviceData(plsum, vsum, 1); + m_vsum += lsum; + + Int_type 
lmin; + Int_ptr plmin = &lmin; + getHipDeviceData(plmin, vmin, 1); + m_vmin = RAJA_MIN(m_vmin, lmin); + + Int_type lmax; + Int_ptr plmax = &lmax; + getHipDeviceData(plmax, vmax, 1); + m_vmax = RAJA_MAX(m_vmax, lmax); + + } + stopTimer(); + + REDUCE3_INT_DATA_TEARDOWN_HIP; + + deallocHipDeviceData(vsum); + deallocHipDeviceData(vmin); + deallocHipDeviceData(vmax); + + } else if ( vid == RAJA_HIP ) { + + REDUCE3_INT_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + REDUCE3_INT_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosKokkosOMPTarget.cpp new file mode 100644 index 000000000..b96b05794 --- /dev/null +++ b/src/basic-kokkos/REDUCE3_INT-KokkosKokkosOMPTarget.cpp @@ -0,0 +1,110 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE3_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define REDUCE3_INT_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + allocAndInitOpenMPDeviceData(vec, m_vec, iend, did, hid); + +#define REDUCE3_INT_DATA_TEARDOWN_OMP_TARGET \ + deallocOpenMPDeviceData(vec, did); \ + + +void REDUCE3_INT::runKokkosOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + REDUCE3_INT_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + REDUCE3_INT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type vsum = m_vsum_init; + Int_type vmin = m_vmin_init; + Int_type vmax = m_vmax_init; + + #pragma omp target is_device_ptr(vec) device( did ) map(tofrom:vsum, vmin, vmax) + #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static,1) \ + reduction(+:vsum) \ + reduction(min:vmin) \ + reduction(max:vmax) + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE3_INT_BODY; + } + + m_vsum += vsum; + m_vmin = RAJA_MIN(m_vmin, vmin); + m_vmax = RAJA_MAX(m_vmax, vmax); + + } + stopTimer(); + + REDUCE3_INT_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + REDUCE3_INT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + 
REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + REDUCE3_INT_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n REDUCE3_INT : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosOMP.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosOMP.cpp new file mode 100644 index 000000000..07cc5a2b6 --- /dev/null +++ b/src/basic-kokkos/REDUCE3_INT-KokkosOMP.cpp @@ -0,0 +1,126 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE3_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include + +namespace rajaperf +{ +namespace basic +{ + + +void REDUCE3_INT::runKokkosOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + REDUCE3_INT_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type vsum = m_vsum_init; + Int_type vmin = m_vmin_init; + Int_type vmax = m_vmax_init; + + #pragma omp parallel for reduction(+:vsum), \ + reduction(min:vmin), \ + reduction(max:vmax) + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE3_INT_BODY; + } + + m_vsum += vsum; + m_vmin = RAJA_MIN(m_vmin, vmin); + m_vmax = RAJA_MAX(m_vmax, vmax); + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto 
reduce3int_base_lam = [=](Index_type i) -> Int_type { + return vec[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type vsum = m_vsum_init; + Int_type vmin = m_vmin_init; + Int_type vmax = m_vmax_init; + + #pragma omp parallel for reduction(+:vsum), \ + reduction(min:vmin), \ + reduction(max:vmax) + for (Index_type i = ibegin; i < iend; ++i ) { + vsum += reduce3int_base_lam(i); + vmin = RAJA_MIN(vmin, reduce3int_base_lam(i)); + vmax = RAJA_MAX(vmax, reduce3int_base_lam(i)); + } + + m_vsum += vsum; + m_vmin = RAJA_MIN(m_vmin, vmin); + m_vmax = RAJA_MAX(m_vmax, vmax); + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp new file mode 100644 index 000000000..5635a4d49 --- /dev/null +++ b/src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp @@ -0,0 +1,119 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE3_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include + +namespace rajaperf +{ +namespace basic +{ + + +void REDUCE3_INT::runKokkosSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + REDUCE3_INT_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type vsum = m_vsum_init; + Int_type vmin = m_vmin_init; + Int_type vmax = m_vmax_init; + + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE3_INT_BODY; + } + + m_vsum += vsum; + m_vmin = RAJA_MIN(m_vmin, vmin); + m_vmax = RAJA_MAX(m_vmax, vmax); + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto init3_base_lam = [=](Index_type i) -> Int_type { + return vec[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type vsum = m_vsum_init; + Int_type vmin = m_vmin_init; + Int_type vmax = m_vmax_init; + + for (Index_type i = ibegin; i < iend; ++i ) { + vsum += init3_base_lam(i); + vmin = RAJA_MIN(vmin, init3_base_lam(i)); + vmax = RAJA_MAX(vmax, init3_base_lam(i)); + } + + m_vsum += vsum; + m_vmin = RAJA_MIN(m_vmin, vmin); + m_vmax = RAJA_MAX(m_vmax, vmax); + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum vsum(m_vsum_init); + RAJA::ReduceMin vmin(m_vmin_init); + RAJA::ReduceMax vmax(m_vmax_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + 
default : { + std::cout << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp b/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp new file mode 100644 index 000000000..ccd1c596d --- /dev/null +++ b/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp @@ -0,0 +1,158 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRAP_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Function used in TRAP_INT loop. +// +RAJA_INLINE +RAJA_DEVICE +Real_type trap_int_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); + denom = 1.0/sqrt(denom); + return denom; +} + + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define TRAP_INT_DATA_SETUP_CUDA // nothing to do here... + +#define TRAP_INT_DATA_TEARDOWN_CUDA // nothing to do here... 
+ + +__global__ void trapint(Real_type x0, Real_type xp, + Real_type y, Real_type yp, + Real_type h, + Real_ptr sumx, + Index_type iend) +{ + extern __shared__ Real_type psumx[ ]; + + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + + psumx[ threadIdx.x ] = 0.0; + for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + Real_type x = x0 + i*h; + Real_type val = trap_int_func(x, y, xp, yp); + psumx[ threadIdx.x ] += val; + } + __syncthreads(); + + for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; + } + __syncthreads(); + } + +#if 1 // serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::atomicAdd( sumx, psumx[ 0 ] ); + } +#else // this doesn't work due to data races + if ( threadIdx.x == 0 ) { + *sumx += psumx[ 0 ]; + } +#endif + +} + + +void TRAP_INT::runKokkosCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + TRAP_INT_DATA_SETUP; + + if ( vid == Base_CUDA ) { + + TRAP_INT_DATA_SETUP_CUDA; + + Real_ptr sumx; + allocAndInitCudaDeviceData(sumx, &m_sumx_init, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initCudaDeviceData(sumx, &m_sumx_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + trapint<<>>(x0, xp, + y, yp, + h, + sumx, + iend); + + Real_type lsumx; + Real_ptr plsumx = &lsumx; + getCudaDeviceData(plsumx, sumx, 1); + m_sumx += lsumx * h; + + } + stopTimer(); + + deallocCudaDeviceData(sumx); + + TRAP_INT_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + TRAP_INT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + 
TRAP_INT_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/TRAP_INT-KokkosHip.cpp b/src/basic-kokkos/TRAP_INT-KokkosHip.cpp new file mode 100644 index 000000000..346050c3a --- /dev/null +++ b/src/basic-kokkos/TRAP_INT-KokkosHip.cpp @@ -0,0 +1,157 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRAP_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Function used in TRAP_INT loop. +// +RAJA_INLINE +RAJA_DEVICE +Real_type trap_int_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); + denom = 1.0/sqrt(denom); + return denom; +} + + + // + // Define thread block size for HIP execution + // + const size_t block_size = 256; + + +#define TRAP_INT_DATA_SETUP_HIP // nothing to do here... + +#define TRAP_INT_DATA_TEARDOWN_HIP // nothing to do here... 
+ + +__global__ void trapint(Real_type x0, Real_type xp, + Real_type y, Real_type yp, + Real_type h, + Real_ptr sumx, + Index_type iend) +{ + HIP_DYNAMIC_SHARED( Real_type, psumx) + + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + + psumx[ threadIdx.x ] = 0.0; + for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + Real_type x = x0 + i*h; + Real_type val = trap_int_func(x, y, xp, yp); + psumx[ threadIdx.x ] += val; + } + __syncthreads(); + + for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; + } + __syncthreads(); + } + +#if 1 // serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::atomicAdd( sumx, psumx[ 0 ] ); + } +#else // this doesn't work due to data races + if ( threadIdx.x == 0 ) { + *sumx += psumx[ 0 ]; + } +#endif + +} + + +void TRAP_INT::runHipVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + TRAP_INT_DATA_SETUP; + + if ( vid == Base_HIP ) { + + TRAP_INT_DATA_SETUP_HIP; + + Real_ptr sumx; + allocAndInitHipDeviceData(sumx, &m_sumx_init, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initHipDeviceData(sumx, &m_sumx_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hipLaunchKernelGGL((trapint), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, x0, xp, + y, yp, + h, + sumx, + iend); + + Real_type lsumx; + Real_ptr plsumx = &lsumx; + getHipDeviceData(plsumx, sumx, 1); + m_sumx += lsumx * h; + + } + stopTimer(); + + deallocHipDeviceData(sumx); + + TRAP_INT_DATA_TEARDOWN_HIP; + + } else if ( vid == RAJA_HIP ) { + + TRAP_INT_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall< RAJA::hip_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRAP_INT_BODY; + }); + + 
m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + TRAP_INT_DATA_TEARDOWN_HIP; + + } else { + std::cout << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/TRAP_INT-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/TRAP_INT-KokkosKokkosOMPTarget.cpp new file mode 100644 index 000000000..7ac80bdbb --- /dev/null +++ b/src/basic-kokkos/TRAP_INT-KokkosKokkosOMPTarget.cpp @@ -0,0 +1,111 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRAP_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Function used in TRAP_INT loop. +// +RAJA_INLINE +Real_type trap_int_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); + denom = 1.0/sqrt(denom); + return denom; +} + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + + +#define TRAP_INT_DATA_SETUP_OMP_TARGET // nothing to do here... + +#define TRAP_INT_DATA_TEARDOWN_OMP_TARGET // nothing to do here... 
+ + +void TRAP_INT::runKokkosOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + TRAP_INT_DATA_SETUP; + + if ( vid == Base_OpenMPTarget ) { + + TRAP_INT_DATA_SETUP_OMP_TARGET; + + #pragma omp target enter data map(to:x0,xp,y,yp,h) + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sumx = m_sumx_init; + + #pragma omp target teams distribute parallel for map(tofrom: sumx) reduction(+:sumx) \ + thread_limit(threads_per_team) schedule(static, 1) + + for (Index_type i = ibegin; i < iend; ++i ) { + TRAP_INT_BODY; + } + + m_sumx += sumx * h; + + } + stopTimer(); + + #pragma omp target exit data map(delete: x0,xp,y,yp,h) + + } else if ( vid == RAJA_OpenMPTarget ) { + + TRAP_INT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + TRAP_INT_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n TRAP_INT : Unknown OMP Targetvariant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/TRAP_INT-KokkosOMP.cpp b/src/basic-kokkos/TRAP_INT-KokkosOMP.cpp new file mode 100644 index 000000000..94f8f2f3f --- /dev/null +++ b/src/basic-kokkos/TRAP_INT-KokkosOMP.cpp @@ -0,0 +1,122 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRAP_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Function used in TRAP_INT loop. +// +RAJA_INLINE +Real_type trap_int_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); + denom = 1.0/sqrt(denom); + return denom; +} + + +void TRAP_INT::runKokkosOpenMPVariant(VariantID vid) +{ +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + TRAP_INT_DATA_SETUP; + + switch ( vid ) { + + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sumx = m_sumx_init; + + #pragma omp parallel for reduction(+:sumx) + for (Index_type i = ibegin; i < iend; ++i ) { + TRAP_INT_BODY; + } + + m_sumx += sumx * h; + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP : { + + auto trapint_base_lam = [=](Index_type i) -> Real_type { + Real_type x = x0 + i*h; + return trap_int_func(x, y, xp, yp); + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sumx = m_sumx_init; + + #pragma omp parallel for reduction(+:sumx) + for (Index_type i = ibegin; i < iend; ++i ) { + sumx += trapint_base_lam(i); + } + + m_sumx += sumx * h; + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; + } + + } + +#endif +} + +} // end namespace basic 
+} // end namespace rajaperf diff --git a/src/basic-kokkos/TRAP_INT-KokkosSeq.cpp b/src/basic-kokkos/TRAP_INT-KokkosSeq.cpp new file mode 100644 index 000000000..f4859927c --- /dev/null +++ b/src/basic-kokkos/TRAP_INT-KokkosSeq.cpp @@ -0,0 +1,119 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRAP_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Function used in TRAP_INT loop. +// +RAJA_INLINE +Real_type trap_int_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); + denom = 1.0/sqrt(denom); + return denom; +} + + +void TRAP_INT::runKokkosSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + TRAP_INT_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sumx = m_sumx_init; + + for (Index_type i = ibegin; i < iend; ++i ) { + TRAP_INT_BODY; + } + + m_sumx += sumx * h; + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto trapint_base_lam = [=](Index_type i) -> Real_type { + Real_type x = x0 + i*h; + return trap_int_func(x, y, xp, yp); + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sumx = m_sumx_init; + + for (Index_type i = ibegin; i < iend; ++i ) { + sumx += trapint_base_lam(i); + } + + m_sumx += sumx * h; + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < 
run_reps; ++irep) { + + RAJA::ReduceSum sumx(m_sumx_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace basic +} // end namespace rajaperf From 104ac3d77c3ddb1ff0bee2f713359b356e79266b Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 17 Nov 2020 11:17:15 -0800 Subject: [PATCH 018/124] Removed extra files --- src/basic-kokkos/ATOMIC_PI-KokkosHip.cpp | 107 ----------- src/basic-kokkos/DAXPY-KokkosHip.cpp | 99 ---------- src/basic-kokkos/IF_QUAD-KokkosHip.cpp | 106 ----------- src/basic-kokkos/INIT3-KokkosHip.cpp | 107 ----------- src/basic-kokkos/INIT_VIEW1D-KokkosHip.cpp | 100 ---------- .../INIT_VIEW1D_OFFSET-KokkosHip.cpp | 101 ---------- src/basic-kokkos/MULADDSUB-KokkosHip.cpp | 107 ----------- src/basic-kokkos/NESTED_INIT-KokkosHip.cpp | 106 ----------- src/basic-kokkos/REDUCE3_INT-KokkosHip.cpp | 175 ------------------ src/basic-kokkos/TRAP_INT-KokkosHip.cpp | 157 ---------------- 10 files changed, 1165 deletions(-) delete mode 100644 src/basic-kokkos/ATOMIC_PI-KokkosHip.cpp delete mode 100644 src/basic-kokkos/DAXPY-KokkosHip.cpp delete mode 100644 src/basic-kokkos/IF_QUAD-KokkosHip.cpp delete mode 100644 src/basic-kokkos/INIT3-KokkosHip.cpp delete mode 100644 src/basic-kokkos/INIT_VIEW1D-KokkosHip.cpp delete mode 100644 src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosHip.cpp delete mode 100644 src/basic-kokkos/MULADDSUB-KokkosHip.cpp delete mode 100644 src/basic-kokkos/NESTED_INIT-KokkosHip.cpp delete mode 100644 src/basic-kokkos/REDUCE3_INT-KokkosHip.cpp delete mode 100644 src/basic-kokkos/TRAP_INT-KokkosHip.cpp diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosHip.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosHip.cpp deleted file mode 100644 index e429c95ee..000000000 --- 
a/src/basic-kokkos/ATOMIC_PI-KokkosHip.cpp +++ /dev/null @@ -1,107 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "ATOMIC_PI.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - -#define ATOMIC_PI_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(pi, m_pi, 1); - -#define ATOMIC_PI_DATA_TEARDOWN_HIP \ - deallocHipDeviceData(pi); - -__global__ void atomic_pi(Real_ptr pi, - Real_type dx, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - } -} - - -void ATOMIC_PI::runHipVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - ATOMIC_PI_DATA_SETUP; - - if ( vid == Base_HIP ) { - - ATOMIC_PI_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initHipDeviceData(pi, &m_pi_init, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL(atomic_pi,grid_size, block_size, 0, 0, pi, dx, iend ); - - getHipDeviceData(m_pi, pi, 1); - *m_pi *= 4.0; - - } - stopTimer(); - - ATOMIC_PI_DATA_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - ATOMIC_PI_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initHipDeviceData(pi, &m_pi_init, 1); - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, 
iend), [=] __device__ (Index_type i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - }); - - getHipDeviceData(m_pi, pi, 1); - *m_pi *= 4.0; - - } - stopTimer(); - - ATOMIC_PI_DATA_TEARDOWN_HIP; - - } else { - std::cout << "\n ATOMIC_PI : Unknown Hip variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/DAXPY-KokkosHip.cpp b/src/basic-kokkos/DAXPY-KokkosHip.cpp deleted file mode 100644 index 8f3fb1ada..000000000 --- a/src/basic-kokkos/DAXPY-KokkosHip.cpp +++ /dev/null @@ -1,99 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "DAXPY.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - -#define DAXPY_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(x, m_x, iend); \ - allocAndInitHipDeviceData(y, m_y, iend); - -#define DAXPY_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_y, y, iend); \ - deallocHipDeviceData(x); \ - deallocHipDeviceData(y); - -__global__ void daxpy(Real_ptr y, Real_ptr x, - Real_type a, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - DAXPY_BODY; - } -} - - -void DAXPY::runHipVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - DAXPY_DATA_SETUP; - - if ( vid == Base_HIP ) { - - DAXPY_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type 
irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((daxpy),dim3(grid_size), dim3(block_size), 0, 0, y, x, a, - iend ); - - } - stopTimer(); - - DAXPY_DATA_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - DAXPY_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - DAXPY_BODY; - }); - - } - stopTimer(); - - DAXPY_DATA_TEARDOWN_HIP; - - } else { - std::cout << "\n DAXPY : Unknown Hip variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/IF_QUAD-KokkosHip.cpp b/src/basic-kokkos/IF_QUAD-KokkosHip.cpp deleted file mode 100644 index 246cd30fd..000000000 --- a/src/basic-kokkos/IF_QUAD-KokkosHip.cpp +++ /dev/null @@ -1,106 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "IF_QUAD.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - -#define IF_QUAD_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(a, m_a, iend); \ - allocAndInitHipDeviceData(b, m_b, iend); \ - allocAndInitHipDeviceData(c, m_c, iend); \ - allocAndInitHipDeviceData(x1, m_x1, iend); \ - allocAndInitHipDeviceData(x2, m_x2, iend); - -#define IF_QUAD_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_x1, x1, iend); \ - getHipDeviceData(m_x2, x2, iend); \ - deallocHipDeviceData(a); \ - deallocHipDeviceData(b); \ - deallocHipDeviceData(c); \ - deallocHipDeviceData(x1); \ - deallocHipDeviceData(x2); - -__global__ void ifquad(Real_ptr x1, Real_ptr x2, - Real_ptr a, Real_ptr b, Real_ptr c, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - IF_QUAD_BODY; - } -} - - -void IF_QUAD::runHipVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - IF_QUAD_DATA_SETUP; - - if ( vid == Base_HIP ) { - - IF_QUAD_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((ifquad), dim3(grid_size), dim3(block_size), 0, 0, x1, x2, a, b, c, - iend ); - - } - stopTimer(); - - IF_QUAD_DATA_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - IF_QUAD_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - IF_QUAD_BODY; - }); - - } - stopTimer(); - - 
IF_QUAD_DATA_TEARDOWN_HIP; - - } else { - std::cout << "\n IF_QUAD : Unknown Hip variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/INIT3-KokkosHip.cpp b/src/basic-kokkos/INIT3-KokkosHip.cpp deleted file mode 100644 index 9013a9c5a..000000000 --- a/src/basic-kokkos/INIT3-KokkosHip.cpp +++ /dev/null @@ -1,107 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT3.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - -#define INIT3_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(out1, m_out1, iend); \ - allocAndInitHipDeviceData(out2, m_out2, iend); \ - allocAndInitHipDeviceData(out3, m_out3, iend); \ - allocAndInitHipDeviceData(in1, m_in1, iend); \ - allocAndInitHipDeviceData(in2, m_in2, iend); - -#define INIT3_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_out1, out1, iend); \ - getHipDeviceData(m_out2, out2, iend); \ - getHipDeviceData(m_out3, out3, iend); \ - deallocHipDeviceData(out1); \ - deallocHipDeviceData(out2); \ - deallocHipDeviceData(out3); \ - deallocHipDeviceData(in1); \ - deallocHipDeviceData(in2); - -__global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, - Real_ptr in1, Real_ptr in2, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - INIT3_BODY; - } -} - - -void INIT3::runHipVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - 
const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - INIT3_DATA_SETUP; - - if ( vid == Base_HIP ) { - - INIT3_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((init3), dim3(grid_size), dim3(block_size), 0, 0, out1, out2, out3, in1, in2, - iend ); - - } - stopTimer(); - - INIT3_DATA_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - INIT3_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - INIT3_BODY; - }); - - } - stopTimer(); - - INIT3_DATA_TEARDOWN_HIP; - - } else { - std::cout << "\n INIT3 : Unknown Hip variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosHip.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosHip.cpp deleted file mode 100644 index d7ed15a33..000000000 --- a/src/basic-kokkos/INIT_VIEW1D-KokkosHip.cpp +++ /dev/null @@ -1,100 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT_VIEW1D.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - -#define INIT_VIEW1D_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(a, m_a, iend); - -#define INIT_VIEW1D_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_a, a, iend); \ - deallocHipDeviceData(a); - -__global__ void initview1d(Real_ptr a, - Real_type v, - const Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - INIT_VIEW1D_BODY; - } -} - - -void INIT_VIEW1D::runHipVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - INIT_VIEW1D_DATA_SETUP; - - if ( vid == Base_HIP ) { - - INIT_VIEW1D_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((initview1d), dim3(grid_size), dim3(block_size), 0, 0, a, - v, - iend ); - - } - stopTimer(); - - INIT_VIEW1D_DATA_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - INIT_VIEW1D_DATA_SETUP_HIP; - - INIT_VIEW1D_VIEW_RAJA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - INIT_VIEW1D_BODY_RAJA; - }); - - } - stopTimer(); - - INIT_VIEW1D_DATA_TEARDOWN_HIP; - - } else { - std::cout << "\n INIT_VIEW1D : Unknown Hip variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosHip.cpp 
b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosHip.cpp deleted file mode 100644 index e568d1a78..000000000 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosHip.cpp +++ /dev/null @@ -1,101 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT_VIEW1D_OFFSET.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - -#define INIT_VIEW1D_OFFSET_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(a, m_a, getRunSize()); - -#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_a, a, getRunSize()); \ - deallocHipDeviceData(a); - -__global__ void initview1d_offset(Real_ptr a, - Real_type v, - const Index_type ibegin, - const Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= ibegin && i < iend) { - INIT_VIEW1D_OFFSET_BODY; - } -} - - -void INIT_VIEW1D_OFFSET::runHipVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 1; - const Index_type iend = getRunSize()+1; - - INIT_VIEW1D_OFFSET_DATA_SETUP; - - if ( vid == Base_HIP ) { - - INIT_VIEW1D_OFFSET_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((initview1d_offset), dim3(grid_size), dim3(block_size), 0, 0, a, v, - ibegin, - iend ); - - } - stopTimer(); - - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - INIT_VIEW1D_OFFSET_DATA_SETUP_HIP; - 
- INIT_VIEW1D_OFFSET_VIEW_RAJA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - INIT_VIEW1D_OFFSET_BODY_RAJA; - }); - - } - stopTimer(); - - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_HIP; - - } else { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Hip variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/MULADDSUB-KokkosHip.cpp b/src/basic-kokkos/MULADDSUB-KokkosHip.cpp deleted file mode 100644 index f999752ee..000000000 --- a/src/basic-kokkos/MULADDSUB-KokkosHip.cpp +++ /dev/null @@ -1,107 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "MULADDSUB.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - -#define MULADDSUB_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(out1, m_out1, iend); \ - allocAndInitHipDeviceData(out2, m_out2, iend); \ - allocAndInitHipDeviceData(out3, m_out3, iend); \ - allocAndInitHipDeviceData(in1, m_in1, iend); \ - allocAndInitHipDeviceData(in2, m_in2, iend); - -#define MULADDSUB_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_out1, out1, iend); \ - getHipDeviceData(m_out2, out2, iend); \ - getHipDeviceData(m_out3, out3, iend); \ - deallocHipDeviceData(out1); \ - deallocHipDeviceData(out2); \ - deallocHipDeviceData(out3); \ - deallocHipDeviceData(in1); \ - 
deallocHipDeviceData(in2); - -__global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, - Real_ptr in1, Real_ptr in2, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - MULADDSUB_BODY; - } -} - - -void MULADDSUB::runHipVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - MULADDSUB_DATA_SETUP; - - if ( vid == Base_HIP ) { - - MULADDSUB_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((muladdsub), dim3(grid_size), dim3(block_size), 0, 0, out1, out2, out3, in1, in2, - iend ); - - } - stopTimer(); - - MULADDSUB_DATA_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - MULADDSUB_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - MULADDSUB_BODY; - }); - - } - stopTimer(); - - MULADDSUB_DATA_TEARDOWN_HIP; - - } else { - std::cout << "\n MULADDSUB : Unknown Hip variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/NESTED_INIT-KokkosHip.cpp b/src/basic-kokkos/NESTED_INIT-KokkosHip.cpp deleted file mode 100644 index 1ba5b6b17..000000000 --- a/src/basic-kokkos/NESTED_INIT-KokkosHip.cpp +++ /dev/null @@ -1,106 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "NESTED_INIT.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - -#define NESTED_INIT_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(array, m_array, m_array_length); - -#define NESTED_INIT_DATA_TEARDOWN_HIP \ - getHipDeviceData(m_array, array, m_array_length); \ - deallocHipDeviceData(array); - -__global__ void nested_init(Real_ptr array, - Index_type ni, Index_type nj) -{ - Index_type i = threadIdx.x; - Index_type j = blockIdx.y; - Index_type k = blockIdx.z; - - NESTED_INIT_BODY; -} - - -void NESTED_INIT::runHipVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - - NESTED_INIT_DATA_SETUP; - - if ( vid == Base_HIP ) { - - NESTED_INIT_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - dim3 nthreads_per_block(ni, 1, 1); - dim3 nblocks(1, nj, nk); - - hipLaunchKernelGGL((nested_init), dim3(nblocks), dim3(nthreads_per_block), 0, 0, array, - ni, nj); - - } - stopTimer(); - - NESTED_INIT_DATA_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - NESTED_INIT_DATA_SETUP_HIP; - - using EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::HipKernelAsync< - RAJA::statement::For<2, RAJA::hip_block_z_loop, // k - RAJA::statement::For<1, RAJA::hip_block_y_loop, // j - RAJA::statement::For<0, RAJA::hip_thread_x_loop, // i - RAJA::statement::Lambda<0> - > - > - > - > - >; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - [=] __device__ (Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - }); - - } - stopTimer(); - - NESTED_INIT_DATA_TEARDOWN_HIP; - - } else { - std::cout << "\n NESTED_INIT : Unknown Hip variant id 
= " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosHip.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosHip.cpp deleted file mode 100644 index 2646ff547..000000000 --- a/src/basic-kokkos/REDUCE3_INT-KokkosHip.cpp +++ /dev/null @@ -1,175 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "REDUCE3_INT.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - -#define REDUCE3_INT_DATA_SETUP_HIP \ - allocAndInitHipDeviceData(vec, m_vec, iend); - -#define REDUCE3_INT_DATA_TEARDOWN_HIP \ - deallocHipDeviceData(vec); - - -__global__ void reduce3int(Int_ptr vec, - Int_ptr vsum, Int_type vsum_init, - Int_ptr vmin, Int_type vmin_init, - Int_ptr vmax, Int_type vmax_init, - Index_type iend) -{ - HIP_DYNAMIC_SHARED( Int_type, psum) - Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; - Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; - - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - - psum[ threadIdx.x ] = vsum_init; - pmin[ threadIdx.x ] = vmin_init; - pmax[ threadIdx.x ] = vmax_init; - - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { - psum[ threadIdx.x ] += vec[ i ]; - pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); - pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); - } - __syncthreads(); - - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { - if ( threadIdx.x < i ) { - psum[ 
threadIdx.x ] += psum[ threadIdx.x + i ]; - pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); - pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], pmax[ threadIdx.x + i ] ); - } - __syncthreads(); - } - -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( vsum, psum[ 0 ] ); - RAJA::atomicMin( vmin, pmin[ 0 ] ); - RAJA::atomicMax( vmax, pmax[ 0 ] ); - } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *vsum += psum[ 0 ]; - *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); - *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); - } -#endif -} - - -void REDUCE3_INT::runHipVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - REDUCE3_INT_DATA_SETUP; - - if ( vid == Base_HIP ) { - - REDUCE3_INT_DATA_SETUP_HIP; - - Int_ptr vsum; - allocAndInitHipDeviceData(vsum, &m_vsum_init, 1); - Int_ptr vmin; - allocAndInitHipDeviceData(vmin, &m_vmin_init, 1); - Int_ptr vmax; - allocAndInitHipDeviceData(vmax, &m_vmax_init, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initHipDeviceData(vsum, &m_vsum_init, 1); - initHipDeviceData(vmin, &m_vmin_init, 1); - initHipDeviceData(vmax, &m_vmax_init, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((reduce3int), dim3(grid_size), dim3(block_size), 3*sizeof(Int_type)*block_size, 0, vec, - vsum, m_vsum_init, - vmin, m_vmin_init, - vmax, m_vmax_init, - iend ); - - Int_type lsum; - Int_ptr plsum = &lsum; - getHipDeviceData(plsum, vsum, 1); - m_vsum += lsum; - - Int_type lmin; - Int_ptr plmin = &lmin; - getHipDeviceData(plmin, vmin, 1); - m_vmin = RAJA_MIN(m_vmin, lmin); - - Int_type lmax; - Int_ptr plmax = &lmax; - getHipDeviceData(plmax, vmax, 1); - m_vmax = RAJA_MAX(m_vmax, lmax); - - } - stopTimer(); - - REDUCE3_INT_DATA_TEARDOWN_HIP; - - deallocHipDeviceData(vsum); - deallocHipDeviceData(vmin); - 
deallocHipDeviceData(vmax); - - } else if ( vid == RAJA_HIP ) { - - REDUCE3_INT_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); - - } - stopTimer(); - - REDUCE3_INT_DATA_TEARDOWN_HIP; - - } else { - std::cout << "\n REDUCE3_INT : Unknown Hip variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP diff --git a/src/basic-kokkos/TRAP_INT-KokkosHip.cpp b/src/basic-kokkos/TRAP_INT-KokkosHip.cpp deleted file mode 100644 index 346050c3a..000000000 --- a/src/basic-kokkos/TRAP_INT-KokkosHip.cpp +++ /dev/null @@ -1,157 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "TRAP_INT.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_HIP) - -#include "common/HipDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - -// -// Function used in TRAP_INT loop. 
-// -RAJA_INLINE -RAJA_DEVICE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - - - // - // Define thread block size for HIP execution - // - const size_t block_size = 256; - - -#define TRAP_INT_DATA_SETUP_HIP // nothing to do here... - -#define TRAP_INT_DATA_TEARDOWN_HIP // nothing to do here... - - -__global__ void trapint(Real_type x0, Real_type xp, - Real_type y, Real_type yp, - Real_type h, - Real_ptr sumx, - Index_type iend) -{ - HIP_DYNAMIC_SHARED( Real_type, psumx) - - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - - psumx[ threadIdx.x ] = 0.0; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { - Real_type x = x0 + i*h; - Real_type val = trap_int_func(x, y, xp, yp); - psumx[ threadIdx.x ] += val; - } - __syncthreads(); - - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { - if ( threadIdx.x < i ) { - psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; - } - __syncthreads(); - } - -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( sumx, psumx[ 0 ] ); - } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *sumx += psumx[ 0 ]; - } -#endif - -} - - -void TRAP_INT::runHipVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - TRAP_INT_DATA_SETUP; - - if ( vid == Base_HIP ) { - - TRAP_INT_DATA_SETUP_HIP; - - Real_ptr sumx; - allocAndInitHipDeviceData(sumx, &m_sumx_init, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initHipDeviceData(sumx, &m_sumx_init, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hipLaunchKernelGGL((trapint), dim3(grid_size), dim3(block_size), sizeof(Real_type)*block_size, 0, x0, xp, - y, yp, - h, - sumx, - iend); - - Real_type lsumx; - Real_ptr plsumx = &lsumx; - 
getHipDeviceData(plsumx, sumx, 1); - m_sumx += lsumx * h; - - } - stopTimer(); - - deallocHipDeviceData(sumx); - - TRAP_INT_DATA_TEARDOWN_HIP; - - } else if ( vid == RAJA_HIP ) { - - TRAP_INT_DATA_SETUP_HIP; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sumx(m_sumx_init); - - RAJA::forall< RAJA::hip_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - TRAP_INT_BODY; - }); - - m_sumx += static_cast(sumx.get()) * h; - - } - stopTimer(); - - TRAP_INT_DATA_TEARDOWN_HIP; - - } else { - std::cout << "\n TRAP_INT : Unknown Hip variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_HIP From c9cf2a7378dfcb83f3dc8c8aa994d4b3ebdf715c Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 17 Nov 2020 11:28:25 -0800 Subject: [PATCH 019/124] Rename misnamed files --- ...rget.cpp => ATOMIC_PI-KokkosOMPTarget.cpp} | 0 src/basic-kokkos/CMakeLists.txt | 49 ++++++++++--- src/basic-kokkos/CMakeOther.txt | 72 +++++++++++++++++++ ...MPTarget.cpp => DAXPY-KokkosOMPTarget.cpp} | 0 ...Target.cpp => IF_QUAD-KokkosOMPTarget.cpp} | 0 ...MPTarget.cpp => INIT3-KokkosOMPTarget.cpp} | 0 ...et.cpp => INIT_VIEW1D-KokkosOMPTarget.cpp} | 0 ...=> INIT_VIEW1D_OFFSET-KokkosOMPTarget.cpp} | 0 ...rget.cpp => MULADDSUB-KokkosOMPTarget.cpp} | 0 ...et.cpp => NESTED_INIT-KokkosOMPTarget.cpp} | 0 ...et.cpp => REDUCE3_INT-KokkosOMPTarget.cpp} | 0 ...arget.cpp => TRAP_INT-KokkosOMPTarget.cpp} | 0 12 files changed, 112 insertions(+), 9 deletions(-) rename src/basic-kokkos/{ATOMIC_PI-KokkosKokkosOMPTarget.cpp => ATOMIC_PI-KokkosOMPTarget.cpp} (100%) create mode 100644 src/basic-kokkos/CMakeOther.txt rename src/basic-kokkos/{DAXPY-KokkosKokkosOMPTarget.cpp => DAXPY-KokkosOMPTarget.cpp} (100%) rename src/basic-kokkos/{IF_QUAD-KokkosKokkosOMPTarget.cpp => IF_QUAD-KokkosOMPTarget.cpp} (100%) rename src/basic-kokkos/{INIT3-KokkosKokkosOMPTarget.cpp => 
INIT3-KokkosOMPTarget.cpp} (100%) rename src/basic-kokkos/{INIT_VIEW1D-KokkosKokkosOMPTarget.cpp => INIT_VIEW1D-KokkosOMPTarget.cpp} (100%) rename src/basic-kokkos/{INIT_VIEW1D_OFFSET-KokkosKokkosOMPTarget.cpp => INIT_VIEW1D_OFFSET-KokkosOMPTarget.cpp} (100%) rename src/basic-kokkos/{MULADDSUB-KokkosKokkosOMPTarget.cpp => MULADDSUB-KokkosOMPTarget.cpp} (100%) rename src/basic-kokkos/{NESTED_INIT-KokkosKokkosOMPTarget.cpp => NESTED_INIT-KokkosOMPTarget.cpp} (100%) rename src/basic-kokkos/{REDUCE3_INT-KokkosKokkosOMPTarget.cpp => REDUCE3_INT-KokkosOMPTarget.cpp} (100%) rename src/basic-kokkos/{TRAP_INT-KokkosKokkosOMPTarget.cpp => TRAP_INT-KokkosOMPTarget.cpp} (100%) diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosOMPTarget.cpp similarity index 100% rename from src/basic-kokkos/ATOMIC_PI-KokkosKokkosOMPTarget.cpp rename to src/basic-kokkos/ATOMIC_PI-KokkosOMPTarget.cpp diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index 566b9d8d2..842e15ff9 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -10,16 +10,47 @@ include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../basic) blt_add_library( NAME basic-kokkos - SOURCES + SOURCES + ATOMIC_PI-KokkosSeq.cpp + ATOMIC_PI-KokkosCuda.cpp + ATOMIC_PI-KokkosOMP.cpp + ATOMIC_PI-KokkosOMPTarget.cpp DAXPY-KokkosSeq.cpp - DAXPY-KokkosOMP.cpp - DAXPY-KokkosCuda.cpp - ATOMIC_PI-KokkosOMP.cpp - ATOMIC_PI-KokkosSeq.cpp - ATOMIC_PI-KokkosCuda.cpp - IF_QUAD-KokkosSeq.cpp - IF_QUAD-KokkosOMP.cpp - IF_QUAD-KokkosCuda.cpp + DAXPY-KokkosCuda.cpp + DAXPY-KokkosOMP.cpp + DAXPY-KokkosOMPTarget.cpp + IF_QUAD-KokkosSeq.cpp + IF_QUAD-KokkosCuda.cpp + IF_QUAD-KokkosOMP.cpp + IF_QUAD-KokkosOMPTarget.cpp + INIT3-KokkosSeq.cpp + INIT3-KokkosCuda.cpp + INIT3-KokkosOMP.cpp + INIT3-KokkosOMPTarget.cpp + INIT_VIEW1D-KokkosSeq.cpp + INIT_VIEW1D-KokkosCuda.cpp + INIT_VIEW1D-KokkosOMP.cpp + INIT_VIEW1D-KokkosOMPTarget.cpp + 
INIT_VIEW1D_KokkosOFFSET-Seq.cpp + INIT_VIEW1D_KokkosOFFSET-Cuda.cpp + INIT_VIEW1D_KokkosOFFSET-OMP.cpp + INIT_VIEW1D_KokkosOFFSET-OMPTarget.cpp + MULADDSUB-KokkosSeq.cpp + MULADDSUB-KokkosCuda.cpp + MULADDSUB-KokkosOMP.cpp + MULADDSUB-KokkosOMPTarget.cpp + NESTED_INIT-KokkosSeq.cpp + NESTED_INIT-KokkosCuda.cpp + NESTED_INIT-KokkosOMP.cpp + NESTED_INIT-KokkosOMPTarget.cpp + REDUCE3_INT-KokkosSeq.cpp + REDUCE3_INT-KokkosCuda.cpp + REDUCE3_INT-KokkosOMP.cpp + REDUCE3_INT-KokkosOMPTarget.cpp + TRAP_INT-KokkosSeq.cpp + TRAP_INT-KokkosCuda.cpp + TRAP_INT-KokkosOMPTarget.cpp + TRAP_INT-KokkosOMP.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/basic-kokkos/CMakeOther.txt b/src/basic-kokkos/CMakeOther.txt new file mode 100644 index 000000000..b4b886a91 --- /dev/null +++ b/src/basic-kokkos/CMakeOther.txt @@ -0,0 +1,72 @@ +############################################################################### +# Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +blt_add_library( + NAME basic + SOURCES ATOMIC_PI.cpp + ATOMIC_PI-Seq.cpp + ATOMIC_PI-Hip.cpp + ATOMIC_PI-Cuda.cpp + ATOMIC_PI-OMP.cpp + ATOMIC_PI-OMPTarget.cpp + DAXPY.cpp + DAXPY-Seq.cpp + DAXPY-Hip.cpp + DAXPY-Cuda.cpp + DAXPY-OMP.cpp + DAXPY-OMPTarget.cpp + IF_QUAD.cpp + IF_QUAD-Seq.cpp + IF_QUAD-Hip.cpp + IF_QUAD-Cuda.cpp + IF_QUAD-OMP.cpp + IF_QUAD-OMPTarget.cpp + INIT3.cpp + INIT3-Seq.cpp + INIT3-Hip.cpp + INIT3-Cuda.cpp + INIT3-OMP.cpp + INIT3-OMPTarget.cpp + INIT_VIEW1D.cpp + INIT_VIEW1D-Seq.cpp + INIT_VIEW1D-Hip.cpp + INIT_VIEW1D-Cuda.cpp + INIT_VIEW1D-OMP.cpp + INIT_VIEW1D-OMPTarget.cpp + INIT_VIEW1D_OFFSET.cpp + INIT_VIEW1D_OFFSET-Seq.cpp + INIT_VIEW1D_OFFSET-Hip.cpp + INIT_VIEW1D_OFFSET-Cuda.cpp + INIT_VIEW1D_OFFSET-OMP.cpp + INIT_VIEW1D_OFFSET-OMPTarget.cpp + MULADDSUB.cpp + MULADDSUB-Seq.cpp + MULADDSUB-Hip.cpp + MULADDSUB-Cuda.cpp + MULADDSUB-OMP.cpp + MULADDSUB-OMPTarget.cpp + NESTED_INIT.cpp + NESTED_INIT-Seq.cpp + NESTED_INIT-Hip.cpp + NESTED_INIT-Cuda.cpp + NESTED_INIT-OMP.cpp + NESTED_INIT-OMPTarget.cpp + REDUCE3_INT.cpp + REDUCE3_INT-Seq.cpp + REDUCE3_INT-Hip.cpp + REDUCE3_INT-Cuda.cpp + REDUCE3_INT-OMP.cpp + REDUCE3_INT-OMPTarget.cpp + TRAP_INT.cpp + TRAP_INT-Seq.cpp + TRAP_INT-Hip.cpp + TRAP_INT-Cuda.cpp + TRAP_INT-OMPTarget.cpp + TRAP_INT-OMP.cpp + DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} + ) diff --git a/src/basic-kokkos/DAXPY-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/DAXPY-KokkosOMPTarget.cpp similarity index 100% rename from src/basic-kokkos/DAXPY-KokkosKokkosOMPTarget.cpp rename to src/basic-kokkos/DAXPY-KokkosOMPTarget.cpp diff --git a/src/basic-kokkos/IF_QUAD-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/IF_QUAD-KokkosOMPTarget.cpp similarity index 100% rename from src/basic-kokkos/IF_QUAD-KokkosKokkosOMPTarget.cpp rename to src/basic-kokkos/IF_QUAD-KokkosOMPTarget.cpp diff --git 
a/src/basic-kokkos/INIT3-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/INIT3-KokkosOMPTarget.cpp similarity index 100% rename from src/basic-kokkos/INIT3-KokkosKokkosOMPTarget.cpp rename to src/basic-kokkos/INIT3-KokkosOMPTarget.cpp diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosOMPTarget.cpp similarity index 100% rename from src/basic-kokkos/INIT_VIEW1D-KokkosKokkosOMPTarget.cpp rename to src/basic-kokkos/INIT_VIEW1D-KokkosOMPTarget.cpp diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMPTarget.cpp similarity index 100% rename from src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosKokkosOMPTarget.cpp rename to src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMPTarget.cpp diff --git a/src/basic-kokkos/MULADDSUB-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/MULADDSUB-KokkosOMPTarget.cpp similarity index 100% rename from src/basic-kokkos/MULADDSUB-KokkosKokkosOMPTarget.cpp rename to src/basic-kokkos/MULADDSUB-KokkosOMPTarget.cpp diff --git a/src/basic-kokkos/NESTED_INIT-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/NESTED_INIT-KokkosOMPTarget.cpp similarity index 100% rename from src/basic-kokkos/NESTED_INIT-KokkosKokkosOMPTarget.cpp rename to src/basic-kokkos/NESTED_INIT-KokkosOMPTarget.cpp diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosOMPTarget.cpp similarity index 100% rename from src/basic-kokkos/REDUCE3_INT-KokkosKokkosOMPTarget.cpp rename to src/basic-kokkos/REDUCE3_INT-KokkosOMPTarget.cpp diff --git a/src/basic-kokkos/TRAP_INT-KokkosKokkosOMPTarget.cpp b/src/basic-kokkos/TRAP_INT-KokkosOMPTarget.cpp similarity index 100% rename from src/basic-kokkos/TRAP_INT-KokkosKokkosOMPTarget.cpp rename to src/basic-kokkos/TRAP_INT-KokkosOMPTarget.cpp From fddf90401b70ce21b05792f488b1858e701de50c Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 17 Nov 2020 12:01:34 -0800 Subject: [PATCH 020/124] 
Infrastructure fixes --- src/basic-kokkos/CMakeLists.txt | 8 +- src/basic-kokkos/IF_QUAD-KokkosCuda.cpp | 4 +- src/basic-kokkos/INIT3-KokkosCuda.cpp | 22 ++-- src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp | 24 ++-- .../INIT_VIEW1D_OFFSET-KokkosCuda.cpp | 26 ++--- src/basic-kokkos/MULADDSUB-KokkosCuda.cpp | 22 ++-- src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp | 22 ++-- src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp | 104 +++++++++--------- src/basic-kokkos/TRAP_INT-KokkosCuda.cpp | 84 +++++++------- src/basic/INIT3.hpp | 5 +- src/basic/INIT_VIEW1D.hpp | 5 +- src/basic/INIT_VIEW1D_OFFSET.hpp | 5 +- src/basic/MULADDSUB.hpp | 5 +- src/basic/NESTED_INIT.hpp | 5 +- src/basic/REDUCE3_INT.hpp | 5 +- src/basic/TRAP_INT.hpp | 5 +- src/common/RAJAPerfSuite.cpp | 2 - src/common/RAJAPerfSuite.hpp | 48 ++++---- 18 files changed, 210 insertions(+), 191 deletions(-) diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index 842e15ff9..f559af456 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -31,10 +31,10 @@ blt_add_library( INIT_VIEW1D-KokkosCuda.cpp INIT_VIEW1D-KokkosOMP.cpp INIT_VIEW1D-KokkosOMPTarget.cpp - INIT_VIEW1D_KokkosOFFSET-Seq.cpp - INIT_VIEW1D_KokkosOFFSET-Cuda.cpp - INIT_VIEW1D_KokkosOFFSET-OMP.cpp - INIT_VIEW1D_KokkosOFFSET-OMPTarget.cpp + INIT_VIEW1D_OFFSET-KokkosSeq.cpp + INIT_VIEW1D_OFFSET-KokkosCuda.cpp + INIT_VIEW1D_OFFSET-KokkosOMP.cpp + INIT_VIEW1D_OFFSET-KokkosOMPTarget.cpp MULADDSUB-KokkosSeq.cpp MULADDSUB-KokkosCuda.cpp MULADDSUB-KokkosOMP.cpp diff --git a/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp b/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp index 658797702..15709bfed 100644 --- a/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp +++ b/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp @@ -65,8 +65,8 @@ void IF_QUAD::runKokkosCudaVariant(VariantID vid) // QUESTION: Should "RAJA_DIVIDE_CEILING_INT be changed? 
const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - ifquad<<>>( x1, x2, a, b, c, - iend ); + //ifquad<<>>( x1, x2, a, b, c, + // iend ); } stopTimer(); diff --git a/src/basic-kokkos/INIT3-KokkosCuda.cpp b/src/basic-kokkos/INIT3-KokkosCuda.cpp index 14ee12f81..56c11f465 100644 --- a/src/basic-kokkos/INIT3-KokkosCuda.cpp +++ b/src/basic-kokkos/INIT3-KokkosCuda.cpp @@ -44,15 +44,15 @@ namespace basic deallocCudaDeviceData(in1); \ deallocCudaDeviceData(in2); -__global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, - Real_ptr in1, Real_ptr in2, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - INIT3_BODY; - } -} +//__global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, +// Real_ptr in1, Real_ptr in2, +// Index_type iend) +//{ +// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; +// if (i < iend) { +// INIT3_BODY; +// } +//} void INIT3::runKokkosCudaVariant(VariantID vid) @@ -71,8 +71,8 @@ void INIT3::runKokkosCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - init3<<>>( out1, out2, out3, in1, in2, - iend ); + //init3<<>>( out1, out2, out3, in1, in2, + // iend ); } stopTimer(); diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp index 0415fbb17..38a71bfcf 100644 --- a/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp +++ b/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp @@ -34,15 +34,15 @@ namespace basic getCudaDeviceData(m_a, a, getRunSize()); \ deallocCudaDeviceData(a); -__global__ void initview1d(Real_ptr a, - Real_type v, - const Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - INIT_VIEW1D_BODY; - } -} +//__global__ void initview1d(Real_ptr a, +// Real_type v, +// const Index_type iend) +//{ +// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; +// if (i < iend) { +// 
INIT_VIEW1D_BODY; +// } +//} void INIT_VIEW1D::runKokkosCudaVariant(VariantID vid) @@ -61,9 +61,9 @@ void INIT_VIEW1D::runKokkosCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - initview1d<<>>( a, - v, - iend ); + //initview1d<<>>( a, + // v, + // iend ); } stopTimer(); diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp index d71e67e9e..eb903a68b 100644 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp @@ -34,16 +34,16 @@ namespace basic getCudaDeviceData(m_a, a, getRunSize()); \ deallocCudaDeviceData(a); -__global__ void initview1d_offset(Real_ptr a, - Real_type v, - const Index_type ibegin, - const Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= ibegin && i < iend) { - INIT_VIEW1D_OFFSET_BODY; - } -} +//__global__ void initview1d_offset(Real_ptr a, +// Real_type v, +// const Index_type ibegin, +// const Index_type iend) +//{ +// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; +// if (i >= ibegin && i < iend) { +// INIT_VIEW1D_OFFSET_BODY; +// } +//} void INIT_VIEW1D_OFFSET::runKokkosCudaVariant(VariantID vid) @@ -62,9 +62,9 @@ void INIT_VIEW1D_OFFSET::runKokkosCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - initview1d_offset<<>>( a, v, - ibegin, - iend ); + //initview1d_offset<<>>( a, v, + // ibegin, + // iend ); } stopTimer(); diff --git a/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp b/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp index 40506073d..cb30f4969 100644 --- a/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp +++ b/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp @@ -44,15 +44,15 @@ namespace basic deallocCudaDeviceData(in1); \ deallocCudaDeviceData(in2); -__global__ void muladdsub(Real_ptr out1, 
Real_ptr out2, Real_ptr out3, - Real_ptr in1, Real_ptr in2, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - MULADDSUB_BODY; - } -} +//__global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, +// Real_ptr in1, Real_ptr in2, +// Index_type iend) +//{ +// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; +// if (i < iend) { +// MULADDSUB_BODY; +// } +//} void MULADDSUB::runKokkosCudaVariant(VariantID vid) @@ -71,8 +71,8 @@ void MULADDSUB::runKokkosCudaVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - muladdsub<<>>( out1, out2, out3, in1, in2, - iend ); + //muladdsub<<>>( out1, out2, out3, in1, in2, + // iend ); } stopTimer(); diff --git a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp index 0d1a9648e..b64b0b960 100644 --- a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp +++ b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp @@ -28,15 +28,15 @@ namespace basic getCudaDeviceData(m_array, array, m_array_length); \ deallocCudaDeviceData(array); -__global__ void nested_init(Real_ptr array, - Index_type ni, Index_type nj) -{ - Index_type i = threadIdx.x; - Index_type j = blockIdx.y; - Index_type k = blockIdx.z; - - NESTED_INIT_BODY; -} +//__global__ void nested_init(Real_ptr array, +// Index_type ni, Index_type nj) +//{ +// Index_type i = threadIdx.x; +// Index_type j = blockIdx.y; +// Index_type k = blockIdx.z; +// +// NESTED_INIT_BODY; +//} void NESTED_INIT::runKokkosCudaVariant(VariantID vid) @@ -55,8 +55,8 @@ void NESTED_INIT::runKokkosCudaVariant(VariantID vid) dim3 nthreads_per_block(ni, 1, 1); dim3 nblocks(1, nj, nk); - nested_init<<>>(array, - ni, nj); + //nested_init<<>>(array, + // ni, nj); } stopTimer(); diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp index f5b86c961..c1a32ff6f 100644 --- 
a/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp +++ b/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp @@ -34,52 +34,52 @@ namespace basic deallocCudaDeviceData(vec); -__global__ void reduce3int(Int_ptr vec, - Int_ptr vsum, Int_type vsum_init, - Int_ptr vmin, Int_type vmin_init, - Int_ptr vmax, Int_type vmax_init, - Index_type iend) -{ - extern __shared__ Int_type psum[ ]; - Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; - Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; - - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - - psum[ threadIdx.x ] = vsum_init; - pmin[ threadIdx.x ] = vmin_init; - pmax[ threadIdx.x ] = vmax_init; - - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { - psum[ threadIdx.x ] += vec[ i ]; - pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); - pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); - } - __syncthreads(); - - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { - if ( threadIdx.x < i ) { - psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; - pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); - pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], pmax[ threadIdx.x + i ] ); - } - __syncthreads(); - } - -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( vsum, psum[ 0 ] ); - RAJA::atomicMin( vmin, pmin[ 0 ] ); - RAJA::atomicMax( vmax, pmax[ 0 ] ); - } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *vsum += psum[ 0 ]; - *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); - *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); - } -#endif -} +//__global__ void reduce3int(Int_ptr vec, +// Int_ptr vsum, Int_type vsum_init, +// Int_ptr vmin, Int_type vmin_init, +// Int_ptr vmax, Int_type vmax_init, +// Index_type iend) +//{ +// extern __shared__ Int_type psum[ ]; +// Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; +// Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; +// +// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; +// +// 
psum[ threadIdx.x ] = vsum_init; +// pmin[ threadIdx.x ] = vmin_init; +// pmax[ threadIdx.x ] = vmax_init; +// +// for ( ; i < iend ; i += gridDim.x * blockDim.x ) { +// psum[ threadIdx.x ] += vec[ i ]; +// pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); +// pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); +// } +// __syncthreads(); +// +// for ( i = blockDim.x / 2; i > 0; i /= 2 ) { +// if ( threadIdx.x < i ) { +// psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; +// pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); +// pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], pmax[ threadIdx.x + i ] ); +// } +// __syncthreads(); +// } +// +//#if 1 // serialized access to shared data; +// if ( threadIdx.x == 0 ) { +// RAJA::atomicAdd( vsum, psum[ 0 ] ); +// RAJA::atomicMin( vmin, pmin[ 0 ] ); +// RAJA::atomicMax( vmax, pmax[ 0 ] ); +// } +//#else // this doesn't work due to data races +// if ( threadIdx.x == 0 ) { +// *vsum += psum[ 0 ]; +// *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); +// *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); +// } +//#endif +//} void REDUCE3_INT::runKokkosCudaVariant(VariantID vid) @@ -109,12 +109,12 @@ void REDUCE3_INT::runKokkosCudaVariant(VariantID vid) initCudaDeviceData(vmax, &m_vmax_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - reduce3int<<>>(vec, - vsum, m_vsum_init, - vmin, m_vmin_init, - vmax, m_vmax_init, - iend ); + //reduce3int<<>>(vec, + // vsum, m_vsum_init, + // vmin, m_vmin_init, + // vmax, m_vmax_init, + // iend ); Int_type lsum; Int_ptr plsum = &lsum; diff --git a/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp b/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp index ccd1c596d..e47d4c3c5 100644 --- a/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp +++ b/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp @@ -48,42 +48,42 @@ Real_type trap_int_func(Real_type x, #define TRAP_INT_DATA_TEARDOWN_CUDA // nothing to do here... 
-__global__ void trapint(Real_type x0, Real_type xp, - Real_type y, Real_type yp, - Real_type h, - Real_ptr sumx, - Index_type iend) -{ - extern __shared__ Real_type psumx[ ]; - - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - - psumx[ threadIdx.x ] = 0.0; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { - Real_type x = x0 + i*h; - Real_type val = trap_int_func(x, y, xp, yp); - psumx[ threadIdx.x ] += val; - } - __syncthreads(); - - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { - if ( threadIdx.x < i ) { - psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; - } - __syncthreads(); - } - -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::atomicAdd( sumx, psumx[ 0 ] ); - } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *sumx += psumx[ 0 ]; - } -#endif - -} +//__global__ void trapint(Real_type x0, Real_type xp, +// Real_type y, Real_type yp, +// Real_type h, +// Real_ptr sumx, +// Index_type iend) +//{ +// extern __shared__ Real_type psumx[ ]; +// +// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; +// +// psumx[ threadIdx.x ] = 0.0; +// for ( ; i < iend ; i += gridDim.x * blockDim.x ) { +// Real_type x = x0 + i*h; +// Real_type val = trap_int_func(x, y, xp, yp); +// psumx[ threadIdx.x ] += val; +// } +// __syncthreads(); +// +// for ( i = blockDim.x / 2; i > 0; i /= 2 ) { +// if ( threadIdx.x < i ) { +// psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; +// } +// __syncthreads(); +// } +// +//#if 1 // serialized access to shared data; +// if ( threadIdx.x == 0 ) { +// RAJA::atomicAdd( sumx, psumx[ 0 ] ); +// } +//#else // this doesn't work due to data races +// if ( threadIdx.x == 0 ) { +// *sumx += psumx[ 0 ]; +// } +//#endif +// +//} void TRAP_INT::runKokkosCudaVariant(VariantID vid) @@ -107,12 +107,12 @@ void TRAP_INT::runKokkosCudaVariant(VariantID vid) initCudaDeviceData(sumx, &m_sumx_init, 1); const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - trapint<<>>(x0, xp, - y, yp, 
- h, - sumx, - iend); + //trapint<<>>(x0, xp, + // y, yp, + // h, + // sumx, + // iend); Real_type lsumx; Real_ptr plsumx = &lsumx; diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 1a0216344..d1380c20b 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -55,7 +55,10 @@ class INIT3 : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void runKokkosOpenMPTargetVariant(VariantID vid); private: Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index 77a9eebe6..b1807a168 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -66,7 +66,10 @@ class INIT_VIEW1D : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void runKokkosOpenMPTargetVariant(VariantID vid); private: Real_ptr m_a; Real_type m_val; diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index b89a00fe4..421f03b10 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -65,7 +65,10 @@ class INIT_VIEW1D_OFFSET : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void runKokkosOpenMPTargetVariant(VariantID vid); private: Real_ptr m_a; Real_type m_val; diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index 48a664f63..a280d1b8f 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -58,7 
+58,10 @@ class MULADDSUB : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void runKokkosOpenMPTargetVariant(VariantID vid); private: Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index f8cf6066e..20232cc99 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -58,7 +58,10 @@ class NESTED_INIT : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void runKokkosOpenMPTargetVariant(VariantID vid); private: Index_type m_array_length; diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index a54778512..c119f727a 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -70,7 +70,10 @@ class REDUCE3_INT : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void runKokkosOpenMPTargetVariant(VariantID vid); private: Int_ptr m_vec; Int_type m_vsum; diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 72f3148d5..1240f5e2b 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -67,7 +67,10 @@ class TRAP_INT : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void 
runKokkosOpenMPTargetVariant(VariantID vid); private: Real_type m_x0; Real_type m_xp; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 0df32a9f2..317df335c 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -394,7 +394,6 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::IF_QUAD(run_params); break; } - /** case Basic_INIT3 : { kernel = new basic::INIT3(run_params); break; @@ -423,7 +422,6 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::TRAP_INT(run_params); break; } - */ /** DZP: big comment block for unimplemented // // Lcals kernels... diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 00036a681..6f21a9aec 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -116,34 +116,34 @@ enum KernelID { // // Lcals kernels... // - Lcals_DIFF_PREDICT, - Lcals_EOS, - Lcals_FIRST_DIFF, - Lcals_FIRST_MIN, - Lcals_FIRST_SUM, - Lcals_GEN_LIN_RECUR, - Lcals_HYDRO_1D, - Lcals_HYDRO_2D, - Lcals_INT_PREDICT, - Lcals_PLANCKIAN, - Lcals_TRIDIAG_ELIM, +// Lcals_DIFF_PREDICT, +// Lcals_EOS, +// Lcals_FIRST_DIFF, +// Lcals_FIRST_MIN, +// Lcals_FIRST_SUM, +// Lcals_GEN_LIN_RECUR, +// Lcals_HYDRO_1D, +// Lcals_HYDRO_2D, +// Lcals_INT_PREDICT, +// Lcals_PLANCKIAN, +// Lcals_TRIDIAG_ELIM, // // Polybench kernels... // - Polybench_2MM, - Polybench_3MM, - Polybench_ADI, - Polybench_ATAX, - Polybench_FDTD_2D, - Polybench_FLOYD_WARSHALL, - Polybench_GEMM, - Polybench_GEMVER, - Polybench_GESUMMV, - Polybench_HEAT_3D, - Polybench_JACOBI_1D, - Polybench_JACOBI_2D, - Polybench_MVT, +// Polybench_2MM, +// Polybench_3MM, +// Polybench_ADI, +// Polybench_ATAX, +// Polybench_FDTD_2D, +// Polybench_FLOYD_WARSHALL, +// Polybench_GEMM, +// Polybench_GEMVER, +// Polybench_GESUMMV, +// Polybench_HEAT_3D, +// Polybench_JACOBI_1D, +// Polybench_JACOBI_2D, +// Polybench_MVT, // // Stream kernels... 
From 7a378062eccc430e961ff804d202d24c5eef1342 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 17 Nov 2020 14:46:16 -0800 Subject: [PATCH 021/124] Small changes to enable correct output --- src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp | 4 +-- src/basic-kokkos/IF_QUAD-KokkosCuda.cpp | 4 +-- src/common/RAJAPerfSuite.cpp | 37 ++++++++++++++++------- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp index 63a77bfc6..fd73ccead 100644 --- a/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp +++ b/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp @@ -10,7 +10,7 @@ #include "RAJA/RAJA.hpp" -//#if defined(RAJA_ENABLE_CUDA) +#if defined(RAJA_ENABLE_CUDA) #include "common/CudaDataUtils.hpp" @@ -122,4 +122,4 @@ void ATOMIC_PI::runKokkosCudaVariant(VariantID vid) } // end namespace rajaperf #endif // RUN_KOKKOS -//#endif // RAJA_ENABLE_CUDA +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp b/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp index 15709bfed..88e792ef0 100644 --- a/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp +++ b/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp @@ -10,7 +10,7 @@ #include "RAJA/RAJA.hpp" -//#if defined(RAJA_ENABLE_CUDA) +#if defined(RAJA_ENABLE_CUDA) #include "common/CudaDataUtils.hpp" @@ -117,4 +117,4 @@ void IF_QUAD::runKokkosCudaVariant(VariantID vid) } // end namespace basic } // end namespace rajaperf -//#endif // RAJA_ENABLE_CUDA +#endif // RAJA_ENABLE_CUDA diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 317df335c..603fd006b 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -227,28 +227,18 @@ static const std::string VariantNames [] = std::string("Base_HIP"), std::string("RAJA_HIP"), -#if defined(RUN_KOKKOS) -#if defined(RUN_RAJA_SEQ) std::string("Kokkos_Lambda_Seq"), std::string("Kokkos_Functor_Seq"), -#endif -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) 
std::string("Kokkos_Lambda_OpenMP"), std::string("Kokkos_Functor_OpenMP"), -#endif -#if defined(RAJA_ENABLE_TARGET_OPENMP) + std::string("Kokkos_Lambda_OpenMPTarget"), std::string("Kokkos_Functor_OpenMPTarget"), -#endif -#if defined(RAJA_ENABLE_CUDA) std::string("Kokkos_Lambda_CUDA"), std::string("Kokkos_Functor_CUDA"), -#endif - -#endif // RUN_KOKKOS std::string("Unknown Variant") // Keep this at the end and DO NOT remove.... @@ -328,6 +318,13 @@ bool isVariantAvailable(VariantID vid) vid == RAJA_Seq ) { ret_val = true; } +#if defined(RUN_KOKKOS) + if ( vid == Kokkos_Lambda_Seq || + vid == Kokkos_Functor_Seq ) { + ret_val = true; + } +#endif // RUN_KOKKOS + #endif #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) @@ -336,6 +333,12 @@ bool isVariantAvailable(VariantID vid) vid == RAJA_OpenMP ) { ret_val = true; } +#if defined(RUN_KOKKOS) + if ( vid == Kokkos_Lambda_OpenMP || + vid == Kokkos_Functor_OpenMP ) { + ret_val = true; + } +#endif // RUN_KOKKOS #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) @@ -343,6 +346,12 @@ bool isVariantAvailable(VariantID vid) vid == RAJA_OpenMPTarget ) { ret_val = true; } +#if defined(RUN_KOKKOS) + if ( vid == Kokkos_Lambda_OpenMPTarget || + vid == Kokkos_Functor_OpenMPTarget ) { + ret_val = true; + } +#endif // RUN_KOKKOS #endif #if defined(RAJA_ENABLE_CUDA) @@ -350,6 +359,12 @@ bool isVariantAvailable(VariantID vid) vid == RAJA_CUDA ) { ret_val = true; } +#if defined(RUN_KOKKOS) + if ( vid == Kokkos_Lambda_CUDA || + vid == Kokkos_Functor_CUDA ) { + ret_val = true; + } +#endif // RUN_KOKKOS #endif #if defined(RAJA_ENABLE_HIP) From 7aa5d8d5f37137d024553023ccebce3110bee585 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Fri, 20 Nov 2020 11:28:39 -0800 Subject: [PATCH 022/124] first implementation of a "kokkos-mechanics" test in RajaPerfSuite --- src/CMakeLists.txt | 2 + src/common/RAJAPerfSuite.cpp | 104 ++++++++++-------- src/common/RAJAPerfSuite.hpp | 3 + src/kokkos-mechanics/CMakeLists.txt | 16 +++ 
.../ViewAllocate-KokkosCuda.cpp | 72 ++++++++++++ .../ViewAllocate-KokkosSeq.cpp | 76 +++++++++++++ src/kokkos-mechanics/ViewAllocate-Stubs.cpp | 30 +++++ src/kokkos-mechanics/ViewAllocate.cpp | 52 +++++++++ src/kokkos-mechanics/ViewAllocate.hpp | 70 ++++++++++++ 9 files changed, 378 insertions(+), 47 deletions(-) create mode 100644 src/kokkos-mechanics/CMakeLists.txt create mode 100644 src/kokkos-mechanics/ViewAllocate-KokkosCuda.cpp create mode 100644 src/kokkos-mechanics/ViewAllocate-KokkosSeq.cpp create mode 100644 src/kokkos-mechanics/ViewAllocate-Stubs.cpp create mode 100644 src/kokkos-mechanics/ViewAllocate.cpp create mode 100644 src/kokkos-mechanics/ViewAllocate.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 75c5e646c..1bae44838 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -12,6 +12,7 @@ add_subdirectory(common) #add_subdirectory(apps) add_subdirectory(basic) add_subdirectory(basic-kokkos) +add_subdirectory(kokkos-mechanics) #add_subdirectory(lcals) #add_subdirectory(polybench) #add_subdirectory(stream) @@ -21,6 +22,7 @@ set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS #apps basic basic-kokkos + kokkos-mechanics #lcals #polybench #stream diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 603fd006b..8c692abf1 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -77,6 +77,8 @@ #include "apps/PRESSURE.hpp" #include "apps/VOL3D.hpp" +#include "kokkos-mechanics/ViewAllocate.hpp" + #include @@ -139,56 +141,58 @@ static const std::string KernelNames [] = // // Lcals kernels... 
+//// +// std::string("Lcals_DIFF_PREDICT"), +// std::string("Lcals_EOS"), +// std::string("Lcals_FIRST_DIFF"), +// std::string("Lcals_FIRST_MIN"), +// std::string("Lcals_FIRST_SUM"), +// std::string("Lcals_GEN_LIN_RECUR"), +// std::string("Lcals_HYDRO_1D"), +// std::string("Lcals_HYDRO_2D"), +// std::string("Lcals_INT_PREDICT"), +// std::string("Lcals_PLANCKIAN"), +// std::string("Lcals_TRIDIAG_ELIM"), // - std::string("Lcals_DIFF_PREDICT"), - std::string("Lcals_EOS"), - std::string("Lcals_FIRST_DIFF"), - std::string("Lcals_FIRST_MIN"), - std::string("Lcals_FIRST_SUM"), - std::string("Lcals_GEN_LIN_RECUR"), - std::string("Lcals_HYDRO_1D"), - std::string("Lcals_HYDRO_2D"), - std::string("Lcals_INT_PREDICT"), - std::string("Lcals_PLANCKIAN"), - std::string("Lcals_TRIDIAG_ELIM"), - -// -// Polybench kernels... -// - std::string("Polybench_2MM"), - std::string("Polybench_3MM"), - std::string("Polybench_ADI"), - std::string("Polybench_ATAX"), - std::string("Polybench_FDTD_2D"), - std::string("Polybench_FLOYD_WARSHALL"), - std::string("Polybench_GEMM"), - std::string("Polybench_GEMVER"), - std::string("Polybench_GESUMMV"), - std::string("Polybench_HEAT_3D"), - std::string("Polybench_JACOBI_1D"), - std::string("Polybench_JACOBI_2D"), - std::string("Polybench_MVT"), - +//// +//// Polybench kernels... +//// +// std::string("Polybench_2MM"), +// std::string("Polybench_3MM"), +// std::string("Polybench_ADI"), +// std::string("Polybench_ATAX"), +// std::string("Polybench_FDTD_2D"), +// std::string("Polybench_FLOYD_WARSHALL"), +// std::string("Polybench_GEMM"), +// std::string("Polybench_GEMVER"), +// std::string("Polybench_GESUMMV"), +// std::string("Polybench_HEAT_3D"), +// std::string("Polybench_JACOBI_1D"), +// std::string("Polybench_JACOBI_2D"), +// std::string("Polybench_MVT"), // -// Stream kernels... +//// +//// Stream kernels... 
+//// +// std::string("Stream_ADD"), +// std::string("Stream_COPY"), +// std::string("Stream_DOT"), +// std::string("Stream_MUL"), +// std::string("Stream_TRIAD"), // - std::string("Stream_ADD"), - std::string("Stream_COPY"), - std::string("Stream_DOT"), - std::string("Stream_MUL"), - std::string("Stream_TRIAD"), - -// -// Apps kernels... -// - std::string("Apps_COUPLE"), - std::string("Apps_DEL_DOT_VEC_2D"), - std::string("Apps_ENERGY"), - std::string("Apps_FIR"), - std::string("Apps_LTIMES"), - std::string("Apps_LTIMES_NOVIEW"), - std::string("Apps_PRESSURE"), - std::string("Apps_VOL3D"), +//// +//// Apps kernels... +//// +// std::string("Apps_COUPLE"), +// std::string("Apps_DEL_DOT_VEC_2D"), +// std::string("Apps_ENERGY"), +// std::string("Apps_FIR"), +// std::string("Apps_LTIMES"), +// std::string("Apps_LTIMES_NOVIEW"), +// std::string("Apps_PRESSURE"), +// std::string("Apps_VOL3D"), + + std::string("KokkosMechanics_ViewAllocate"), std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... @@ -602,6 +606,12 @@ KernelBase* getKernelObject(KernelID kid, break; } */ + + case KokkosMechanics_ViewAllocate : { + kernel = new kokkos_mechanics::ViewAllocate(run_params); + break; + } + default: { std::cout << "\n Unknown Kernel ID = " << kid << std::endl; } diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 6f21a9aec..5cd2ffdb9 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -166,6 +166,9 @@ enum KernelID { //Apps_PRESSURE, //Apps_VOL3D, + // Kokkos Mechanics Tests + KokkosMechanics_ViewAllocate, + NumKernels // Keep this one last and NEVER comment out (!!) 
}; diff --git a/src/kokkos-mechanics/CMakeLists.txt b/src/kokkos-mechanics/CMakeLists.txt new file mode 100644 index 000000000..2368c0db7 --- /dev/null +++ b/src/kokkos-mechanics/CMakeLists.txt @@ -0,0 +1,16 @@ +############################################################################### +# Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +blt_add_library( + NAME kokkos-mechanics + SOURCES ViewAllocate.cpp + ViewAllocate-Stubs.cpp + ViewAllocate-KokkosSeq.cpp + ViewAllocate-KokkosCuda.cpp + DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} + ) diff --git a/src/kokkos-mechanics/ViewAllocate-KokkosCuda.cpp b/src/kokkos-mechanics/ViewAllocate-KokkosCuda.cpp new file mode 100644 index 000000000..1575b8dc3 --- /dev/null +++ b/src/kokkos-mechanics/ViewAllocate-KokkosCuda.cpp @@ -0,0 +1,72 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ViewAllocate.hpp" + +#include "RAJA/RAJA.hpp" +#if defined (RAJA_ENABLE_CUDA) + +#include + +namespace rajaperf +{ +namespace kokkos_mechanics +{ + + +// Timed loop: constructs one Kokkos::View of getRunSize() elements per repetition + +void ViewAllocate::runKokkosCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type data_size = getRunSize(); + + + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + // AJP added (following DAXPY example) -- + +//#if defined(RUN_KOKKOS) +//#if defined(RUN_OPENMP) + + + + case Kokkos_Lambda_CUDA : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Test Device case / GPU + Kokkos::View + kk_view("kk_view", data_size); + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n ViewAllocate : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + + + +} + +} // end namespace kokkos_mechanics +} // end namespace rajaperf +#endif // RAJA_ENABLE_CUDA diff --git a/src/kokkos-mechanics/ViewAllocate-KokkosSeq.cpp b/src/kokkos-mechanics/ViewAllocate-KokkosSeq.cpp new file mode 100644 index 000000000..4795c58d1 --- /dev/null +++ b/src/kokkos-mechanics/ViewAllocate-KokkosSeq.cpp @@ -0,0 +1,76 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ViewAllocate.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace kokkos_mechanics +{ + + +// Timed loop: constructs one Kokkos::View of getRunSize() elements per repetition +//void ViewAllocate::runSeqVariant(VariantID vid) + +void ViewAllocate::runKokkosSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type data_size = getRunSize(); + + + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + // AJP added (following DAXPY example) -- + +//#if defined(RUN_KOKKOS) +//#if defined(RUN_OPENMP) + + +#if defined(RUN_RAJA_SEQ) + + case Kokkos_Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +/* RAJA::forall( + RAJA::RangeSegment(ibegin, iend), ifquad_lam); +*/ + // Test host case / CPU + Kokkos::View + kk_view("kk_view", data_size); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n ViewAllocate : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + + + +} + +} // end namespace kokkos_mechanics +} // end namespace rajaperf diff --git a/src/kokkos-mechanics/ViewAllocate-Stubs.cpp b/src/kokkos-mechanics/ViewAllocate-Stubs.cpp new file mode 100644 index 000000000..dd4c9325f --- /dev/null +++ b/src/kokkos-mechanics/ViewAllocate-Stubs.cpp @@ -0,0 +1,30 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ViewAllocate.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace kokkos_mechanics +{ + +void ViewAllocate::runSeqVariant(VariantID vid) +{ +} // empty stub, like the other non-Kokkos variants in this file + +void ViewAllocate::runOpenMPVariant(VariantID vid) {} +void ViewAllocate::runCudaVariant(VariantID vid) {} +void ViewAllocate::runHipVariant(VariantID vid) {} +void ViewAllocate::runOpenMPTargetVariant(VariantID vid){} + +} // end namespace kokkos_mechanics +} // end namespace rajaperf diff --git a/src/kokkos-mechanics/ViewAllocate.cpp b/src/kokkos-mechanics/ViewAllocate.cpp new file mode 100644 index 000000000..b9013a882 --- /dev/null +++ b/src/kokkos-mechanics/ViewAllocate.cpp @@ -0,0 +1,52 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ViewAllocate.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace kokkos_mechanics +{ + +// Syntax for C++ constructor +ViewAllocate::ViewAllocate(const RunParams& params) + : KernelBase(rajaperf::KokkosMechanics_ViewAllocate, params) +{ + setDefaultSize(100000); + setDefaultReps(5000); + + setVariantDefined( Kokkos_Lambda_Seq); + setVariantDefined( Kokkos_Lambda_OpenMP); + setVariantDefined( Kokkos_Lambda_OpenMPTarget); + setVariantDefined( Kokkos_Lambda_CUDA); +} +//Defining the destructor (for the struct) +ViewAllocate::~ViewAllocate() +{ +} + +void ViewAllocate::setUp(VariantID vid) +{ +} + +void ViewAllocate::updateChecksum(VariantID vid) +{ +// checksum[vid] += calcChecksum(m_y, getRunSize()); +} + +void ViewAllocate::tearDown(VariantID vid) +{ + (void) vid; +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/kokkos-mechanics/ViewAllocate.hpp b/src/kokkos-mechanics/ViewAllocate.hpp new file mode 100644 index 000000000..01158b33e --- /dev/null +++ b/src/kokkos-mechanics/ViewAllocate.hpp @@ -0,0 +1,70 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// ViewAllocate kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// y[i] += a * x[i] ; +/// } +/// + +#ifndef RAJAPerf_Basic_ViewAllocate_HPP +#define RAJAPerf_Basic_ViewAllocate_HPP + +#define ViewAllocate_DATA_SETUP \ + Real_ptr x = m_x; \ + Real_ptr y = m_y; \ + Real_type a = m_a; + +#define ViewAllocate_FUNCTOR_CONSTRUCT \ + x(m_x),\ + y(m_y), \ + a(m_a) + +#define ViewAllocate_BODY \ + y[i] += a * x[i] ; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace kokkos_mechanics +{ + +class ViewAllocate : public KernelBase +{ +public: + + ViewAllocate(const RunParams& params); + + ~ViewAllocate(); + + void setUp(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runSeqVariant(VariantID vid); + void runOpenMPVariant(VariantID vid); + void runCudaVariant(VariantID vid); + void runHipVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void runKokkosOpenMPTargetVariant(VariantID vid); +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard From 6858ba3742980dfc5b3e71e440217d9dda04cbbb Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Fri, 20 Nov 2020 14:45:46 -0800 Subject: [PATCH 023/124] Added stream-add with Kokkos Views --- src/common/Executor.cpp | 6 +- src/common/RAJAPerfSuite.cpp | 6 ++ src/common/RAJAPerfSuite.hpp | 1 + src/kokkos-mechanics/CMakeLists.txt | 4 + src/kokkos-mechanics/ViewAllocate.hpp | 14 ---- .../ViewStreamAdd-KokkosCuda.cpp | 82 +++++++++++++++++++ .../ViewStreamAdd-KokkosSeq.cpp | 77 +++++++++++++++++ src/kokkos-mechanics/ViewStreamAdd-Stubs.cpp | 30 +++++++ 
src/kokkos-mechanics/ViewStreamAdd.cpp | 57 +++++++++++++ src/kokkos-mechanics/ViewStreamAdd.hpp | 75 +++++++++++++++++ 10 files changed, 335 insertions(+), 17 deletions(-) create mode 100644 src/kokkos-mechanics/ViewStreamAdd-KokkosCuda.cpp create mode 100644 src/kokkos-mechanics/ViewStreamAdd-KokkosSeq.cpp create mode 100644 src/kokkos-mechanics/ViewStreamAdd-Stubs.cpp create mode 100644 src/kokkos-mechanics/ViewStreamAdd.cpp create mode 100644 src/kokkos-mechanics/ViewStreamAdd.hpp diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 4aaa46241..3ce07de11 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -44,6 +44,9 @@ Executor::~Executor() for (size_t ik = 0; ik < kernels.size(); ++ik) { delete kernels[ik]; } +#if defined(RUN_KOKKOS) + Kokkos::finalize(); // TODO DZP: should this be here? +#endif } @@ -453,9 +456,6 @@ void Executor::outputRunData() filename = out_fprefix + "-fom.csv"; writeFOMReport(filename); -#if defined(RUN_KOKKOS) - Kokkos::finalize(); // TODO DZP: should this be here? -#endif } diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 8c692abf1..cfa043226 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -78,6 +78,7 @@ #include "apps/VOL3D.hpp" #include "kokkos-mechanics/ViewAllocate.hpp" +#include "kokkos-mechanics/ViewStreamAdd.hpp" #include @@ -193,6 +194,7 @@ static const std::string KernelNames [] = // std::string("Apps_VOL3D"), std::string("KokkosMechanics_ViewAllocate"), + std::string("KokkosMechanics_ViewStreamAdd"), std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... 
@@ -612,6 +614,10 @@ KernelBase* getKernelObject(KernelID kid, break; } + case KokkosMechanics_ViewStreamAdd: { + kernel = new kokkos_mechanics::ViewStreamAdd(run_params); + break; + } default: { std::cout << "\n Unknown Kernel ID = " << kid << std::endl; } diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 5cd2ffdb9..44042d33c 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -168,6 +168,7 @@ enum KernelID { // Kokkos Mechanics Tests KokkosMechanics_ViewAllocate, + KokkosMechanics_ViewStreamAdd, NumKernels // Keep this one last and NEVER comment out (!!) diff --git a/src/kokkos-mechanics/CMakeLists.txt b/src/kokkos-mechanics/CMakeLists.txt index 2368c0db7..4cb068473 100644 --- a/src/kokkos-mechanics/CMakeLists.txt +++ b/src/kokkos-mechanics/CMakeLists.txt @@ -12,5 +12,9 @@ blt_add_library( ViewAllocate-Stubs.cpp ViewAllocate-KokkosSeq.cpp ViewAllocate-KokkosCuda.cpp + ViewStreamAdd.cpp + ViewStreamAdd-Stubs.cpp + ViewStreamAdd-KokkosSeq.cpp + ViewStreamAdd-KokkosCuda.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/kokkos-mechanics/ViewAllocate.hpp b/src/kokkos-mechanics/ViewAllocate.hpp index 01158b33e..3e8d45ff7 100644 --- a/src/kokkos-mechanics/ViewAllocate.hpp +++ b/src/kokkos-mechanics/ViewAllocate.hpp @@ -17,20 +17,6 @@ #ifndef RAJAPerf_Basic_ViewAllocate_HPP #define RAJAPerf_Basic_ViewAllocate_HPP -#define ViewAllocate_DATA_SETUP \ - Real_ptr x = m_x; \ - Real_ptr y = m_y; \ - Real_type a = m_a; - -#define ViewAllocate_FUNCTOR_CONSTRUCT \ - x(m_x),\ - y(m_y), \ - a(m_a) - -#define ViewAllocate_BODY \ - y[i] += a * x[i] ; - - #include "common/KernelBase.hpp" namespace rajaperf diff --git a/src/kokkos-mechanics/ViewStreamAdd-KokkosCuda.cpp b/src/kokkos-mechanics/ViewStreamAdd-KokkosCuda.cpp new file mode 100644 index 000000000..d3ad7ea5d --- /dev/null +++ b/src/kokkos-mechanics/ViewStreamAdd-KokkosCuda.cpp @@ -0,0 +1,82 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ViewStreamAdd.hpp" + +#include "RAJA/RAJA.hpp" +#if defined (RAJA_ENABLE_CUDA) + +#include + +namespace rajaperf +{ +namespace kokkos_mechanics +{ + + +// Kokkos-ify here + +void ViewStreamAdd::runKokkosCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type data_size = getRunSize(); + + + +#if defined(RUN_KOKKOS) + + + Kokkos::View d_a("device_a",getRunSize()); + Kokkos::View d_b("device_b",getRunSize()); + Kokkos::View d_c("device_c",getRunSize()); + + Kokkos::deep_copy(d_a,h_a); + Kokkos::deep_copy(d_b,h_b); + Kokkos::deep_copy(d_c,h_c); + + switch ( vid ) { + + // AJP added (following DAXPY example) -- + +//#if defined(RUN_KOKKOS) +//#if defined(RUN_OPENMP) + + + + case Kokkos_Lambda_CUDA : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Test Device case / GPU + Kokkos::parallel_for("perfsuite.kokkos_mechanics.view_stream_add.cuda.lambda",Kokkos::RangePolicy(0,data_size), [=] __device__ (int i) { + d_c[i] = d_a[i] + d_b[i]; + }); + + } + stopTimer(); + + break; + } + + default : { + std::cout << "\n ViewStreamAdd : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + + + +} + +} // end namespace basic +} // end namespace rajaperf +#endif // RAJA_ENABLE_CUDA diff --git a/src/kokkos-mechanics/ViewStreamAdd-KokkosSeq.cpp b/src/kokkos-mechanics/ViewStreamAdd-KokkosSeq.cpp new file mode 100644 index 000000000..029486f73 --- /dev/null +++ b/src/kokkos-mechanics/ViewStreamAdd-KokkosSeq.cpp @@ -0,0 +1,77 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ViewStreamAdd.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace kokkos_mechanics +{ + + +// Kokkos-ify here +//void ViewStreamAdd::runSeqVariant(VariantID vid) + +void ViewStreamAdd::runKokkosSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type data_size = getRunSize(); + + + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + // AJP added (following DAXPY example) -- + +//#if defined(RUN_KOKKOS) +//#if defined(RUN_OPENMP) + + +#if defined(RUN_RAJA_SEQ) + + case Kokkos_Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + +/* RAJA::forall( + RAJA::RangeSegment(ibegin, iend), ifquad_lam); +*/ + // Test host case / CPU + Kokkos::parallel_for("perfsuite.kokkos_mechanics.view_stream_add.seq.lambda",Kokkos::RangePolicy(0,data_size), [=](int i) { + h_c[i] = h_a[i] + h_b[i]; + }); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n ViewStreamAdd : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + + + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/kokkos-mechanics/ViewStreamAdd-Stubs.cpp b/src/kokkos-mechanics/ViewStreamAdd-Stubs.cpp new file mode 100644 index 000000000..c43d14ad4 --- /dev/null +++ b/src/kokkos-mechanics/ViewStreamAdd-Stubs.cpp @@ -0,0 +1,30 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. 
+// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ViewStreamAdd.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace kokkos_mechanics +{ + +void ViewStreamAdd::runSeqVariant(VariantID vid) +{ +} + +void ViewStreamAdd::runOpenMPVariant(VariantID vid) {} +void ViewStreamAdd::runCudaVariant(VariantID vid) {} +void ViewStreamAdd::runHipVariant(VariantID vid) {} +void ViewStreamAdd::runOpenMPTargetVariant(VariantID vid){} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/kokkos-mechanics/ViewStreamAdd.cpp b/src/kokkos-mechanics/ViewStreamAdd.cpp new file mode 100644 index 000000000..c48a1c4fc --- /dev/null +++ b/src/kokkos-mechanics/ViewStreamAdd.cpp @@ -0,0 +1,57 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ViewStreamAdd.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +namespace rajaperf +{ +namespace kokkos_mechanics +{ + +// Syntax for C++ constructor +ViewStreamAdd::ViewStreamAdd(const RunParams& params) + : KernelBase(rajaperf::KokkosMechanics_ViewStreamAdd, params) +{ + setDefaultSize(100000); + setDefaultReps(5000); + + setVariantDefined( Kokkos_Lambda_Seq); + setVariantDefined( Kokkos_Lambda_OpenMP); + setVariantDefined( Kokkos_Lambda_OpenMPTarget); + setVariantDefined( Kokkos_Lambda_CUDA); +} +//Defining the destructor (for the struct) +ViewStreamAdd::~ViewStreamAdd() +{ +} + +void ViewStreamAdd::setUp(VariantID vid) +{ + h_a = VT("host_a",getRunSize()); + h_b = VT("host_b",getRunSize()); + h_c = VT("host_c",getRunSize()); + Kokkos::deep_copy(h_a,1.0f); + Kokkos::deep_copy(h_b,2.0f); +} + +void ViewStreamAdd::updateChecksum(VariantID vid) +{ +// checksum[vid] += calcChecksum(m_y, getRunSize()); +} + +void ViewStreamAdd::tearDown(VariantID vid) +{ + (void) vid; +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/kokkos-mechanics/ViewStreamAdd.hpp b/src/kokkos-mechanics/ViewStreamAdd.hpp new file mode 100644 index 000000000..62861a8f4 --- /dev/null +++ b/src/kokkos-mechanics/ViewStreamAdd.hpp @@ -0,0 +1,75 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// ViewStreamAdd kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// y[i] += a * x[i] ; +/// } +/// + +#ifndef RAJAPerf_Basic_ViewStreamAdd_HPP +#define RAJAPerf_Basic_ViewStreamAdd_HPP + +#define ViewStreamAdd_DATA_SETUP \ + Real_ptr x = m_x; \ + Real_ptr y = m_y; \ + Real_type a = m_a; + +#define ViewStreamAdd_FUNCTOR_CONSTRUCT \ + x(m_x),\ + y(m_y), \ + a(m_a) + +#define ViewStreamAdd_BODY \ + y[i] += a * x[i] ; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace kokkos_mechanics +{ + +class ViewStreamAdd : public KernelBase +{ +public: + + ViewStreamAdd(const RunParams& params); + + ~ViewStreamAdd(); + + void setUp(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runSeqVariant(VariantID vid); + void runOpenMPVariant(VariantID vid); + void runCudaVariant(VariantID vid); + void runHipVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void runKokkosOpenMPTargetVariant(VariantID vid); +private: + using VT=Kokkos::View; + VT h_a; + VT h_b; + VT h_c; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard From 3f8acf90e25aba692dbfda62606aa4d7ca065236 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 14 Dec 2020 10:34:40 -0800 Subject: [PATCH 024/124] Adding basic kernels for Kokkos performance testing --- src/basic-kokkos/IF_QUAD-KokkosCuda.cpp | 2 +- src/basic-kokkos/INIT3-KokkosCuda.cpp | 23 +++++-- src/basic-kokkos/INIT3-KokkosOMP.cpp | 80 ++++++++++++---------- src/basic-kokkos/INIT3-KokkosSeq.cpp | 42 +++++++----- src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp | 13 +++- 
src/basic/DAXPY.cpp | 7 ++ src/basic/INIT3.cpp | 3 + src/basic/INIT3.hpp | 10 +-- src/basic/INIT_VIEW1D.cpp | 5 ++ 9 files changed, 121 insertions(+), 64 deletions(-) diff --git a/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp b/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp index 88e792ef0..aed4cfed9 100644 --- a/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp +++ b/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp @@ -89,7 +89,7 @@ void IF_QUAD::runKokkosCudaVariant(VariantID vid) // }); - Kokkos::parallel_for("Quad Cuda", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("Kokks IF_QUAD Cuda", Kokkos::RangePolicy(ibegin, iend), // Here, the function executes on the device / GPU [=] __device__ (Index_type i) {IF_QUAD_BODY}); //KOKKOS_LAMBDA (Index_type i) {IF_QUAD_BODY}); diff --git a/src/basic-kokkos/INIT3-KokkosCuda.cpp b/src/basic-kokkos/INIT3-KokkosCuda.cpp index 56c11f465..567aa5496 100644 --- a/src/basic-kokkos/INIT3-KokkosCuda.cpp +++ b/src/basic-kokkos/INIT3-KokkosCuda.cpp @@ -63,8 +63,12 @@ void INIT3::runKokkosCudaVariant(VariantID vid) INIT3_DATA_SETUP; +#if defined(RUN_KOKKOS) + if ( vid == Base_CUDA ) { +#if defined(RUN_CUDA) + INIT3_DATA_SETUP_CUDA; startTimer(); @@ -79,26 +83,35 @@ void INIT3::runKokkosCudaVariant(VariantID vid) INIT3_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { +// AJP modified lines below + } else if ( vid == Kokkos_Lambda_CUDA ) { INIT3_DATA_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - INIT3_BODY; - }); +// RAJA::forall< RAJA::cuda_exec >( +// RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { +// INIT3_BODY; +// }); + + Kokkos::parallel_for("Kokkos INIT3 Cuda", Kokkos::RangePolicy(ibegin, iend), + //Here, the function executes on the device / GPU + [=] __device__ (Index_type i) {INIT3_BODY}); } stopTimer(); INIT3_DATA_TEARDOWN_CUDA; +#endif // RUN_CUDA + } else { std::cout << "\n INIT3 : 
Unknown Cuda variant id = " << vid << std::endl; } + +#endif //RUN_KOKKOS } } // end namespace basic diff --git a/src/basic-kokkos/INIT3-KokkosOMP.cpp b/src/basic-kokkos/INIT3-KokkosOMP.cpp index be5316252..2392b4ffe 100644 --- a/src/basic-kokkos/INIT3-KokkosOMP.cpp +++ b/src/basic-kokkos/INIT3-KokkosOMP.cpp @@ -14,13 +14,15 @@ namespace rajaperf { +// Refers to both Kokkos and Raja namespaces; we are defining methods on a class in .. +// DAVID - help completing this comment! +// namespace basic { - void INIT3::runKokkosOpenMPVariant(VariantID vid) { -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) +//#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -32,61 +34,69 @@ void INIT3::runKokkosOpenMPVariant(VariantID vid) INIT3_BODY; }; - switch ( vid ) { +#if defined(RUN_KOKKOS) - case Base_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp parallel for - for (Index_type i = ibegin; i < iend; ++i ) { - INIT3_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_OpenMP : { + switch ( vid ) { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { +#if defined(RUN_OPENMP) - #pragma omp parallel for - for (Index_type i = ibegin; i < iend; ++i ) { - init3_lam(i); - } +// case Base_OpenMP : { +// +// startTimer(); +// for (RepIndex_type irep = 0; irep < run_reps; ++irep) { +// +// #pragma omp parallel for +// for (Index_type i = ibegin; i < iend; ++i ) { +// INIT3_BODY; +// } +// +// } +// stopTimer(); +// +// break; +// } +// +// case Lambda_OpenMP : { +// +// startTimer(); +// for (RepIndex_type irep = 0; irep < run_reps; ++irep) { +// +// #pragma omp parallel for +// for (Index_type i = ibegin; i < iend; ++i ) { +// init3_lam(i); +// } +// +// } +// stopTimer(); +// +// break; +// } - } - stopTimer(); - break; - } - case RAJA_OpenMP : { + case Kokkos_Lambda_OpenMP: { startTimer(); for (RepIndex_type irep = 0; 
irep < run_reps; ++irep) { - +/* RAJA::forall( RAJA::RangeSegment(ibegin, iend), init3_lam); - +*/ + Kokkos::parallel_for("Init3_OMP", Kokkos::RangePolicy(ibegin, iend), + [=] (Index_type i) {INIT3_BODY}); } stopTimer(); break; } +#endif // RUN_OPENMP default : { std::cout << "\n INIT3 : Unknown variant id = " << vid << std::endl; } - } -#endif +#endif // RUN_KOKKOS } } // end namespace basic diff --git a/src/basic-kokkos/INIT3-KokkosSeq.cpp b/src/basic-kokkos/INIT3-KokkosSeq.cpp index 663bb0ae6..069fbf8ee 100644 --- a/src/basic-kokkos/INIT3-KokkosSeq.cpp +++ b/src/basic-kokkos/INIT3-KokkosSeq.cpp @@ -30,47 +30,55 @@ void INIT3::runKokkosSeqVariant(VariantID vid) INIT3_BODY; }; +#if defined(RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { - + startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for(RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type i = ibegin; i < iend; ++i ) { - INIT3_BODY; + for(Index_type i = ibegin; i < iend; ++i) { + INIT3_BODY; } } stopTimer(); break; - } +} #if defined(RUN_RAJA_SEQ) - case Lambda_Seq : { + case Lambda_Seq : { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - init3_lam(i); + + for (Index_type i = ibegin; i < iend; ++i) { + init3_lam(i); } - } - stopTimer(); - break; } + stopTimer(); + + break; +} - case RAJA_Seq : { +// Nota bene -- Conversion of Raja code begins here + case Kokkos_Lambda_Seq : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), init3_lam); - +// RAJA::forall( +// RAJA::RangeSegment(ibegin, iend), init3_lam); + + // Kokkos translation + Kokkos::parallel_for("Init3_Seq", Kokkos::RangePolicy(ibegin, iend), + [=] (Index_type i) {INIT3_BODY}); } stopTimer(); @@ -84,6 +92,8 @@ void INIT3::runKokkosSeqVariant(VariantID vid) } +#endif // RUN_KOKKOS + } } // end namespace basic diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp 
b/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp index fe54d7a18..c9a29d521 100644 --- a/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp +++ b/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp @@ -26,6 +26,8 @@ void INIT_VIEW1D::runKokkosSeqVariant(VariantID vid) INIT_VIEW1D_DATA_SETUP; +#if defined(RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -63,7 +65,8 @@ void INIT_VIEW1D::runKokkosSeqVariant(VariantID vid) break; } - case RAJA_Seq : { + // AJP began modificaiton here + case Kokkos_Lambda_Seq : { INIT_VIEW1D_VIEW_RAJA; @@ -74,8 +77,11 @@ void INIT_VIEW1D::runKokkosSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), initview1d_lam); +// RAJA::forall( +// RAJA::RangeSegment(ibegin, iend), initview1d_lam); + //Kokkos translation + Kokkos::parallel_for("InitView1D_Seq", Kokkos::RangePolicy(ibegin,iend), + [=] (Index_type i) {INIT_VIEW1D_BODY_RAJA}); } stopTimer(); @@ -90,6 +96,7 @@ void INIT_VIEW1D::runKokkosSeqVariant(VariantID vid) } +#endif // RUN_KOKKOS } } // end namespace basic diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 3edd8da78..52988d2d9 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -40,6 +40,13 @@ DAXPY::DAXPY(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda_Seq ); + setVariantDefined( Kokkos_Lambda_OpenMP ); + setVariantDefined( Kokkos_Lambda_CUDA ); + + + } DAXPY::~DAXPY() diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 1adbe906c..20d8ff0f2 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -27,16 +27,19 @@ INIT3::INIT3(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); + setVariantDefined( Kokkos_Lambda_Seq ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); setVariantDefined( RAJA_OpenMP ); + setVariantDefined( Kokkos_Lambda_OpenMP ); 
setVariantDefined( Base_OpenMPTarget ); setVariantDefined( RAJA_OpenMPTarget ); setVariantDefined( Base_CUDA ); setVariantDefined( RAJA_CUDA ); + setVariantDefined( Kokkos_Lambda_CUDA); setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index d1380c20b..b518db649 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -55,10 +55,12 @@ class INIT3 : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosSeqVariant(VariantID vid); - void runKokkosOpenMPVariant(VariantID vid); - void runKokkosCudaVariant(VariantID vid); - void runKokkosOpenMPTargetVariant(VariantID vid); + + void runKokkosSeqVariant(VariantID vid); + void runKokkosOpenMPVariant(VariantID vid); + void runKokkosCudaVariant(VariantID vid); + void runKokkosOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index 8f8fed084..42b2588ac 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -40,6 +40,11 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda_Seq ); + setVariantDefined( Kokkos_Lambda_OpenMP ); + setVariantDefined( Kokkos_Lambda_CUDA ); + } INIT_VIEW1D::~INIT_VIEW1D() From 8bac825d532e92d0a7e75f290f4604f48fafab64 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 21 Dec 2020 13:28:14 -0800 Subject: [PATCH 025/124] csv_xml.py: parser for RAJAPerf-timing.csv to xml --- scripts/csv_xml.py | 175 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 scripts/csv_xml.py diff --git a/scripts/csv_xml.py b/scripts/csv_xml.py new file mode 100644 index 000000000..0a219e4cc --- /dev/null +++ b/scripts/csv_xml.py @@ -0,0 +1,175 @@ +import csv +from datetime import datetime +import os 
+import xml.etree.ElementTree as ET +import xml + +# https://stackabuse.com/reading-and-writing-xml-files-in-python/ + +# xmlformatter: +# https://www.freeformatter.com/xml-formatter.html#ad-output + +#infile = "/Users/ajpowel/Desktop/kokkos_perf_test/watchr/RAJAPerf-timing.csv" + +infile = "./RAJAPerf-timing.csv" + +def read_infile(infile): + """STUB""" + with open(infile, newline='') as csvfile: + rps_reader = csv.reader(csvfile, delimiter=',') + for row in rps_reader: + # The join() method is a string method and returns a string in + # which + # the elements of sequence have been joined by str separator. + print(', '.join(row)) + + + +def get_date(): + """STUB""" + date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") + return date + + +date = get_date() + +perf_report = ET.Element("performance-report") + +name ="RAJAPerf-timing.csv" + +time_units="seconds" + +perf_report.set("date", date) + +perf_report.set("name", name) + +perf_report.set("time-units", time_units) + +print(ET.tostring(perf_report)) + +perf_root = ET.SubElement(perf_report, 'timing') + +perf_root.set("end-time",date) + +perf_root.set("name", "kokkos_perf_suite") + +print(ET.tostring(perf_report)) + +# b'' + +# metadata TBD + +# create hierarchy + +test_suite_list = [] +with open(infile, newline='') as csvfile: + rps_reader = csv.reader(csvfile, delimiter=',') + for row in rps_reader: + #print(', '.join(row)) + test_suite_list.append(row) + + +suite_names_set = set([x[0][:x[0].find("_")] for x in test_suite_list[2:]]) + +#suite_names_set +#Out[135]: {'Basic', 'KokkosMechanics'} + + +heirarch_dict = dict() +for name in suite_names_set: + heirarch_dict[name] = [] + +# heirarch_dict +# Out[137]: {'KokkosMechanics': [], 'Basic': []} + +for item in test_suite_list[2:]: + key = item[0][:item[0].find("_")] + heirarch_dict[key].append(item) + #print(item) + +#NEXT STEPS: For the main test categories, Basic and KokkosMechanics, sum +# the test times over all of the kernels for each of their variants + 
+col_meanings_dict = dict() + +for index, item in enumerate(test_suite_list[1]): + #print(index, item) + col_meanings_dict[index] = item + +#col_meanings_dict +# Out[152]: +# {0: 'Kernel ', +# 1: ' Base_Seq ', +# 2: ' Lambda_Seq ', +# 3: ' RAJA_Seq ', +# 4: ' Base_CUDA ', +# 5: ' RAJA_CUDA ', +# 6: ' Kokkos_Lambda_Seq ', +# 7: ' Kokkos_Functor_Seq ', +# 8: ' Kokkos_Lambda_CUDA ', +# 9: ' Kokkos_Functor_CUDA'} + + +def associate_timings_with_xml(xml_element, timing_dict, suite_or_test_name): + """STUB -- xml_element will be an element of perf_report; + timing_dict = a map of variant names to test run times + """ + for key, value in timing_dict.items(): + xml_element.set(key, str(value)) + xml_element.set("name", suite_or_test_name.strip()) + xml_element.set("count", str(1)) + + + +def create_RPS_xml_report(suite_name, suite_data_list): + """STUB - suite_name is a string = Basic, KokkosMechanics, etc.; + suite_data_list will be the values for a key, Basic or KokkosMechanics + """ + aggregate_results_dict = dict() + #print(suite_data_list) + for list_item in suite_data_list: + for index, timing in enumerate(list_item[1:]): + if "Not run" in timing: + continue + variant_name = col_meanings_dict[index + 1] + if variant_name not in aggregate_results_dict: + aggregate_results_dict[variant_name] = 0.0 + # sums values of all the basic kernels + aggregate_results_dict[variant_name] += float(timing) + print(aggregate_results_dict) + + suite_root = ET.SubElement(perf_root, "timing") + associate_timings_with_xml(suite_root, aggregate_results_dict, suite_name) + for list_item in suite_data_list: + test_timings_dict = dict() + for index, timing in enumerate(list_item[1:]): + if "Not run" in timing: + continue + variant_name = col_meanings_dict[index + 1] + test_timings_dict[variant_name] = float(timing) + xml_element_for_a_kernel_test = ET.SubElement(suite_root, "timing") + associate_timings_with_xml(xml_element_for_a_kernel_test, +test_timings_dict, list_item[0]) + + + +def 
run(): + """STUB""" + + read_infile(infile) + + #create_RPS_xml_report("Basic", heirarch_dict["Basic"]) + + for key in heirarch_dict.keys(): + create_RPS_xml_report(key, heirarch_dict[key]) + + print(heirarch_dict["KokkosMechanics"]) + + print(ET.tostring(perf_report)) + + +if __name__ == "__main__": + run() + From c1193dcc85925896f9a26f41e5e6e6aee38ebb51 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 21 Dec 2020 14:03:54 -0800 Subject: [PATCH 026/124] csv_xml.py: raw xml dump of csv results --- scripts/csv_xml.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) mode change 100644 => 100755 scripts/csv_xml.py diff --git a/scripts/csv_xml.py b/scripts/csv_xml.py old mode 100644 new mode 100755 index 0a219e4cc..6a2f80008 --- a/scripts/csv_xml.py +++ b/scripts/csv_xml.py @@ -1,3 +1,5 @@ +#!/bin/env python + import csv from datetime import datetime import os @@ -9,7 +11,6 @@ # xmlformatter: # https://www.freeformatter.com/xml-formatter.html#ad-output -#infile = "/Users/ajpowel/Desktop/kokkos_perf_test/watchr/RAJAPerf-timing.csv" infile = "./RAJAPerf-timing.csv" @@ -17,11 +18,6 @@ def read_infile(infile): """STUB""" with open(infile, newline='') as csvfile: rps_reader = csv.reader(csvfile, delimiter=',') - for row in rps_reader: - # The join() method is a string method and returns a string in - # which - # the elements of sequence have been joined by str separator. 
- print(', '.join(row)) @@ -45,7 +41,7 @@ def get_date(): perf_report.set("time-units", time_units) -print(ET.tostring(perf_report)) +#print(ET.tostring(perf_report)) perf_root = ET.SubElement(perf_report, 'timing') @@ -53,7 +49,7 @@ def get_date(): perf_root.set("name", "kokkos_perf_suite") -print(ET.tostring(perf_report)) +#print(ET.tostring(perf_report)) # b' Date: Mon, 21 Dec 2020 15:49:30 -0800 Subject: [PATCH 027/124] csv_xml.py: updates to raw xml output --- scripts/csv_xml.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/csv_xml.py b/scripts/csv_xml.py index 6a2f80008..829fc8ccf 100755 --- a/scripts/csv_xml.py +++ b/scripts/csv_xml.py @@ -169,6 +169,8 @@ def run(): ET.dump(perf_report) + + if __name__ == "__main__": run() From 5ed43c0fc2aa2abf4909fa66d19f96f931bcecfc Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 21 Dec 2020 15:52:45 -0800 Subject: [PATCH 028/124] KokkosSeq: Kokkos-i-fied basic kernels --- src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp | 72 ++++++++++++------- .../INIT_VIEW1D_OFFSET-KokkosSeq.cpp | 16 ++++- src/basic/INIT_VIEW1D_OFFSET.cpp | 5 ++ 3 files changed, 63 insertions(+), 30 deletions(-) diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp index aa10e4c9a..7b353c7c6 100644 --- a/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp +++ b/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp @@ -16,69 +16,87 @@ namespace rajaperf { namespace basic { -struct AtomicPIFunctor { - Real_type dx; - Real_ptr pi; - AtomicPIFunctor(Real_type m_dx, Real_ptr m_pi) : ATOMIC_PI_FUNCTOR_CONSTRUCT {} -}; - -void ATOMIC_PI::runKokkosSeqVariant(VariantID vid) +void ATOMIC_PI::runSeqVariant(VariantID vid) { - const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getRunSize(); + ATOMIC_PI_DATA_SETUP; -#if defined(RUN_KOKKOS) && defined(RUN_OPENMP) +#if defined(RUN_KOKKOS) + switch ( vid ) { - case Kokkos_Functor_OpenMP : { + case Base_Seq : { startTimer(); - //for (RepIndex_type 
irep = 0; irep < run_reps; ++irep) { + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - // *pi = m_pi_init; - // RAJA::forall( - // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - // double x = (double(i) + 0.5) * dx; - // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - // }); - // *pi *= 4.0; + *pi = m_pi_init; + for (Index_type i = ibegin; i < iend; ++i ) { + double x = (double(i) + 0.5) * dx; + *pi += dx / (1.0 + x * x); + } + *pi *= 4.0; - //} + } stopTimer(); break; } - case Kokkos_Lambda_OpenMP : { + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto atomicpi_base_lam = [=](Index_type i) { + double x = (double(i) + 0.5) * dx; + *pi += dx / (1.0 + x * x); + }; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { *pi = m_pi_init; - - Kokkos::parallel_for("name",Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(Index_type i){ - double x = ((double(i) + 0.5) * dx); - Kokkos::atomic_add(pi, dx / (1.0 + x * x)); - }); + for (Index_type i = ibegin; i < iend; ++i ) { + atomicpi_base_lam(i); + } *pi *= 4.0; + } stopTimer(); break; } + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + *pi = m_pi_init; + RAJA::forall( RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + double x = (double(i) + 0.5) * dx; + RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + }); + *pi *= 4.0; + + } + stopTimer(); + + break; + } +#endif //RUN_RAJA_SEQ default : { std::cout << "\n ATOMIC_PI : Unknown variant id = " << vid << std::endl; } } - -#endif +#endif //RUN_KOKKOS } } // end namespace basic diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp index 834d22bad..4bcfb3a25 100644 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp @@ -18,6 +18,7 @@ namespace basic { + void INIT_VIEW1D_OFFSET::runKokkosSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -26,6 +27,10 @@ 
void INIT_VIEW1D_OFFSET::runKokkosSeqVariant(VariantID vid) INIT_VIEW1D_OFFSET_DATA_SETUP; + +#if defined(RUN_KOKKOS) + + switch ( vid ) { case Base_Seq : { @@ -63,7 +68,9 @@ void INIT_VIEW1D_OFFSET::runKokkosSeqVariant(VariantID vid) break; } - case RAJA_Seq : { + // Conversion of Raja code to Kokkos starts here + // + case Kokkos_Lambda_Seq : { INIT_VIEW1D_OFFSET_VIEW_RAJA; @@ -74,8 +81,10 @@ void INIT_VIEW1D_OFFSET::runKokkosSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), initview1doffset_lam); +// RAJA::forall( +// RAJA::RangeSegment(ibegin, iend), initview1doffset_lam); + Kokkos::parallel_for("INIT_VIEW1D_OFFSET_SEQ Kokkos", Kokkos::RangePolicy(ibegin, iend), [=] (Index_type i) {INIT_VIEW1D_OFFSET_BODY_RAJA}); + } stopTimer(); @@ -90,6 +99,7 @@ void INIT_VIEW1D_OFFSET::runKokkosSeqVariant(VariantID vid) } +#endif // RUN_KOKKOS } } // end namespace basic diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 45cdda46b..0f810547b 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -40,6 +40,11 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda_Seq ); + setVariantDefined( Kokkos_Lambda_OpenMP ); + setVariantDefined( Kokkos_Lambda_CUDA ); + } INIT_VIEW1D_OFFSET::~INIT_VIEW1D_OFFSET() From ca36ef0c9c63025a648187bac46d1f076114f9bc Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 21 Dec 2020 16:57:33 -0800 Subject: [PATCH 029/124] KokkosSeq: Kokkos-i-fied basic kernels --- src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp | 21 ++++++++++++------- src/basic-kokkos/DAXPY-KokkosSeq.cpp | 9 ++++---- src/basic-kokkos/IF_QUAD-KokkosSeq.cpp | 10 ++------- src/basic-kokkos/INIT3-KokkosSeq.cpp | 2 +- src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp | 2 +- .../INIT_VIEW1D_OFFSET-KokkosSeq.cpp | 2 +- 6 files 
changed, 24 insertions(+), 22 deletions(-) diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp index 7b353c7c6..dce51b6f4 100644 --- a/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp +++ b/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp @@ -18,7 +18,7 @@ namespace basic { -void ATOMIC_PI::runSeqVariant(VariantID vid) +void ATOMIC_PI::runKokkosSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -71,17 +71,24 @@ void ATOMIC_PI::runSeqVariant(VariantID vid) break; } - case RAJA_Seq : { + case Kokkos_Lambda_Seq : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { *pi = m_pi_init; - RAJA::forall( RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - }); +// RAJA::forall( RAJA::RangeSegment(ibegin, iend), +// [=](Index_type i) { +// double x = (double(i) + 0.5) * dx; +// RAJA::atomicAdd(pi, dx / (1.0 + x * x)); +// }); + + Kokkos::parallel_for("ATOMIC_PI-KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + [=] (Index_type i) { + double x = (double(i) + 0.5) * dx; + Kokkos::atomic_add(pi, dx / (1.0 + x * x)); + }); + *pi *= 4.0; } diff --git a/src/basic-kokkos/DAXPY-KokkosSeq.cpp b/src/basic-kokkos/DAXPY-KokkosSeq.cpp index 9216e7950..659258eb0 100644 --- a/src/basic-kokkos/DAXPY-KokkosSeq.cpp +++ b/src/basic-kokkos/DAXPY-KokkosSeq.cpp @@ -37,14 +37,15 @@ void DAXPY::runKokkosSeqVariant(VariantID vid) DAXPY_BODY; }; +#if defined(RUN_KOKKOS) + switch ( vid ) { -#if defined(RUN_KOKKOS) #if defined(RUN_RAJA_SEQ) case Kokkos_Lambda_Seq: { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("perfsuite.kokkos.seq.lambda", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("DAXPY-KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), [=](Index_type i) { DAXPY_BODY; }); } stopTimer(); @@ -55,14 +56,13 @@ void 
DAXPY::runKokkosSeqVariant(VariantID vid) DaxpyFunctor daxpy_functor_instance(y,x,a); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("perfsuite.kokkos.seq.lambda", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("DAXPY-KokkosSeq Kokkos_Functor_Seq", Kokkos::RangePolicy(ibegin, iend), daxpy_functor_instance); } stopTimer(); break; } -#endif // RUN_KOKKOS #endif // RUN_RAJA_SEQ default : { std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; @@ -70,6 +70,7 @@ void DAXPY::runKokkosSeqVariant(VariantID vid) } +#endif // RUN_KOKKOS } } // end namespace basic diff --git a/src/basic-kokkos/IF_QUAD-KokkosSeq.cpp b/src/basic-kokkos/IF_QUAD-KokkosSeq.cpp index 0b21faa73..68adde515 100644 --- a/src/basic-kokkos/IF_QUAD-KokkosSeq.cpp +++ b/src/basic-kokkos/IF_QUAD-KokkosSeq.cpp @@ -19,7 +19,6 @@ namespace basic // Kokkos-ify here -//void IF_QUAD::runSeqVariant(VariantID vid) void IF_QUAD::runKokkosSeqVariant(VariantID vid) { @@ -38,10 +37,6 @@ void IF_QUAD::runKokkosSeqVariant(VariantID vid) switch ( vid ) { - // AJP added (following DAXPY example) -- - -//#if defined(RUN_KOKKOS) -//#if defined(RUN_OPENMP) #if defined(RUN_RAJA_SEQ) @@ -55,7 +50,8 @@ void IF_QUAD::runKokkosSeqVariant(VariantID vid) RAJA::RangeSegment(ibegin, iend), ifquad_lam); */ // Translation - Kokkos::parallel_for("Quad", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("IF_QUAD_KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + [=] (Index_type i) {IF_QUAD_BODY}); } @@ -74,8 +70,6 @@ void IF_QUAD::runKokkosSeqVariant(VariantID vid) #endif // RUN_KOKKOS - - } } // end namespace basic diff --git a/src/basic-kokkos/INIT3-KokkosSeq.cpp b/src/basic-kokkos/INIT3-KokkosSeq.cpp index 069fbf8ee..e3fee117b 100644 --- a/src/basic-kokkos/INIT3-KokkosSeq.cpp +++ b/src/basic-kokkos/INIT3-KokkosSeq.cpp @@ -77,7 +77,7 @@ void INIT3::runKokkosSeqVariant(VariantID vid) // RAJA::RangeSegment(ibegin, iend), init3_lam); // Kokkos 
translation - Kokkos::parallel_for("Init3_Seq", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("INIT3-KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), [=] (Index_type i) {INIT3_BODY}); } stopTimer(); diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp index c9a29d521..38284ec76 100644 --- a/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp +++ b/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp @@ -80,7 +80,7 @@ void INIT_VIEW1D::runKokkosSeqVariant(VariantID vid) // RAJA::forall( // RAJA::RangeSegment(ibegin, iend), initview1d_lam); //Kokkos translation - Kokkos::parallel_for("InitView1D_Seq", Kokkos::RangePolicy(ibegin,iend), + Kokkos::parallel_for("INIT_VIEW1D_KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin,iend), [=] (Index_type i) {INIT_VIEW1D_BODY_RAJA}); } diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp index 4bcfb3a25..844697a7c 100644 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp @@ -83,7 +83,7 @@ void INIT_VIEW1D_OFFSET::runKokkosSeqVariant(VariantID vid) // RAJA::forall( // RAJA::RangeSegment(ibegin, iend), initview1doffset_lam); - Kokkos::parallel_for("INIT_VIEW1D_OFFSET_SEQ Kokkos", Kokkos::RangePolicy(ibegin, iend), [=] (Index_type i) {INIT_VIEW1D_OFFSET_BODY_RAJA}); + Kokkos::parallel_for("INIT_VIEW1D_OFFSET_KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), [=] (Index_type i) {INIT_VIEW1D_OFFSET_BODY_RAJA}); } From b1711090e985f5bad94bb27f74c8623233626a01 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 4 Jan 2021 14:35:12 -0800 Subject: [PATCH 030/124] csv_xml.py: rm "count", rm print statements --- scripts/csv_xml.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/csv_xml.py b/scripts/csv_xml.py index 829fc8ccf..5fd60bdbd 100755 --- a/scripts/csv_xml.py +++ b/scripts/csv_xml.py @@ -41,8 +41,6 @@ def get_date(): 
perf_report.set("time-units", time_units) -#print(ET.tostring(perf_report)) - perf_root = ET.SubElement(perf_report, 'timing') perf_root.set("end-time",date) @@ -114,7 +112,6 @@ def associate_timings_with_xml(xml_element, timing_dict, suite_or_test_name): for key, value in timing_dict.items(): xml_element.set(key, str(value)) xml_element.set("name", suite_or_test_name.strip()) - xml_element.set("count", str(1)) From 3381d88cca72c71ce168d7d1d62974f692e2d77c Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 5 Jan 2021 11:52:15 -0800 Subject: [PATCH 031/124] Add profiling hooks to enable SPOT --- src/common/Executor.cpp | 13 ++++++++++++- src/common/KernelBase.cpp | 6 ++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 3ce07de11..f72d6eeea 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -276,7 +276,18 @@ void Executor::setupSuite() } // kernel and variant input both look good #if defined(RUN_KOKKOS) - Kokkos::initialize(); + Kokkos::initialize(); + /** + * DZP: This is a terrible hack to just get the push/pop region + * callbacks without the begin_parallel_x/end_parallel_x ones, + * so we don't overfence and perturb performance + */ + auto events = Kokkos::Tools::Experimental::get_callbacks(); + auto push = events.push_region; + auto pop = events.pop_region; + Kokkos::Tools::Experimental::pause_tools(); + Kokkos::Tools::Experimental::set_push_region_callback(push); + Kokkos::Tools::Experimental::set_pop_region_callback(pop); #endif } // if kernel input looks good diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 3d0cd991e..45127a4f8 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -66,8 +66,14 @@ void KernelBase::execute(VariantID vid) resetDataInitCount(); this->setUp(vid); +#ifdef RUN_KOKKOS + Kokkos::Tools::pushRegion(this->getName() + ":"+getVariantName(vid)); +#endif this->runKernel(vid); +#ifdef RUN_KOKKOS + 
Kokkos::Tools::popRegion(); +#endif this->updateChecksum(vid); From 71901b6ae29c700a08982b3c93b1cad3dd6ea386 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 5 Jan 2021 14:42:21 -0800 Subject: [PATCH 032/124] Kokkos Sequential Basic Kernel Implementation --- src/basic-kokkos/MULADDSUB-KokkosSeq.cpp | 16 ++++++++--- src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp | 32 +++++++++++++++------- src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp | 31 ++++++++++++++++++--- src/basic-kokkos/TRAP_INT-KokkosSeq.cpp | 31 +++++++++++++++------ src/basic/MULADDSUB.cpp | 6 ++++ src/basic/NESTED_INIT.cpp | 4 +++ src/basic/REDUCE3_INT.cpp | 5 ++++ src/basic/TRAP_INT.cpp | 5 ++++ 8 files changed, 104 insertions(+), 26 deletions(-) diff --git a/src/basic-kokkos/MULADDSUB-KokkosSeq.cpp b/src/basic-kokkos/MULADDSUB-KokkosSeq.cpp index 868e537ae..60f5231ac 100644 --- a/src/basic-kokkos/MULADDSUB-KokkosSeq.cpp +++ b/src/basic-kokkos/MULADDSUB-KokkosSeq.cpp @@ -30,6 +30,9 @@ void MULADDSUB::runKokkosSeqVariant(VariantID vid) MULADDSUB_BODY; }; + +#if defined(RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -63,13 +66,18 @@ void MULADDSUB::runKokkosSeqVariant(VariantID vid) break; } - case RAJA_Seq : { + case Kokkos_Lambda_Seq : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), mas_lam); +// RAJA::forall( +// RAJA::RangeSegment(ibegin, iend), mas_lam); +// +// Kokkos translation +// If SIMD really matters , consider using Kokkos SIMD + Kokkos::parallel_for("MULTISUB-KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + [=] (Index_type i) {MULADDSUB_BODY}); } stopTimer(); @@ -83,7 +91,7 @@ void MULADDSUB::runKokkosSeqVariant(VariantID vid) } } - +#endif // RUN_KOKKOS } } // end namespace basic diff --git a/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp b/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp index 92308a0d8..166ed0ef5 100644 --- a/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp +++ 
b/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp @@ -28,6 +28,8 @@ void NESTED_INIT::runKokkosSeqVariant(VariantID vid) NESTED_INIT_BODY; }; +#if defined RUN_KOKKOS + switch ( vid ) { case Base_Seq : { @@ -69,9 +71,9 @@ void NESTED_INIT::runKokkosSeqVariant(VariantID vid) break; } - case RAJA_Seq : { + case Kokkos_Lambda_Seq : { - using EXEC_POL = +/* using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::loop_exec, // k RAJA::statement::For<1, RAJA::loop_exec, // j @@ -79,17 +81,27 @@ void NESTED_INIT::runKokkosSeqVariant(VariantID vid) RAJA::statement::Lambda<0> > > - > - >; + + > + > +; +*/ startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - nestedinit_lam - ); + + // There are tuning knobs in MDRange to optimize performance + // + Kokkos::parallel_for("NESTED_INIT KokkosSeq", Kokkos::MDRangePolicy, Kokkos::Serial>({0,0,0}, {ni, nj, nk}), + nestedinit_lam +); + +// RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), +// RAJA::RangeSegment(0, nj), +// RAJA::RangeSegment(0, nk)), +// nestedinit_lam +// ); } stopTimer(); @@ -103,7 +115,7 @@ void NESTED_INIT::runKokkosSeqVariant(VariantID vid) } } - +#endif //RUN_KOKKOS } } // end namespace basic diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp index 5635a4d49..0d7bd2c58 100644 --- a/src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp +++ b/src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp @@ -27,6 +27,8 @@ void REDUCE3_INT::runKokkosSeqVariant(VariantID vid) REDUCE3_INT_DATA_SETUP; +#if defined(RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -82,11 +84,11 @@ void REDUCE3_INT::runKokkosSeqVariant(VariantID vid) break; } - case RAJA_Seq : { + case Kokkos_Lambda_Seq : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - +/* RAJA::ReduceSum vsum(m_vsum_init); RAJA::ReduceMin vmin(m_vmin_init); RAJA::ReduceMax 
vmax(m_vmax_init); @@ -99,7 +101,28 @@ void REDUCE3_INT::runKokkosSeqVariant(VariantID vid) m_vsum += static_cast(vsum.get()); m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); - +*/ + // These values are initilized elsewhere by RPS + Int_type max_value = m_vmax_init; + Int_type min_value = m_vmin_init; + Int_type sum = m_vsum_init; + + + // KOKKOS_LAMBDA IS A PRE-PROCESSOR DIRECTIVE; + // It makes the capture clause on the lambda work for Host and Device + parallel_reduce("REDUCE3-KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + + [=](const int64_t i, Int_type& tl_max, Int_type& tl_min, Int_type& tl_sum){ + Int_type vec_i = vec[i]; + if (vec_i > tl_max) tl_max = vec_i; + if (vec_i < tl_min) tl_min = vec_i; + tl_sum += vec_i; + }, Kokkos::Max(max_value), Kokkos::Min(min_value), sum); + + m_vsum += static_cast(sum); + m_vmin = RAJA_MIN(m_vmin, static_cast(min_value)); + m_vmax = RAJA_MAX(m_vmax, static_cast(max_value)); + } stopTimer(); @@ -112,7 +135,7 @@ void REDUCE3_INT::runKokkosSeqVariant(VariantID vid) } } - +#endif // RUN_KOKKOS } } // end namespace basic diff --git a/src/basic-kokkos/TRAP_INT-KokkosSeq.cpp b/src/basic-kokkos/TRAP_INT-KokkosSeq.cpp index f4859927c..3cfde60ce 100644 --- a/src/basic-kokkos/TRAP_INT-KokkosSeq.cpp +++ b/src/basic-kokkos/TRAP_INT-KokkosSeq.cpp @@ -40,6 +40,8 @@ void TRAP_INT::runKokkosSeqVariant(VariantID vid) TRAP_INT_DATA_SETUP; +#if defined(RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -86,19 +88,32 @@ void TRAP_INT::runKokkosSeqVariant(VariantID vid) break; } - case RAJA_Seq : { + case Kokkos_Lambda_Seq : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sumx(m_sumx_init); +// RAJA::ReduceSum sumx(m_sumx_init); - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); +// RAJA::forall( +// RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { +// TRAP_INT_BODY; +// +// Begin 
Kokkos translation +// A RAJA reduce translates into a Kokkoss::parallel_reduce +// To perform the translation: + // Declare and initialize variables + // To perform a reduction, you need: 1) an initial value; 2) iterate + // over an iterable; 3) to be able to extract the result at the end of + // the reduction (in this case, trap_integral_val) - m_sumx += static_cast(sumx.get()) * h; + Real_type trap_integral_val = m_sumx_init; + + Kokkos::parallel_reduce("TRAP_INT_KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + [=] (const int64_t i, Real_type& sumx) {TRAP_INT_BODY}, trap_integral_val + ); + + m_sumx += static_cast(trap_integral_val) * h; } stopTimer(); @@ -112,7 +127,7 @@ void TRAP_INT::runKokkosSeqVariant(VariantID vid) } } - +#endif //RUN_KOKKOS } } // end namespace basic diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index fd46c3718..f3c2b6191 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -40,6 +40,12 @@ MULADDSUB::MULADDSUB(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda_Seq ); + setVariantDefined( Kokkos_Lambda_OpenMP ); + setVariantDefined( Kokkos_Lambda_CUDA ); + + } MULADDSUB::~MULADDSUB() diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 2072d8999..3a697e257 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -47,6 +47,10 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda_Seq ); + setVariantDefined( Kokkos_Lambda_OpenMP ); + setVariantDefined( Kokkos_Lambda_CUDA ); } NESTED_INIT::~NESTED_INIT() diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 07d180cf6..14e13a6ef 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -45,6 +45,11 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( 
RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda_Seq ); + setVariantDefined( Kokkos_Lambda_OpenMP ); + setVariantDefined( Kokkos_Lambda_CUDA ); + } REDUCE3_INT::~REDUCE3_INT() diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index 66c012fd7..3dde1e237 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -40,6 +40,11 @@ TRAP_INT::TRAP_INT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda_Seq ); + setVariantDefined( Kokkos_Lambda_OpenMP ); + setVariantDefined( Kokkos_Lambda_CUDA ); + } TRAP_INT::~TRAP_INT() From 27d63c9955b8ae1dedc4a6cbe565c26137f2046a Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 7 Jan 2021 12:37:52 -0800 Subject: [PATCH 033/124] KokkosCuda basic kernels --- src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp | 2 +- src/basic-kokkos/DAXPY-KokkosCuda.cpp | 8 ++-- src/basic-kokkos/IF_QUAD-KokkosCuda.cpp | 2 +- src/basic-kokkos/INIT3-KokkosCuda.cpp | 6 +-- src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp | 18 ++++++--- .../INIT_VIEW1D_OFFSET-KokkosCuda.cpp | 16 +++++--- src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp | 40 ++++++++++++++----- src/basic-kokkos/TRAP_INT-KokkosCuda.cpp | 33 +++++++++++---- 8 files changed, 87 insertions(+), 38 deletions(-) diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp index fd73ccead..4849389c2 100644 --- a/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp +++ b/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp @@ -97,7 +97,7 @@ void ATOMIC_PI::runKokkosCudaVariant(VariantID vid) }); */ - Kokkos::parallel_for("Atomic_PI Cuda", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("ATOMIC_PI-KokkosCuda Kokkkos_Lambda", Kokkos::RangePolicy(ibegin, iend), // Here, function executes on the device / GPU, and copies by VALUE // the "[=] __device__" indicates "KOKKOS_LAMBDA"; // KOKKOS_LAMBDA = #define KOKKOS_LAMBDA[=]__device__ diff --git a/src/basic-kokkos/DAXPY-KokkosCuda.cpp 
b/src/basic-kokkos/DAXPY-KokkosCuda.cpp index 414d556f5..bc24d7896 100644 --- a/src/basic-kokkos/DAXPY-KokkosCuda.cpp +++ b/src/basic-kokkos/DAXPY-KokkosCuda.cpp @@ -47,13 +47,14 @@ struct DaxpyCudaFunctor { void DAXPY::runKokkosCudaVariant(VariantID vid) { -#if defined(RUN_KOKKOS) -#if defined(RAJA_ENABLE_CUDA) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getRunSize(); DAXPY_DATA_SETUP; + +#if defined(RUN_KOKKOS) + if ( vid == Kokkos_Functor_CUDA) { DAXPY_DATA_SETUP_CUDA; DaxpyCudaFunctor daxpy_functor_instance(y,x,a); @@ -76,7 +77,7 @@ void DAXPY::runKokkosCudaVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("perfsuite.kokkos.cuda.lambda", + Kokkos::parallel_for("DAXPY-KokkosCuda Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), [=] __device__ (Index_type i) { DAXPY_BODY; }); @@ -89,7 +90,6 @@ void DAXPY::runKokkosCudaVariant(VariantID vid) } else { std::cout << "\n DAXPY : Unknown Cuda variant id = " << vid << std::endl; } -#endif // RAJA_ENABLE_CUDA #endif // RUN_KOKKOS } diff --git a/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp b/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp index aed4cfed9..69af69984 100644 --- a/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp +++ b/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp @@ -89,7 +89,7 @@ void IF_QUAD::runKokkosCudaVariant(VariantID vid) // }); - Kokkos::parallel_for("Kokks IF_QUAD Cuda", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("IF_QUAD-KokkosCuda Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), // Here, the function executes on the device / GPU [=] __device__ (Index_type i) {IF_QUAD_BODY}); //KOKKOS_LAMBDA (Index_type i) {IF_QUAD_BODY}); diff --git a/src/basic-kokkos/INIT3-KokkosCuda.cpp b/src/basic-kokkos/INIT3-KokkosCuda.cpp index 567aa5496..497e84d48 100644 --- a/src/basic-kokkos/INIT3-KokkosCuda.cpp +++ b/src/basic-kokkos/INIT3-KokkosCuda.cpp @@ -67,8 +67,7 @@ void INIT3::runKokkosCudaVariant(VariantID 
vid) if ( vid == Base_CUDA ) { -#if defined(RUN_CUDA) - +//#error WHATS UP INIT3_DATA_SETUP_CUDA; startTimer(); @@ -96,7 +95,7 @@ void INIT3::runKokkosCudaVariant(VariantID vid) // INIT3_BODY; // }); - Kokkos::parallel_for("Kokkos INIT3 Cuda", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("INIT3-KokkosCuda Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), //Here, the function executes on the device / GPU [=] __device__ (Index_type i) {INIT3_BODY}); @@ -105,7 +104,6 @@ void INIT3::runKokkosCudaVariant(VariantID vid) INIT3_DATA_TEARDOWN_CUDA; -#endif // RUN_CUDA } else { std::cout << "\n INIT3 : Unknown Cuda variant id = " << vid << std::endl; diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp index 38a71bfcf..d0c958538 100644 --- a/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp +++ b/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp @@ -53,6 +53,8 @@ void INIT_VIEW1D::runKokkosCudaVariant(VariantID vid) INIT_VIEW1D_DATA_SETUP; +#if defined(RUN_KOKKOS) + if ( vid == Base_CUDA ) { INIT_VIEW1D_DATA_SETUP_CUDA; @@ -70,7 +72,8 @@ void INIT_VIEW1D::runKokkosCudaVariant(VariantID vid) INIT_VIEW1D_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { +// AJP modified lines below + } else if ( vid == Kokkos_Lambda_CUDA ) { INIT_VIEW1D_DATA_SETUP_CUDA; @@ -79,10 +82,14 @@ void INIT_VIEW1D::runKokkosCudaVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - INIT_VIEW1D_BODY_RAJA; - }); +// RAJA::forall< RAJA::cuda_exec >( +// RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { +// INIT_VIEW1D_BODY_RAJA; +// }); + + Kokkos::parallel_for("INIT_VIEW1D-KokkosCuda Kokkos-Lambda", Kokkos::RangePolicy(ibegin, iend), + // Here, the function executes on the device / GPU + [=] __device__ (Index_type i) {INIT_VIEW1D_BODY_RAJA}); } stopTimer(); @@ -92,6 +99,7 @@ void 
INIT_VIEW1D::runKokkosCudaVariant(VariantID vid) } else { std::cout << "\n INIT_VIEW1D : Unknown Cuda variant id = " << vid << std::endl; } +#endif //RUN_KOKKOS } } // end namespace basic diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp index eb903a68b..dec7483bc 100644 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp @@ -54,6 +54,8 @@ void INIT_VIEW1D_OFFSET::runKokkosCudaVariant(VariantID vid) INIT_VIEW1D_OFFSET_DATA_SETUP; +#if defined (RUN_KOKKOS) + if ( vid == Base_CUDA ) { INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA; @@ -71,7 +73,7 @@ void INIT_VIEW1D_OFFSET::runKokkosCudaVariant(VariantID vid) INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { + } else if ( vid == Kokkos_Lambda_CUDA ) { INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA; @@ -80,10 +82,13 @@ void INIT_VIEW1D_OFFSET::runKokkosCudaVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - INIT_VIEW1D_OFFSET_BODY_RAJA; - }); + + Kokkos::parallel_for("INIT_VIEW1D_OFFSET-KokkosCuda Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + [=] __device__ (Index_type i) { + INIT_VIEW1D_OFFSET_BODY_RAJA; + } +); } stopTimer(); @@ -99,3 +104,4 @@ void INIT_VIEW1D_OFFSET::runKokkosCudaVariant(VariantID vid) } // end namespace rajaperf #endif // RAJA_ENABLE_CUDA +#endif //RUN_KOKKOS diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp index c1a32ff6f..0c47f70b8 100644 --- a/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp +++ b/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp @@ -90,6 +90,8 @@ void REDUCE3_INT::runKokkosCudaVariant(VariantID vid) REDUCE3_INT_DATA_SETUP; +#if defined RUN_KOKKOS + if ( vid == Base_CUDA ) { REDUCE3_INT_DATA_SETUP_CUDA; @@ -140,25 +142,42 @@ void 
REDUCE3_INT::runKokkosCudaVariant(VariantID vid) deallocCudaDeviceData(vmin); deallocCudaDeviceData(vmax); - } else if ( vid == RAJA_CUDA ) { + } else if ( vid == Kokkos_Lambda_CUDA ) { REDUCE3_INT_DATA_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { +/* RAJA::ReduceSum vsum(m_vsum_init); RAJA::ReduceMin vmin(m_vmin_init); RAJA::ReduceMax vmax(m_vmax_init); - - RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); +*/ + + Int_type max_value = m_vmax_init; + Int_type min_value = m_vmin_init; + Int_type sum = m_vsum_init; + + // KOKKOS_LAMBDA IS A PRE-PROCESSOR DIRECTIVE + // It makes the capture clause on the lambda work for Host and Device + + parallel_reduce("REDUCE3-KokkosCuda Kokkos_Lambda_Seq", + Kokkos::RangePolicy(ibegin, iend), + [=] __device__ (const int64_t i, Int_type& tl_max, Int_type& tl_min, Int_type& tl_sum) { + Int_type vec_i = vec[i]; + if (vec_i > tl_max) tl_max = vec_i; + if (vec_i < tl_min) tl_min= vec_i; + tl_sum += vec_i; + }, + Kokkos::Max(max_value), + Kokkos::Min(min_value), + sum); + + + m_vsum += static_cast(sum); + m_vmin = RAJA_MIN(m_vmin, static_cast(min_value)); + m_vmax = RAJA_MAX(m_vmax, static_cast(max_value)); } stopTimer(); @@ -168,6 +187,7 @@ void REDUCE3_INT::runKokkosCudaVariant(VariantID vid) } else { std::cout << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; } +#endif //RUN_KOKKOS } } // end namespace basic diff --git a/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp b/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp index e47d4c3c5..9bf4c9c68 100644 --- a/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp +++ b/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp @@ -94,6 +94,8 @@ void TRAP_INT::runKokkosCudaVariant(VariantID vid) TRAP_INT_DATA_SETUP; +#if defined RUN_KOKKOS + if ( vid 
== Base_CUDA ) { TRAP_INT_DATA_SETUP_CUDA; @@ -126,23 +128,37 @@ void TRAP_INT::runKokkosCudaVariant(VariantID vid) TRAP_INT_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { + } else if ( vid == Kokkos_Lambda_CUDA ) { TRAP_INT_DATA_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sumx(m_sumx_init); - RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - TRAP_INT_BODY; - }); + // Begin Kokkos translation + // A RAJA reduce translates into a + // Kokkoss::parallel_reduce + // To perform the translation: + // Declare and initialize variables + // To perform a reduction, you need: + // 1) an initial value; + // 2) iterate over an iterable; + // 3) to be able to extract the result at the end of the reduction (in this case, trap_integral_val) - m_sumx += static_cast(sumx.get()) * h; - } + Real_type trap_integral_val = m_sumx_init; + + parallel_reduce("TRAP_INT_KokkosCuda Kokkos_Lambda_Seq", + Kokkos::RangePolicy(ibegin, iend), + [=] __device__ (const int64_t i, Real_type& sumx) { + TRAP_INT_BODY}, + trap_integral_val + ); + + m_sumx += static_cast(trap_integral_val) * h; + + } stopTimer(); TRAP_INT_DATA_TEARDOWN_CUDA; @@ -150,6 +166,7 @@ void TRAP_INT::runKokkosCudaVariant(VariantID vid) } else { std::cout << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; } +#endif //RUN_KOKKOS } } // end namespace basic From eec425ef42113bb0091ab3a01301673bd201e346 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 7 Jan 2021 13:17:02 -0800 Subject: [PATCH 034/124] Unbroke the build system (hahaoops) --- CMakeLists.txt | 7 ++++--- src/basic-kokkos/TRAP_INT-KokkosCuda.cpp | 7 +++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d259cfb3c..c3aa9d8dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,12 +121,13 @@ if(ENABLE_KOKKOS) # ENABLE_CUDA IS A RAJA PERFSUITE OPTION if(ENABLE_CUDA) - 
set(Kokkos_ENABLE_CUDA ON) - set(Kokkos_ARCH_VOLTA70 ON) #TODO: better + set(Kokkos_ENABLE_CUDA ON CACHE BOOL "Docstring") + set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "Docstring") + set(Kokkos_ARCH_VOLTA70 ON CACHE BOOL "Docstring") #TODO: better enable_language(CUDA) endif() if(ENABLE_OPENMP) - set(Kokkos_ENABLE_OPENMP ON) + set(Kokkos_ENABLE_OPENMP CACHE BOOL ON) endif() add_subdirectory(tpl/kokkos) diff --git a/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp b/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp index 9bf4c9c68..02edfc1c3 100644 --- a/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp +++ b/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp @@ -9,7 +9,7 @@ #include "TRAP_INT.hpp" #include "RAJA/RAJA.hpp" - +#include #if defined(RAJA_ENABLE_CUDA) #include "common/CudaDataUtils.hpp" @@ -24,8 +24,7 @@ namespace basic // // Function used in TRAP_INT loop. // -RAJA_INLINE -RAJA_DEVICE +KOKKOS_INLINE_FUNCTION Real_type trap_int_func(Real_type x, Real_type y, Real_type xp, @@ -151,7 +150,7 @@ void TRAP_INT::runKokkosCudaVariant(VariantID vid) parallel_reduce("TRAP_INT_KokkosCuda Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), - [=] __device__ (const int64_t i, Real_type& sumx) { + KOKKOS_LAMBDA (const int64_t i, Real_type& sumx) { TRAP_INT_BODY}, trap_integral_val ); From c1f72a53ebb0d8d697b9860ad214f2344de5b127 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 11 Jan 2021 13:52:54 -0800 Subject: [PATCH 035/124] rm "newline" parameter from open statement --- scripts/csv_xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/csv_xml.py b/scripts/csv_xml.py index 5fd60bdbd..c68fa56f6 100755 --- a/scripts/csv_xml.py +++ b/scripts/csv_xml.py @@ -16,7 +16,7 @@ def read_infile(infile): """STUB""" - with open(infile, newline='') as csvfile: + with open(infile) as csvfile: rps_reader = csv.reader(csvfile, delimiter=',') From 9835eff8680e58501f201e0a7ddb12358167ccf8 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 11 Jan 2021 13:59:54 -0800 Subject: [PATCH 
036/124] rm the second "newline" paramter from open statement --- scripts/csv_xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/csv_xml.py b/scripts/csv_xml.py index c68fa56f6..5d9bcd3b6 100755 --- a/scripts/csv_xml.py +++ b/scripts/csv_xml.py @@ -58,7 +58,7 @@ def get_date(): # create hierarchy test_suite_list = [] -with open(infile, newline='') as csvfile: +with open(infile) as csvfile: rps_reader = csv.reader(csvfile, delimiter=',') for row in rps_reader: test_suite_list.append(row) From 8bdcaf78043396db028afb0ee834d58898f2abce Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 12 Jan 2021 15:49:25 -0800 Subject: [PATCH 037/124] Update Kokkos --- tpl/kokkos | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/kokkos b/tpl/kokkos index c4f78ff3a..d680eabdb 160000 --- a/tpl/kokkos +++ b/tpl/kokkos @@ -1 +1 @@ -Subproject commit c4f78ff3ad12bf6d74b7f325617c27fde73d2ab8 +Subproject commit d680eabdbccc9b30ce2708b1446507cd860d94e9 From e1b705ac2156177ea04b0f62e69f759f723f8bf2 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Wed, 13 Jan 2021 07:33:30 -0800 Subject: [PATCH 038/124] Added metadata calls, for testing --- src/common/Executor.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index f72d6eeea..6fcd7376e 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -346,6 +346,11 @@ void Executor::reportRunSummary(ostream& str) const str << "\t Kernel rep factor = " << run_params.getRepFactor() << endl; str << "\t Output files will be named " << ofiles << endl; +#if defined(RUN_KOKKOS) + Kokkos::Tools::declareMetadata("replication_factor",std::to_string(run_params.getRepFactor())); + Kokkos::Tools::declareMetadata("size_factor",std::to_string(run_params.getSizeFactor())); +#endif + str << "\nThe following kernels and variants (when available) will be run:\n"; str << "\nVariants" From 91f7910013e91529c902776f4ad303a072675bd0 Mon Sep 17 
00:00:00 2001 From: David Poliakoff Date: Wed, 13 Jan 2021 08:13:48 -0800 Subject: [PATCH 039/124] Added in metadata callbacks --- src/common/Executor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 6fcd7376e..acd891ad2 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -285,9 +285,11 @@ void Executor::setupSuite() auto events = Kokkos::Tools::Experimental::get_callbacks(); auto push = events.push_region; auto pop = events.pop_region; + auto metadata = events.declare_metadata; Kokkos::Tools::Experimental::pause_tools(); Kokkos::Tools::Experimental::set_push_region_callback(push); Kokkos::Tools::Experimental::set_pop_region_callback(pop); + Kokkos::Tools::Experimental::set_declare_metadata_callback(metadata); #endif } // if kernel input looks good @@ -347,6 +349,7 @@ void Executor::reportRunSummary(ostream& str) const str << "\t Output files will be named " << ofiles << endl; #if defined(RUN_KOKKOS) + std::cout << "DECLARING METADATA HERRRRRRE\n"; Kokkos::Tools::declareMetadata("replication_factor",std::to_string(run_params.getRepFactor())); Kokkos::Tools::declareMetadata("size_factor",std::to_string(run_params.getSizeFactor())); #endif From 9b4cfdc8423dc8f6f51b52cb2e381d9e8b8e693e Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 13 Jan 2021 09:10:37 -0800 Subject: [PATCH 040/124] KokkosCuda kernels; ATTN: NESTED_INIT not working --- src/basic-kokkos/MULADDSUB-KokkosCuda.cpp | 15 ++++--- src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp | 47 +++++++++++++++++---- src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp | 4 +- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp b/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp index cb30f4969..905e98a8f 100644 --- a/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp +++ b/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp @@ -63,6 +63,8 @@ void MULADDSUB::runKokkosCudaVariant(VariantID vid) MULADDSUB_DATA_SETUP; +#if 
defined RUN_KOKKOS + if ( vid == Base_CUDA ) { MULADDSUB_DATA_SETUP_CUDA; @@ -79,17 +81,19 @@ void MULADDSUB::runKokkosCudaVariant(VariantID vid) MULADDSUB_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { + } else if ( vid == Kokkos_Lambda_CUDA ) { MULADDSUB_DATA_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall< RAJA::cuda_exec >( - RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - MULADDSUB_BODY; - }); + Kokkos::parallel_for("MULTISUB-KokkosCuda Kokkos_Lambda_CUDA", + Kokkos::RangePolicy(ibegin, iend), + [=] __device__ (Index_type i) { + MULADDSUB_BODY + } +); } stopTimer(); @@ -99,6 +103,7 @@ void MULADDSUB::runKokkosCudaVariant(VariantID vid) } else { std::cout << "\n MULADDSUB : Unknown Cuda variant id = " << vid << std::endl; } +#endif //RUN_KOKKOS } } // end namespace basic diff --git a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp index b64b0b960..43e7c3746 100644 --- a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp +++ b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp @@ -45,6 +45,14 @@ void NESTED_INIT::runKokkosCudaVariant(VariantID vid) NESTED_INIT_DATA_SETUP; + auto nestedinit_lam = [=] __device__ (Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }; + + + +#if defined RUN_KOKKOS + if ( vid == Base_CUDA ) { NESTED_INIT_DATA_SETUP_CUDA; @@ -63,10 +71,10 @@ void NESTED_INIT::runKokkosCudaVariant(VariantID vid) NESTED_INIT_DATA_TEARDOWN_CUDA; - } else if ( vid == RAJA_CUDA ) { + } else if ( vid == Kokkos_Lambda_CUDA ) { NESTED_INIT_DATA_SETUP_CUDA; - +/* using EXEC_POL = RAJA::KernelPolicy< RAJA::statement::CudaKernelAsync< @@ -80,25 +88,48 @@ void NESTED_INIT::runKokkosCudaVariant(VariantID vid) > >; +*/ startTimer(); + + std::cout << "ni "<< ni << std::endl; + std::cout << "nj "<< nj << std::endl; + std::cout << "nk "<< nk << std::endl; + + + std::cout << "m_array_length " << m_array_length << std::endl; + std::cout << "m_array " 
<< std::hex << m_array << std::hex << std::endl; + + + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - [=] __device__ (Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - }); +// RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), +// RAJA::RangeSegment(0, nj), +// RAJA::RangeSegment(0, nk)), +// [=] __device__ (Index_type i, Index_type j, Index_type k) { +// NESTED_INIT_BODY; + + Kokkos::parallel_for("NESTED_INIT Kokkos_Lambda_Cuda", + Kokkos::MDRangePolicy, + Kokkos::Cuda>({0,0,0}, {ni, nj, nk}), + nestedinit_lam); } stopTimer(); + // Checks for errors + Kokkos::fence(); NESTED_INIT_DATA_TEARDOWN_CUDA; } else { std::cout << "\n NESTED_INIT : Unknown Cuda variant id = " << vid << std::endl; } + +#endif //RUN_KOKKOS } } // end namespace basic diff --git a/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp b/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp index 166ed0ef5..500623335 100644 --- a/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp +++ b/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp @@ -94,8 +94,8 @@ void NESTED_INIT::runKokkosSeqVariant(VariantID vid) // There are tuning knobs in MDRange to optimize performance // Kokkos::parallel_for("NESTED_INIT KokkosSeq", Kokkos::MDRangePolicy, Kokkos::Serial>({0,0,0}, {ni, nj, nk}), - nestedinit_lam -); + nestedinit_lam); + // RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), // RAJA::RangeSegment(0, nj), From 233072e3e377fa6c9483d2313f6721821b935424 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Wed, 13 Jan 2021 09:46:49 -0800 Subject: [PATCH 041/124] Revert to standard Kokkos nomenclature --- src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp index 43e7c3746..6b5a9fab7 100644 --- a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp 
+++ b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp @@ -45,7 +45,7 @@ void NESTED_INIT::runKokkosCudaVariant(VariantID vid) NESTED_INIT_DATA_SETUP; - auto nestedinit_lam = [=] __device__ (Index_type i, Index_type j, Index_type k) { + auto nestedinit_lam = KOKKOS_LAMBDA (Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }; From a62d462120ec0d65fd6be21aec79f19598e6e94a Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Wed, 13 Jan 2021 10:23:42 -0800 Subject: [PATCH 042/124] Fixes for NESTED_INIT --- src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp | 109 +++++++++----------- 1 file changed, 49 insertions(+), 60 deletions(-) diff --git a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp index 6b5a9fab7..74f741723 100644 --- a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp +++ b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp @@ -16,16 +16,14 @@ #include -namespace rajaperf -{ -namespace basic -{ +namespace rajaperf { +namespace basic { -#define NESTED_INIT_DATA_SETUP_CUDA \ +#define NESTED_INIT_DATA_SETUP_CUDA \ allocAndInitCudaDeviceData(array, m_array, m_array_length); -#define NESTED_INIT_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_array, array, m_array_length); \ +#define NESTED_INIT_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_array, array, m_array_length); \ deallocCudaDeviceData(array); //__global__ void nested_init(Real_ptr array, @@ -38,22 +36,20 @@ namespace basic // NESTED_INIT_BODY; //} - -void NESTED_INIT::runKokkosCudaVariant(VariantID vid) -{ +void NESTED_INIT::runKokkosCudaVariant(VariantID vid) { const Index_type run_reps = getRunReps(); NESTED_INIT_DATA_SETUP; - auto nestedinit_lam = KOKKOS_LAMBDA (Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - }; - - + auto nestedinit_lam = + KOKKOS_LAMBDA(Index_type i, Index_type j, Index_type k) { + auto ind = i + ni * (j + nj * k); + NESTED_INIT_BODY; + }; #if defined RUN_KOKKOS - if ( vid == Base_CUDA ) { + if (vid == Base_CUDA) { 
NESTED_INIT_DATA_SETUP_CUDA; @@ -63,76 +59,69 @@ void NESTED_INIT::runKokkosCudaVariant(VariantID vid) dim3 nthreads_per_block(ni, 1, 1); dim3 nblocks(1, nj, nk); - //nested_init<<>>(array, + // nested_init<<>>(array, // ni, nj); - } stopTimer(); NESTED_INIT_DATA_TEARDOWN_CUDA; - } else if ( vid == Kokkos_Lambda_CUDA ) { + } else if (vid == Kokkos_Lambda_CUDA) { NESTED_INIT_DATA_SETUP_CUDA; -/* - using EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::CudaKernelAsync< - RAJA::statement::For<2, RAJA::cuda_block_z_loop, // k - RAJA::statement::For<1, RAJA::cuda_block_y_loop, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i - RAJA::statement::Lambda<0> + /* + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::CudaKernelAsync< + RAJA::statement::For<2, RAJA::cuda_block_z_loop, // k + RAJA::statement::For<1, RAJA::cuda_block_y_loop, // j + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i + RAJA::statement::Lambda<0> + > + > > > - > - > - >; + >; -*/ + */ startTimer(); - std::cout << "ni "<< ni << std::endl; - std::cout << "nj "<< nj << std::endl; - std::cout << "nk "<< nk << std::endl; - - - std::cout << "m_array_length " << m_array_length << std::endl; - std::cout << "m_array " << std::hex << m_array << std::hex << std::endl; - - - - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -// RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), -// RAJA::RangeSegment(0, nj), -// RAJA::RangeSegment(0, nk)), -// [=] __device__ (Index_type i, Index_type j, Index_type k) { -// NESTED_INIT_BODY; - - Kokkos::parallel_for("NESTED_INIT Kokkos_Lambda_Cuda", - Kokkos::MDRangePolicy, - Kokkos::Cuda>({0,0,0}, {ni, nj, nk}), - nestedinit_lam); - + // RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, + // ni), + // RAJA::RangeSegment(0, + // nj), + // RAJA::RangeSegment(0, + // nk)), + // [=] __device__ (Index_type i, Index_type j, Index_type k) { + // NESTED_INIT_BODY; + + Kokkos::parallel_for( + "NESTED_INIT Kokkos_Lambda_Cuda", + 
Kokkos::MDRangePolicy< + Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, + Kokkos::Cuda>({0, 0, 0}, {ni, nj, nk}), + + KOKKOS_LAMBDA(Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }); } stopTimer(); - // Checks for errors - Kokkos::fence(); + // Checks for errors NESTED_INIT_DATA_TEARDOWN_CUDA; } else { - std::cout << "\n NESTED_INIT : Unknown Cuda variant id = " << vid << std::endl; + std::cout << "\n NESTED_INIT : Unknown Cuda variant id = " << vid + << std::endl; } -#endif //RUN_KOKKOS +#endif // RUN_KOKKOS } } // end namespace basic } // end namespace rajaperf -#endif // RAJA_ENABLE_CUDA +#endif // RAJA_ENABLE_CUDA From 2d35bc9868dc9cd9168c64b00cdeeedc2f854bf2 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Wed, 13 Jan 2021 10:25:08 -0800 Subject: [PATCH 043/124] Remove debug output --- src/common/Executor.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index acd891ad2..e3d198ea2 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -349,7 +349,6 @@ void Executor::reportRunSummary(ostream& str) const str << "\t Output files will be named " << ofiles << endl; #if defined(RUN_KOKKOS) - std::cout << "DECLARING METADATA HERRRRRRE\n"; Kokkos::Tools::declareMetadata("replication_factor",std::to_string(run_params.getRepFactor())); Kokkos::Tools::declareMetadata("size_factor",std::to_string(run_params.getSizeFactor())); #endif From 25d3ac4304f9e33c79f084d03f11fb8884e889e9 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 14 Jan 2021 15:06:16 -0800 Subject: [PATCH 044/124] adding date stamp to xml file --- scripts/csv_xml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/csv_xml.py b/scripts/csv_xml.py index 5d9bcd3b6..533f6fcf9 100755 --- a/scripts/csv_xml.py +++ b/scripts/csv_xml.py @@ -23,7 +23,7 @@ def read_infile(infile): def get_date(): """STUB""" - date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") + date = 
datetime.now().strftime("%-Y-%m-%dT%H:%M:%S") return date @@ -31,7 +31,7 @@ def get_date(): perf_report = ET.Element("performance-report") -name ="RAJAPerf-timing.csv" +name ="RAJAPerf" + date + ".xml" time_units="seconds" From cb4bd108feb15415665ed32aa7ed0a638acdf8e7 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Fri, 15 Jan 2021 14:38:10 -0800 Subject: [PATCH 045/124] tidy'ing up changes --- src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp index 74f741723..25f81cb55 100644 --- a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp +++ b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp @@ -41,11 +41,7 @@ void NESTED_INIT::runKokkosCudaVariant(VariantID vid) { NESTED_INIT_DATA_SETUP; - auto nestedinit_lam = - KOKKOS_LAMBDA(Index_type i, Index_type j, Index_type k) { - auto ind = i + ni * (j + nj * k); - NESTED_INIT_BODY; - }; + #if defined RUN_KOKKOS From 6cb52f2793378889d05a8c0542b5f979190b3c39 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 18 Jan 2021 08:34:32 -0800 Subject: [PATCH 046/124] csv_xml.py: lower casing variants --- scripts/csv_xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/csv_xml.py b/scripts/csv_xml.py index 533f6fcf9..7905d6456 100755 --- a/scripts/csv_xml.py +++ b/scripts/csv_xml.py @@ -110,7 +110,7 @@ def associate_timings_with_xml(xml_element, timing_dict, suite_or_test_name): timing_dict = a map of variant names to test run times """ for key, value in timing_dict.items(): - xml_element.set(key, str(value)) + xml_element.set(key.lower(), str(value)) xml_element.set("name", suite_or_test_name.strip()) From c34a43d2665d9493598ed7e30e091533fd24870d Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 2 Feb 2021 14:04:37 -0800 Subject: [PATCH 047/124] re-worked NESTED_INIT kernel in Kokkos design --- src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp | 274 
+++++++++++++++++++-- 1 file changed, 249 insertions(+), 25 deletions(-) diff --git a/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp b/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp index 500623335..3f75f0517 100644 --- a/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp +++ b/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp @@ -17,11 +17,225 @@ namespace rajaperf namespace basic { +//Kokkos Design Spirit: +//WE NEED: +//1) Use KokkosViews --> a wrapper around pointers for host and device memory +//management +//2) Use default execution space +// +// +// +// +// NEW FUNCTION WILL: +// 1) Take in a raw pointer (e.g., float*, int*, etc.) +// 2) From this pointer, return a Kokkos::View +// +// Return type : Kokkos::View +// Kokkos::View takes templated arguments +// To write "generically" implies templated arguments +// https://eli.thegreenplace.net/2014/variadic-templates-in-c/ +// +template + + +// This is a TEMPLATED STRUCT. This struct will contain the type of a pointer of n dimensions +// This struct is templated on the template that immediately precedes the struct declaration. +struct PointerOfNdimensions; + +// This template block declares a specialization, which means that you say the +// template arguments that you're NOT specializing +template + +// Here, we are specialising a template according to the type of argument that +// is passed. In this example, we've specialized the PointedAt template +// argument for the case that the number of dimensions is 0. All we will do in +// this struct is to define a type.
+ +// This struct is a specialization of : +// template +struct PointerOfNdimensions { + // "using" is a type alias + // if you dereference a pointer, you're just left with an object, the value + // of that pointer + using type = PointedAt; +}; + +// NO SPECIALIZATION, i.e., we fix no templated arguments +template + +struct PointerOfNdimensions { + // PointerOfNdimensions is a type + // My type is a pointer to the type of myself, decremented + using type = typename PointerOfNdimensions::type*; + +}; + + +template + +// FUNCTION THAT GETS A VIEW FROM A POINTER WITH RETURN TYPE KOKKOS::VIEW +// +auto getViewFromPointer(PointedAt* kokkos_ptr, Boundaries... boundaries) + // Recall: PointerOfNdimensions is a struct that exists solely to hold a + // type + // -> connotes "return type after the arrow" + -> typename Kokkos::View< + typename PointerOfNdimensions ::type, + //typename Kokkos::DefaultHostExecutionSpace::memory_space> + //This more generic expression allows moving the + //View-wrapped pointer b/w + //Host and GPU + typename Kokkos::DefaultExecutionSpace::memory_space> + + +{ + // This says construct the pointer_holder variable from arguments passed to + // the template block + // + using host_view_type = typename Kokkos::View< + typename PointerOfNdimensions ::type, + typename Kokkos::DefaultHostExecutionSpace::memory_space>; + + // FYI - Device can be GPU, OpenMPTarget, HIP (for targeting an AMD GPU), SYCL (library in Intel + // Compiler) + // + using device_view_type = typename Kokkos::View< + typename PointerOfNdimensions ::type, + typename Kokkos::DefaultExecutionSpace::memory_space>; + + + + // When copying data, we can either change the Layout or the memory_space + // (host or device), but we cannot change both! + // Here, we are mirroring data on the host to the device, i.e., Layout is + // as if on the device, but the data is actually on the host. The host + // mirror will be Layout Left (optimal for the device), but data are + // actually on the HOST!
+ + // Here, "using" is a type alias; in this example, it's our gpu Layout on cpu + using mirror_view_type = typename device_view_type::HostMirror; + + // Assignment statement; we are constructing a host_view_type with the name pointer_holder. The value of kokkos_ptr + // is the pointer we're wrapping on the Host, and the Boundaries parameter + // pack values, boundaries, will also be part of this host_view_type + // object. + + host_view_type pointer_holder (kokkos_ptr, boundaries...); + + // boundaries will contain the array dimensions; an allocation is implicitly made here + device_view_type device_data_copy ( "StringName", boundaries...); + + mirror_view_type cpu_to_gpu_mirror = Kokkos::create_mirror_view(device_data_copy); + + // We need to deep_copy our existing data, the contents of + // pointer_holder, into the mirror_view; + // Copying from Host to Device has two steps: 1) Change the layout, 2) + // change the memory_space (host or device). Step 1 is to change the + // layout to enable sending data from CPU to GPU. Step 2 is actually + // sending the optimal data layout to the GPU + + // This step changes the Layout to be optimal for the gpu + Kokkos::deep_copy(cpu_to_gpu_mirror, pointer_holder); + + + // The mirror view data layout on the HOST is like the layout for the GPU.
GPU-optimized layouts are LayoutLeft, + // i.e., column-major + // This deep_copy copies GPU-layout data on the HOST to the Device + + // Actual copying of the data from the host to the gpu + Kokkos::deep_copy(device_data_copy, cpu_to_gpu_mirror); + + + // Kokkos::View return type + + return device_data_copy; + +} + +/////////////////////////////////////////////////////////////////////////////// +//THIS FUNCTION WILL MOVE DATA IN A KOKKOS::VIEW BACK TO HOST FROM DEVICE, AND +//STORE IN AN EXISTING POINTER +/////////////////////////////////////////////////////////////////////////////// + + + +template + +// DEFINING FUNCTION THAT COPIES A VIEW'S DATA BACK INTO A HOST POINTER; RETURN TYPE IS VOID +//"my_view" parameter is equivalent to device_data_copy +// +void moveDataToHostFromKokkosView(PointedAt* kokkos_ptr, ExistingView my_view, Boundaries... boundaries) + +{ + // This says construct the pointer_holder variable from arguments passed to + // the template block + // + using host_view_type = typename Kokkos::View< + typename PointerOfNdimensions ::type, + typename Kokkos::DefaultHostExecutionSpace::memory_space>; + + // FYI - Device can be GPU, OpenMPTarget, HIP (for targeting an AMD GPU), SYCL (library in Intel + // Compiler) + // + using device_view_type = typename Kokkos::View< + typename PointerOfNdimensions ::type, + typename Kokkos::DefaultExecutionSpace::memory_space>; + + + + // When copying data, we can either change the Layout or the memory_space + // (host or device), but we cannot change both! + // Here, we are mirroring data on the host to the device, i.e., Layout is + // as if on the device, but the data is actually on the host. The host + // mirror will be Layout Left (optimal for the device), but data are + // actually on the HOST!
+ + // Here, "using" is a type alias; in this example, it's our gpu Layout on cpu + using mirror_view_type = typename device_view_type::HostMirror; + + // Assignment statement; we are constructing a host_view_type with the name pointer_holder. The value of kokkos_ptr + // is the pointer we're wrapping on the Host, and the Boundaries parameter + // pack values, boundaries, will also be part of this host_view_type + // object. + + host_view_type pointer_holder (kokkos_ptr, boundaries...); + + // Layout is optimal for gpu, but located on CPU + mirror_view_type cpu_to_gpu_mirror = Kokkos::create_mirror_view(my_view); + + + + // We need to deep_copy our existing data, the contents of + // pointer_holder, into the mirror_view; + // Copying from Host to Device has two steps: 1) Change the layout, 2) + // change the memory_space (host or device). Step 1 is to change the + // layout to enable sending data from CPU to GPU. Step 2 is actually + // sending the optimal data layout to the GPU + + // This step changes the Layout to be optimal for the gpu + + + // The mirror view data layout on the HOST is like the layout for the GPU.
GPU-optimized layouts are LayoutLeft, + // i.e., column-major + // This deep_copy copy GPU-layout data on the HOST to the Device + + // Actual copying of the data from the gpu to the cpu + Kokkos::deep_copy(cpu_to_gpu_mirror, my_view); + + //This copies from the mirror on the cpu + Kokkos::deep_copy(pointer_holder, cpu_to_gpu_mirror); + +} + + +////////////////////////////////////////////////////////////////////////////// void NESTED_INIT::runKokkosSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); + NESTED_INIT_DATA_SETUP; auto nestedinit_lam = [=](Index_type i, Index_type j, Index_type k) { @@ -71,40 +285,50 @@ void NESTED_INIT::runKokkosSeqVariant(VariantID vid) break; } - case Kokkos_Lambda_Seq : { - -/* using EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::loop_exec, // k - RAJA::statement::For<1, RAJA::loop_exec, // j - RAJA::statement::For<0, RAJA::loop_exec,// i - RAJA::statement::Lambda<0> - > - > +// Kokkos_Lambda_Seq variant + case Kokkos_Lambda_Seq : { - > - > -; -*/ + // Wrap the nested init array pointer in a Kokkos View + // In a Kokkos View, array arguments for array boundaries go from outermost + // to innermost dimension sizes + // See the basic NESTED_INIT.hpp file for definition of NESTED_INIT + + auto array_kokkos_view = getViewFromPointer(array, nk, nj, ni); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - // There are tuning knobs in MDRange to optimize performance - // - Kokkos::parallel_for("NESTED_INIT KokkosSeq", Kokkos::MDRangePolicy, Kokkos::Serial>({0,0,0}, {ni, nj, nk}), - nestedinit_lam); + // MDRange can be optimized + + Kokkos::parallel_for("NESTED_INIT KokkosSeq", + // Range policy + Kokkos::MDRangePolicy, + // Execution space + Kokkos::DefaultExecutionSpace>({0,0,0}, {nk,nj,ni}), + // Loop body + KOKKOS_LAMBDA(Index_type k, Index_type j, Index_type i) { + //NESTED_INIT_BODY no longer useful, because we're not + //operating on the array, but on the Kokkos::View + //
array_kokkos_view created to hold value for + // getViewFromPointer(array, nk, nj, ni) + // MD Views are index'ed via "()" + // + // KOKKOS-FIED translation of NESTED_INIT_BODY: + // #define NESTED_INIT_BODY + // array[i+ni*(j+nj*k)] = 0.00000001 * i * j * k ; + // + array_kokkos_view(k, j, i) = 0.00000001 * i * j * k; + } +); - -// RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), -// RAJA::RangeSegment(0, nj), -// RAJA::RangeSegment(0, nk)), -// nestedinit_lam -// ); - } stopTimer(); + // "Moves" mirror data from GPU to CPU (void, i.e., no retrun type). In + // this moving of data back to Host, the layout is changed back to Layout + // Right, vs. the LayoutLeft of the GPU + moveDataToHostFromKokkosView(array, array_kokkos_view, nk, nj, ni); break; } From 8fca574a057a3dddec4c2dcc34058da2b930f05e Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 2 Feb 2021 14:25:53 -0800 Subject: [PATCH 048/124] Sed surgery --- ..._PI-KokkosSeq.cpp => ATOMIC_PI-Kokkos.cpp} | 0 src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp | 125 ----------- src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp | 85 -------- .../ATOMIC_PI-KokkosOMPTarget.cpp | 103 --------- src/basic-kokkos/ATOMIC_PI-OMPTarget.cpp | 103 --------- src/basic-kokkos/CMakeLists.txt | 50 +---- src/basic-kokkos/CMakeOther.txt | 72 ------- .../{DAXPY-KokkosSeq.cpp => DAXPY-Kokkos.cpp} | 0 src/basic-kokkos/DAXPY-KokkosCuda.cpp | 99 --------- src/basic-kokkos/DAXPY-KokkosOMP.cpp | 76 ------- src/basic-kokkos/DAXPY-KokkosOMPTarget.cpp | 93 --------- src/basic-kokkos/DAXPY-KokkosOpenMP.cpp | 76 ------- ..._QUAD-KokkosSeq.cpp => IF_QUAD-Kokkos.cpp} | 0 src/basic-kokkos/IF_QUAD-KokkosCuda.cpp | 120 ----------- src/basic-kokkos/IF_QUAD-KokkosOMP.cpp | 85 -------- src/basic-kokkos/IF_QUAD-KokkosOMPTarget.cpp | 99 --------- .../{INIT3-KokkosSeq.cpp => INIT3-Kokkos.cpp} | 0 src/basic-kokkos/INIT3-KokkosCuda.cpp | 118 ----------- src/basic-kokkos/INIT3-KokkosOMP.cpp | 103 --------- src/basic-kokkos/INIT3-KokkosOMPTarget.cpp | 101 
--------- ...D-KokkosSeq.cpp => INIT_VIEW1D-Kokkos.cpp} | 0 src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp | 108 ---------- src/basic-kokkos/INIT_VIEW1D-KokkosOMP.cpp | 99 --------- .../INIT_VIEW1D-KokkosOMPTarget.cpp | 93 --------- ...sSeq.cpp => INIT_VIEW1D_OFFSET-Kokkos.cpp} | 0 .../INIT_VIEW1D_OFFSET-KokkosCuda.cpp | 107 ---------- .../INIT_VIEW1D_OFFSET-KokkosOMP.cpp | 99 --------- .../INIT_VIEW1D_OFFSET-KokkosOMPTarget.cpp | 94 --------- ...SUB-KokkosSeq.cpp => MULADDSUB-Kokkos.cpp} | 0 src/basic-kokkos/MULADDSUB-KokkosCuda.cpp | 112 ---------- src/basic-kokkos/MULADDSUB-KokkosOMP.cpp | 93 --------- .../MULADDSUB-KokkosOMPTarget.cpp | 101 --------- ...T-KokkosSeq.cpp => NESTED_INIT-Kokkos.cpp} | 0 src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp | 123 ----------- src/basic-kokkos/NESTED_INIT-KokkosOMP.cpp | 134 ------------ .../NESTED_INIT-KokkosOMPTarget.cpp | 98 --------- ...T-KokkosSeq.cpp => REDUCE3_INT-Kokkos.cpp} | 0 src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp | 196 ------------------ src/basic-kokkos/REDUCE3_INT-KokkosOMP.cpp | 126 ----------- .../REDUCE3_INT-KokkosOMPTarget.cpp | 110 ---------- ..._INT-KokkosSeq.cpp => TRAP_INT-Kokkos.cpp} | 0 src/basic-kokkos/TRAP_INT-KokkosCuda.cpp | 174 ---------------- src/basic-kokkos/TRAP_INT-KokkosOMP.cpp | 122 ----------- src/basic-kokkos/TRAP_INT-KokkosOMPTarget.cpp | 111 ---------- 44 files changed, 10 insertions(+), 3598 deletions(-) rename src/basic-kokkos/{ATOMIC_PI-KokkosSeq.cpp => ATOMIC_PI-Kokkos.cpp} (100%) delete mode 100644 src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp delete mode 100644 src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp delete mode 100644 src/basic-kokkos/ATOMIC_PI-KokkosOMPTarget.cpp delete mode 100644 src/basic-kokkos/ATOMIC_PI-OMPTarget.cpp delete mode 100644 src/basic-kokkos/CMakeOther.txt rename src/basic-kokkos/{DAXPY-KokkosSeq.cpp => DAXPY-Kokkos.cpp} (100%) delete mode 100644 src/basic-kokkos/DAXPY-KokkosCuda.cpp delete mode 100644 src/basic-kokkos/DAXPY-KokkosOMP.cpp delete mode 100644 
src/basic-kokkos/DAXPY-KokkosOMPTarget.cpp delete mode 100644 src/basic-kokkos/DAXPY-KokkosOpenMP.cpp rename src/basic-kokkos/{IF_QUAD-KokkosSeq.cpp => IF_QUAD-Kokkos.cpp} (100%) delete mode 100644 src/basic-kokkos/IF_QUAD-KokkosCuda.cpp delete mode 100644 src/basic-kokkos/IF_QUAD-KokkosOMP.cpp delete mode 100644 src/basic-kokkos/IF_QUAD-KokkosOMPTarget.cpp rename src/basic-kokkos/{INIT3-KokkosSeq.cpp => INIT3-Kokkos.cpp} (100%) delete mode 100644 src/basic-kokkos/INIT3-KokkosCuda.cpp delete mode 100644 src/basic-kokkos/INIT3-KokkosOMP.cpp delete mode 100644 src/basic-kokkos/INIT3-KokkosOMPTarget.cpp rename src/basic-kokkos/{INIT_VIEW1D-KokkosSeq.cpp => INIT_VIEW1D-Kokkos.cpp} (100%) delete mode 100644 src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp delete mode 100644 src/basic-kokkos/INIT_VIEW1D-KokkosOMP.cpp delete mode 100644 src/basic-kokkos/INIT_VIEW1D-KokkosOMPTarget.cpp rename src/basic-kokkos/{INIT_VIEW1D_OFFSET-KokkosSeq.cpp => INIT_VIEW1D_OFFSET-Kokkos.cpp} (100%) delete mode 100644 src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp delete mode 100644 src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMP.cpp delete mode 100644 src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMPTarget.cpp rename src/basic-kokkos/{MULADDSUB-KokkosSeq.cpp => MULADDSUB-Kokkos.cpp} (100%) delete mode 100644 src/basic-kokkos/MULADDSUB-KokkosCuda.cpp delete mode 100644 src/basic-kokkos/MULADDSUB-KokkosOMP.cpp delete mode 100644 src/basic-kokkos/MULADDSUB-KokkosOMPTarget.cpp rename src/basic-kokkos/{NESTED_INIT-KokkosSeq.cpp => NESTED_INIT-Kokkos.cpp} (100%) delete mode 100644 src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp delete mode 100644 src/basic-kokkos/NESTED_INIT-KokkosOMP.cpp delete mode 100644 src/basic-kokkos/NESTED_INIT-KokkosOMPTarget.cpp rename src/basic-kokkos/{REDUCE3_INT-KokkosSeq.cpp => REDUCE3_INT-Kokkos.cpp} (100%) delete mode 100644 src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp delete mode 100644 src/basic-kokkos/REDUCE3_INT-KokkosOMP.cpp delete mode 100644 
src/basic-kokkos/REDUCE3_INT-KokkosOMPTarget.cpp rename src/basic-kokkos/{TRAP_INT-KokkosSeq.cpp => TRAP_INT-Kokkos.cpp} (100%) delete mode 100644 src/basic-kokkos/TRAP_INT-KokkosCuda.cpp delete mode 100644 src/basic-kokkos/TRAP_INT-KokkosOMP.cpp delete mode 100644 src/basic-kokkos/TRAP_INT-KokkosOMPTarget.cpp diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp b/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp similarity index 100% rename from src/basic-kokkos/ATOMIC_PI-KokkosSeq.cpp rename to src/basic-kokkos/ATOMIC_PI-Kokkos.cpp diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp deleted file mode 100644 index 4849389c2..000000000 --- a/src/basic-kokkos/ATOMIC_PI-KokkosCuda.cpp +++ /dev/null @@ -1,125 +0,0 @@ -////~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "ATOMIC_PI.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_CUDA) - -#include "common/CudaDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for CUDA execution - // - - -#define ATOMIC_PI_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(pi, m_pi, 1); - -#define ATOMIC_PI_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(pi); - -// AJP COMMENTED THIS DEF OUT; IT IS THE DEF OF A RAJA KERNEL -/*__global__ void atomic_pi(Real_ptr pi, - Real_type dx, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - } -} - -*/ -// AJP Kokkos-ifying here: - -void ATOMIC_PI::runKokkosCudaVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - ATOMIC_PI_DATA_SETUP; - -#if defined(RUN_KOKKOS) - - if ( vid == Base_CUDA ) { - -#if defined (RUN_CUDA) - - ATOMIC_PI_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initCudaDeviceData(pi, &m_pi_init, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - atomic_pi<<>>( pi, dx, iend ); - - getCudaDeviceData(m_pi, pi, 1); - *m_pi *= 4.0; - - } - stopTimer(); - - ATOMIC_PI_DATA_TEARDOWN_CUDA; - -#endif //RUN_CUDA - - } else if ( vid == Kokkos_Lambda_CUDA ) { - - ATOMIC_PI_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initCudaDeviceData(pi, &m_pi_init, 1); - -// RAJA::forall< RAJA::cuda_exec >( -/* RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - }); -*/ - - Kokkos::parallel_for("ATOMIC_PI-KokkosCuda Kokkkos_Lambda", 
Kokkos::RangePolicy(ibegin, iend), - // Here, function executes on the device / GPU, and copies by VALUE - // the "[=] __device__" indicates "KOKKOS_LAMBDA"; - // KOKKOS_LAMBDA = #define KOKKOS_LAMBDA[=]__device__ - [=] __device__ (Index_type i) { - double x = (double(i) + 0.5) * dx; - Kokkos::atomic_add(pi, dx / (1.0 + x * x)); -}); - getCudaDeviceData(m_pi, pi, 1); - *m_pi *= 4.0; - - }; - stopTimer(); - - ATOMIC_PI_DATA_TEARDOWN_CUDA; - - } else { - std::cout << "\n ATOMIC_PI : Unknown Cuda variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RUN_KOKKOS -#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp deleted file mode 100644 index d6cd9a1e6..000000000 --- a/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp +++ /dev/null @@ -1,85 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "ATOMIC_PI.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ -struct AtomicPIFunctor { - Real_type dx; - Real_ptr pi; - - AtomicPIFunctor(Real_type m_dx, Real_ptr m_pi) : ATOMIC_PI_FUNCTOR_CONSTRUCT {} -}; - - -void ATOMIC_PI::runKokkosOpenMPVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - ATOMIC_PI_DATA_SETUP; - -#if defined(RUN_KOKKOS) && defined(RUN_OPENMP) - switch ( vid ) { - - case Kokkos_Functor_OpenMP : { - - startTimer(); - //for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - // *pi = m_pi_init; - // RAJA::forall( - // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - // double x = (double(i) + 0.5) * dx; - // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - // }); - // *pi *= 4.0; - - //} - stopTimer(); - - break; - } - case Kokkos_Lambda_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - *pi = m_pi_init; - - Kokkos::parallel_for("name",Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(Index_type i){ - double x = ((double(i) + 0.5) * dx); - Kokkos::atomic_add(pi, dx / (1.0 + x * x)); - }); - *pi *= 4.0; - } - stopTimer(); - - break; - } - - - default : { - std::cout << "\n ATOMIC_PI : Unknown variant id = " << vid << std::endl; - } - - } - -#endif -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosOMPTarget.cpp b/src/basic-kokkos/ATOMIC_PI-KokkosOMPTarget.cpp deleted file mode 100644 index 934415cd9..000000000 --- a/src/basic-kokkos/ATOMIC_PI-KokkosOMPTarget.cpp +++ /dev/null @@ -1,103 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project 
contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "ATOMIC_PI.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - -#include "common/OpenMPTargetDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define threads per team for target execution - // - const size_t threads_per_team = 256; - -#define ATOMIC_PI_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(pi, m_pi, 1, did, hid); - -#define ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET \ - deallocOpenMPDeviceData(pi, did); - - -void ATOMIC_PI::runKokkosOpenMPTargetVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - ATOMIC_PI_DATA_SETUP; - - if ( vid == Base_OpenMPTarget ) { - - ATOMIC_PI_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); - - #pragma omp target is_device_ptr(pi) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) - for (Index_type i = ibegin; i < iend; ++i ) { - double x = (double(i) + 0.5) * dx; - #pragma omp atomic - *pi += dx / (1.0 + x * x); - } - - getOpenMPDeviceData(m_pi, pi, 1, hid, did); - *m_pi *= 4.0; - - } - stopTimer(); - - ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET; - - } else if ( vid == RAJA_OpenMPTarget ) { - - ATOMIC_PI_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - }); - - getOpenMPDeviceData(m_pi, pi, 
1, hid, did); - *m_pi *= 4.0; - - } - stopTimer(); - - ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n ATOMIC_PI : Unknown OMP Target variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/ATOMIC_PI-OMPTarget.cpp b/src/basic-kokkos/ATOMIC_PI-OMPTarget.cpp deleted file mode 100644 index 578b5ed99..000000000 --- a/src/basic-kokkos/ATOMIC_PI-OMPTarget.cpp +++ /dev/null @@ -1,103 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "ATOMIC_PI.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - -#include "common/OpenMPTargetDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define threads per team for target execution - // - const size_t threads_per_team = 256; - -#define ATOMIC_PI_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(pi, m_pi, 1, did, hid); - -#define ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET \ - deallocOpenMPDeviceData(pi, did); - - -void ATOMIC_PI::runOpenMPTargetVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - ATOMIC_PI_DATA_SETUP; - - if ( vid == Base_OpenMPTarget ) { - - ATOMIC_PI_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); - - #pragma omp target is_device_ptr(pi) device( did ) - #pragma omp teams distribute parallel for 
thread_limit(threads_per_team) schedule(static, 1) - for (Index_type i = ibegin; i < iend; ++i ) { - double x = (double(i) + 0.5) * dx; - #pragma omp atomic - *pi += dx / (1.0 + x * x); - } - - getOpenMPDeviceData(m_pi, pi, 1, hid, did); - *m_pi *= 4.0; - - } - stopTimer(); - - ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET; - - } else if ( vid == RAJA_OpenMPTarget ) { - - ATOMIC_PI_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - }); - - getOpenMPDeviceData(m_pi, pi, 1, hid, did); - *m_pi *= 4.0; - - } - stopTimer(); - - ATOMIC_PI_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n ATOMIC_PI : Unknown OMP Target variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index f559af456..7f62c271f 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -11,46 +11,16 @@ include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../basic) blt_add_library( NAME basic-kokkos SOURCES - ATOMIC_PI-KokkosSeq.cpp - ATOMIC_PI-KokkosCuda.cpp - ATOMIC_PI-KokkosOMP.cpp - ATOMIC_PI-KokkosOMPTarget.cpp - DAXPY-KokkosSeq.cpp - DAXPY-KokkosCuda.cpp - DAXPY-KokkosOMP.cpp - DAXPY-KokkosOMPTarget.cpp - IF_QUAD-KokkosSeq.cpp - IF_QUAD-KokkosCuda.cpp - IF_QUAD-KokkosOMP.cpp - IF_QUAD-KokkosOMPTarget.cpp - INIT3-KokkosSeq.cpp - INIT3-KokkosCuda.cpp - INIT3-KokkosOMP.cpp - INIT3-KokkosOMPTarget.cpp - INIT_VIEW1D-KokkosSeq.cpp - INIT_VIEW1D-KokkosCuda.cpp - INIT_VIEW1D-KokkosOMP.cpp - INIT_VIEW1D-KokkosOMPTarget.cpp - INIT_VIEW1D_OFFSET-KokkosSeq.cpp - INIT_VIEW1D_OFFSET-KokkosCuda.cpp - INIT_VIEW1D_OFFSET-KokkosOMP.cpp - 
INIT_VIEW1D_OFFSET-KokkosOMPTarget.cpp - MULADDSUB-KokkosSeq.cpp - MULADDSUB-KokkosCuda.cpp - MULADDSUB-KokkosOMP.cpp - MULADDSUB-KokkosOMPTarget.cpp - NESTED_INIT-KokkosSeq.cpp - NESTED_INIT-KokkosCuda.cpp - NESTED_INIT-KokkosOMP.cpp - NESTED_INIT-KokkosOMPTarget.cpp - REDUCE3_INT-KokkosSeq.cpp - REDUCE3_INT-KokkosCuda.cpp - REDUCE3_INT-KokkosOMP.cpp - REDUCE3_INT-KokkosOMPTarget.cpp - TRAP_INT-KokkosSeq.cpp - TRAP_INT-KokkosCuda.cpp - TRAP_INT-KokkosOMPTarget.cpp - TRAP_INT-KokkosOMP.cpp + ATOMIC_PI-Kokkos.cpp + DAXPY-Kokkos.cpp + IF_QUAD-Kokkos.cpp + INIT3-Kokkos.cpp + INIT_VIEW1D-Kokkos.cpp + INIT_VIEW1D_OFFSET-Kokkos.cpp + MULADDSUB-Kokkos.cpp + NESTED_INIT-Kokkos.cpp + REDUCE3_INT-Kokkos.cpp + TRAP_INT-Kokkos.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/basic-kokkos/CMakeOther.txt b/src/basic-kokkos/CMakeOther.txt deleted file mode 100644 index b4b886a91..000000000 --- a/src/basic-kokkos/CMakeOther.txt +++ /dev/null @@ -1,72 +0,0 @@ -############################################################################### -# Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -# and RAJA Performance Suite project contributors. -# See the RAJAPerf/COPYRIGHT file for details. 
-# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################### - -blt_add_library( - NAME basic - SOURCES ATOMIC_PI.cpp - ATOMIC_PI-Seq.cpp - ATOMIC_PI-Hip.cpp - ATOMIC_PI-Cuda.cpp - ATOMIC_PI-OMP.cpp - ATOMIC_PI-OMPTarget.cpp - DAXPY.cpp - DAXPY-Seq.cpp - DAXPY-Hip.cpp - DAXPY-Cuda.cpp - DAXPY-OMP.cpp - DAXPY-OMPTarget.cpp - IF_QUAD.cpp - IF_QUAD-Seq.cpp - IF_QUAD-Hip.cpp - IF_QUAD-Cuda.cpp - IF_QUAD-OMP.cpp - IF_QUAD-OMPTarget.cpp - INIT3.cpp - INIT3-Seq.cpp - INIT3-Hip.cpp - INIT3-Cuda.cpp - INIT3-OMP.cpp - INIT3-OMPTarget.cpp - INIT_VIEW1D.cpp - INIT_VIEW1D-Seq.cpp - INIT_VIEW1D-Hip.cpp - INIT_VIEW1D-Cuda.cpp - INIT_VIEW1D-OMP.cpp - INIT_VIEW1D-OMPTarget.cpp - INIT_VIEW1D_OFFSET.cpp - INIT_VIEW1D_OFFSET-Seq.cpp - INIT_VIEW1D_OFFSET-Hip.cpp - INIT_VIEW1D_OFFSET-Cuda.cpp - INIT_VIEW1D_OFFSET-OMP.cpp - INIT_VIEW1D_OFFSET-OMPTarget.cpp - MULADDSUB.cpp - MULADDSUB-Seq.cpp - MULADDSUB-Hip.cpp - MULADDSUB-Cuda.cpp - MULADDSUB-OMP.cpp - MULADDSUB-OMPTarget.cpp - NESTED_INIT.cpp - NESTED_INIT-Seq.cpp - NESTED_INIT-Hip.cpp - NESTED_INIT-Cuda.cpp - NESTED_INIT-OMP.cpp - NESTED_INIT-OMPTarget.cpp - REDUCE3_INT.cpp - REDUCE3_INT-Seq.cpp - REDUCE3_INT-Hip.cpp - REDUCE3_INT-Cuda.cpp - REDUCE3_INT-OMP.cpp - REDUCE3_INT-OMPTarget.cpp - TRAP_INT.cpp - TRAP_INT-Seq.cpp - TRAP_INT-Hip.cpp - TRAP_INT-Cuda.cpp - TRAP_INT-OMPTarget.cpp - TRAP_INT-OMP.cpp - DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} - ) diff --git a/src/basic-kokkos/DAXPY-KokkosSeq.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp similarity index 100% rename from src/basic-kokkos/DAXPY-KokkosSeq.cpp rename to src/basic-kokkos/DAXPY-Kokkos.cpp diff --git a/src/basic-kokkos/DAXPY-KokkosCuda.cpp b/src/basic-kokkos/DAXPY-KokkosCuda.cpp deleted file mode 100644 index bc24d7896..000000000 --- a/src/basic-kokkos/DAXPY-KokkosCuda.cpp +++ /dev/null @@ -1,99 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, 
Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "DAXPY.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_CUDA) - -#include "common/CudaDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - -struct DaxpyCudaFunctor { - Real_ptr x; - Real_ptr y; - Real_type a; - DaxpyCudaFunctor(Real_ptr m_x, Real_ptr m_y, Real_type m_a) : DAXPY_FUNCTOR_CONSTRUCT { } - KOKKOS_FUNCTION void operator()(Index_type i) const { DAXPY_BODY; } -}; - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define DAXPY_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(x, m_x, iend); \ - allocAndInitCudaDeviceData(y, m_y, iend); - -#define DAXPY_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_y, y, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); - - -void DAXPY::runKokkosCudaVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - DAXPY_DATA_SETUP; - -#if defined(RUN_KOKKOS) - - if ( vid == Kokkos_Functor_CUDA) { - DAXPY_DATA_SETUP_CUDA; - DaxpyCudaFunctor daxpy_functor_instance(y,x,a); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Kokkos::parallel_for("perfsuite.kokkos.cuda.functor", Kokkos::RangePolicy(ibegin, iend), - daxpy_functor_instance); - - } - stopTimer(); - - DAXPY_DATA_TEARDOWN_CUDA; - - } else if ( vid == Kokkos_Lambda_CUDA ) { - - DAXPY_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Kokkos::parallel_for("DAXPY-KokkosCuda Kokkos_Lambda", - Kokkos::RangePolicy(ibegin, iend), [=] __device__ (Index_type i) { - DAXPY_BODY; - }); - - } - stopTimer(); - - DAXPY_DATA_TEARDOWN_CUDA; - - } else { - std::cout 
<< "\n DAXPY : Unknown Cuda variant id = " << vid << std::endl; - } -#endif // RUN_KOKKOS -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/DAXPY-KokkosOMP.cpp b/src/basic-kokkos/DAXPY-KokkosOMP.cpp deleted file mode 100644 index 2b59c8012..000000000 --- a/src/basic-kokkos/DAXPY-KokkosOMP.cpp +++ /dev/null @@ -1,76 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "DAXPY.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - -struct DaxpyFunctor { - Real_ptr x; - Real_ptr y; - Real_type a; - DaxpyFunctor(Real_ptr m_x, Real_ptr m_y, Real_type m_a) : DAXPY_FUNCTOR_CONSTRUCT { } - void operator()(Index_type i) const { DAXPY_BODY; } -}; - -void DAXPY::runKokkosOpenMPVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - DAXPY_DATA_SETUP; - - auto daxpy_lam = [=](Index_type i) { - DAXPY_BODY; - }; - - switch ( vid ) { - -#if defined(RUN_KOKKOS) -#if defined(RUN_OPENMP) - case Kokkos_Lambda_OpenMP: { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("perfsuite.kokkos.openmp.lambda", Kokkos::RangePolicy(ibegin, iend), - [=](Index_type i) { DAXPY_BODY; }); - } - stopTimer(); - - break; - } - case Kokkos_Functor_OpenMP: { - DaxpyFunctor daxpy_functor_instance(y,x,a); - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("perfsuite.kokkos.openmp.lambda", Kokkos::RangePolicy(ibegin, iend), - daxpy_functor_instance); - } - stopTimer(); - - break; - } 
-#endif // RUN_KOKKOS -#endif // RUN_RAJA_SEQ - default : { - std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; - } - - } - -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/DAXPY-KokkosOMPTarget.cpp b/src/basic-kokkos/DAXPY-KokkosOMPTarget.cpp deleted file mode 100644 index 98783a19f..000000000 --- a/src/basic-kokkos/DAXPY-KokkosOMPTarget.cpp +++ /dev/null @@ -1,93 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "DAXPY.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - -#include "common/OpenMPTargetDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define threads per team for target execution - // - const size_t threads_per_team = 256; - -#define DAXPY_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ - allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); - -#define DAXPY_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_y, y, iend, hid, did); \ - deallocOpenMPDeviceData(x, did); \ - deallocOpenMPDeviceData(y, did); - - -void DAXPY::runKokkosOpenMPTargetVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - DAXPY_DATA_SETUP; - - if ( vid == Base_OpenMPTarget ) { - - DAXPY_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp target is_device_ptr(x, y) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) 
schedule(static, 1) - for (Index_type i = ibegin; i < iend; ++i ) { - DAXPY_BODY; - } - - } - stopTimer(); - - DAXPY_DATA_TEARDOWN_OMP_TARGET; - - } else if ( vid == RAJA_OpenMPTarget ) { - - DAXPY_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - DAXPY_BODY; - }); - - } - stopTimer(); - - DAXPY_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n DAXPY : Unknown OMP Target variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/DAXPY-KokkosOpenMP.cpp b/src/basic-kokkos/DAXPY-KokkosOpenMP.cpp deleted file mode 100644 index 2b59c8012..000000000 --- a/src/basic-kokkos/DAXPY-KokkosOpenMP.cpp +++ /dev/null @@ -1,76 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "DAXPY.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - -struct DaxpyFunctor { - Real_ptr x; - Real_ptr y; - Real_type a; - DaxpyFunctor(Real_ptr m_x, Real_ptr m_y, Real_type m_a) : DAXPY_FUNCTOR_CONSTRUCT { } - void operator()(Index_type i) const { DAXPY_BODY; } -}; - -void DAXPY::runKokkosOpenMPVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - DAXPY_DATA_SETUP; - - auto daxpy_lam = [=](Index_type i) { - DAXPY_BODY; - }; - - switch ( vid ) { - -#if defined(RUN_KOKKOS) -#if defined(RUN_OPENMP) - case Kokkos_Lambda_OpenMP: { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("perfsuite.kokkos.openmp.lambda", Kokkos::RangePolicy(ibegin, iend), - [=](Index_type i) { DAXPY_BODY; }); - } - stopTimer(); - - break; - } - case Kokkos_Functor_OpenMP: { - DaxpyFunctor daxpy_functor_instance(y,x,a); - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("perfsuite.kokkos.openmp.lambda", Kokkos::RangePolicy(ibegin, iend), - daxpy_functor_instance); - } - stopTimer(); - - break; - } -#endif // RUN_KOKKOS -#endif // RUN_RAJA_SEQ - default : { - std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; - } - - } - -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/IF_QUAD-KokkosSeq.cpp b/src/basic-kokkos/IF_QUAD-Kokkos.cpp similarity index 100% rename from src/basic-kokkos/IF_QUAD-KokkosSeq.cpp rename to src/basic-kokkos/IF_QUAD-Kokkos.cpp diff --git a/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp b/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp deleted file mode 100644 index 69af69984..000000000 --- a/src/basic-kokkos/IF_QUAD-KokkosCuda.cpp +++ /dev/null @@ -1,120 +0,0 @@ 
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "IF_QUAD.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_CUDA) - -#include "common/CudaDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define IF_QUAD_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(a, m_a, iend); \ - allocAndInitCudaDeviceData(b, m_b, iend); \ - allocAndInitCudaDeviceData(c, m_c, iend); \ - allocAndInitCudaDeviceData(x1, m_x1, iend); \ - allocAndInitCudaDeviceData(x2, m_x2, iend); - -#define IF_QUAD_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x1, x1, iend); \ - getCudaDeviceData(m_x2, x2, iend); \ - deallocCudaDeviceData(a); \ - deallocCudaDeviceData(b); \ - deallocCudaDeviceData(c); \ - deallocCudaDeviceData(x1); \ - deallocCudaDeviceData(x2); - -// AJP started Kokkos-ifying here -void IF_QUAD::runKokkosCudaVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - IF_QUAD_DATA_SETUP; - -#if defined(RUN_KOKKOS) - - if ( vid == Base_CUDA ) { - -#if defined(RUN_CUDA) - - IF_QUAD_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - // QUESTION: Should "RAJA_DIVIDE_CEILING_INT be changed? 
- const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - //ifquad<<>>( x1, x2, a, b, c, - // iend ); - - } - stopTimer(); - - IF_QUAD_DATA_TEARDOWN_CUDA; - -#endif // RUN_CUDA - - } else if ( vid == Kokkos_Lambda_CUDA ) { -// } else if ( vid == RAJA_CUDA ) { - - IF_QUAD_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - -// RAJA::forall< RAJA::cuda_exec >( -// RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { -// IF_QUAD_BODY; -// }); - - - Kokkos::parallel_for("IF_QUAD-KokkosCuda Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - // Here, the function executes on the device / GPU - [=] __device__ (Index_type i) {IF_QUAD_BODY}); - //KOKKOS_LAMBDA (Index_type i) {IF_QUAD_BODY}); - - -// >( -// RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { -// IF_QUAD_BODY; -// }); - - - } - stopTimer(); - - IF_QUAD_DATA_TEARDOWN_CUDA; - - } else { - std::cout << "\n IF_QUAD : Unknown Cuda variant id = " << vid << std::endl; - } - - -#endif // RUN_KOKKOS -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/IF_QUAD-KokkosOMP.cpp b/src/basic-kokkos/IF_QUAD-KokkosOMP.cpp deleted file mode 100644 index c2e3bb006..000000000 --- a/src/basic-kokkos/IF_QUAD-KokkosOMP.cpp +++ /dev/null @@ -1,85 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "IF_QUAD.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -// Refers to both Kokkos and Raja namespaces; we are defining methods on a class in the -namespace basic -{ - - -// Kokkos-ify here -//void IF_QUAD::runSeqVariant(VariantID vid) - -void IF_QUAD::runKokkosOpenMPVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - IF_QUAD_DATA_SETUP; - - auto ifquad_lam = [=](Index_type i) { - IF_QUAD_BODY; - }; - - -#if defined(RUN_KOKKOS) - - switch ( vid ) { - - // AJP added (following DAXPY example) -- - -//#if defined(RUN_KOKKOS) -//#if defined(RUN_OPENMP) - - -#if defined(RUN_OPENMP) - -//#if defined(RUN_RAJA_SEQ) - - case Kokkos_Lambda_OpenMP: { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - -/* RAJA::forall( - RAJA::RangeSegment(ibegin, iend), ifquad_lam); -*/ - // Translation - Kokkos::parallel_for("Quad", Kokkos::RangePolicy(ibegin, iend), - [=] (Index_type i) {IF_QUAD_BODY}); - - } - stopTimer(); - - break; - } -#endif // RUN_OPENMP - - default : { - std::cout << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; - } - - } - -#endif // RUN_KOKKOS - - - - -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/IF_QUAD-KokkosOMPTarget.cpp b/src/basic-kokkos/IF_QUAD-KokkosOMPTarget.cpp deleted file mode 100644 index 8a93dcd28..000000000 --- a/src/basic-kokkos/IF_QUAD-KokkosOMPTarget.cpp +++ /dev/null @@ -1,99 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "IF_QUAD.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - -#include "common/OpenMPTargetDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define threads per team for target execution - // - const size_t threads_per_team = 256; - -#define IF_QUAD_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ - allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); \ - allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); \ - allocAndInitOpenMPDeviceData(x1, m_x1, iend, did, hid); \ - allocAndInitOpenMPDeviceData(x2, m_x2, iend, did, hid); - -#define IF_QUAD_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_x1, x1, iend, hid, did); \ - getOpenMPDeviceData(m_x2, x2, iend, hid, did); \ - deallocOpenMPDeviceData(a, did); \ - deallocOpenMPDeviceData(b, did); \ - deallocOpenMPDeviceData(c, did); \ - deallocOpenMPDeviceData(x1, did); \ - deallocOpenMPDeviceData(x2, did); - -void IF_QUAD::runKokkosOpenMPTargetVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - IF_QUAD_DATA_SETUP; - - if ( vid == Base_OpenMPTarget ) { - - IF_QUAD_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp target is_device_ptr(a, b, c, x1, x2) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) - for (Index_type i = ibegin; i < iend; ++i ) { - IF_QUAD_BODY; - } - - } - stopTimer(); - - IF_QUAD_DATA_TEARDOWN_OMP_TARGET; - - } else if ( vid == RAJA_OpenMPTarget ) { - - IF_QUAD_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall>( - 
RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - IF_QUAD_BODY; - }); - - } - stopTimer(); - - IF_QUAD_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n IF_QUAD : Unknown OMP Target variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/INIT3-KokkosSeq.cpp b/src/basic-kokkos/INIT3-Kokkos.cpp similarity index 100% rename from src/basic-kokkos/INIT3-KokkosSeq.cpp rename to src/basic-kokkos/INIT3-Kokkos.cpp diff --git a/src/basic-kokkos/INIT3-KokkosCuda.cpp b/src/basic-kokkos/INIT3-KokkosCuda.cpp deleted file mode 100644 index 497e84d48..000000000 --- a/src/basic-kokkos/INIT3-KokkosCuda.cpp +++ /dev/null @@ -1,118 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT3.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_CUDA) - -#include "common/CudaDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define INIT3_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(out1, m_out1, iend); \ - allocAndInitCudaDeviceData(out2, m_out2, iend); \ - allocAndInitCudaDeviceData(out3, m_out3, iend); \ - allocAndInitCudaDeviceData(in1, m_in1, iend); \ - allocAndInitCudaDeviceData(in2, m_in2, iend); - -#define INIT3_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_out1, out1, iend); \ - getCudaDeviceData(m_out2, out2, iend); \ - getCudaDeviceData(m_out3, out3, iend); \ - deallocCudaDeviceData(out1); \ - deallocCudaDeviceData(out2); \ - deallocCudaDeviceData(out3); \ - deallocCudaDeviceData(in1); \ - deallocCudaDeviceData(in2); - -//__global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, -// Real_ptr in1, Real_ptr in2, -// Index_type iend) -//{ -// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i < iend) { -// INIT3_BODY; -// } -//} - - -void INIT3::runKokkosCudaVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - INIT3_DATA_SETUP; - -#if defined(RUN_KOKKOS) - - if ( vid == Base_CUDA ) { - -//#error WHATS UP - INIT3_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - //init3<<>>( out1, out2, out3, in1, in2, - // iend ); - - } - stopTimer(); - - INIT3_DATA_TEARDOWN_CUDA; - -// AJP modified lines below - } else if ( vid == Kokkos_Lambda_CUDA ) { - - INIT3_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - -// 
RAJA::forall< RAJA::cuda_exec >( -// RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { -// INIT3_BODY; -// }); - - Kokkos::parallel_for("INIT3-KokkosCuda Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - //Here, the function executes on the device / GPU - [=] __device__ (Index_type i) {INIT3_BODY}); - - } - stopTimer(); - - INIT3_DATA_TEARDOWN_CUDA; - - - } else { - std::cout << "\n INIT3 : Unknown Cuda variant id = " << vid << std::endl; - } - -#endif //RUN_KOKKOS -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/INIT3-KokkosOMP.cpp b/src/basic-kokkos/INIT3-KokkosOMP.cpp deleted file mode 100644 index 2392b4ffe..000000000 --- a/src/basic-kokkos/INIT3-KokkosOMP.cpp +++ /dev/null @@ -1,103 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT3.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -// Refers to both Kokkos and Raja namespaces; we are defining methods on a class in .. -// DAVID - help completing this comment! 
-// -namespace basic -{ - -void INIT3::runKokkosOpenMPVariant(VariantID vid) -{ -//#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - INIT3_DATA_SETUP; - - auto init3_lam = [=](Index_type i) { - INIT3_BODY; - }; - -#if defined(RUN_KOKKOS) - - switch ( vid ) { - -#if defined(RUN_OPENMP) - -// case Base_OpenMP : { -// -// startTimer(); -// for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -// -// #pragma omp parallel for -// for (Index_type i = ibegin; i < iend; ++i ) { -// INIT3_BODY; -// } -// -// } -// stopTimer(); -// -// break; -// } -// -// case Lambda_OpenMP : { -// -// startTimer(); -// for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -// -// #pragma omp parallel for -// for (Index_type i = ibegin; i < iend; ++i ) { -// init3_lam(i); -// } -// -// } -// stopTimer(); -// -// break; -// } - - - - case Kokkos_Lambda_OpenMP: { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -/* - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), init3_lam); -*/ - Kokkos::parallel_for("Init3_OMP", Kokkos::RangePolicy(ibegin, iend), - [=] (Index_type i) {INIT3_BODY}); - } - stopTimer(); - - break; - } -#endif // RUN_OPENMP - - default : { - std::cout << "\n INIT3 : Unknown variant id = " << vid << std::endl; - } - } - -#endif // RUN_KOKKOS -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/INIT3-KokkosOMPTarget.cpp b/src/basic-kokkos/INIT3-KokkosOMPTarget.cpp deleted file mode 100644 index c81db46de..000000000 --- a/src/basic-kokkos/INIT3-KokkosOMPTarget.cpp +++ /dev/null @@ -1,101 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT3.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - -#include "common/OpenMPTargetDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define threads per team for target execution - // - const size_t threads_per_team = 256; - -#define INIT3_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(out1, m_out1, iend, did, hid); \ - allocAndInitOpenMPDeviceData(out2, m_out2, iend, did, hid); \ - allocAndInitOpenMPDeviceData(out3, m_out3, iend, did, hid); \ - allocAndInitOpenMPDeviceData(in1, m_in1, iend, did, hid); \ - allocAndInitOpenMPDeviceData(in2, m_in2, iend, did, hid); - -#define INIT3_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_out1, out1, iend, hid, did); \ - getOpenMPDeviceData(m_out2, out2, iend, hid, did); \ - getOpenMPDeviceData(m_out3, out3, iend, hid, did); \ - deallocOpenMPDeviceData(out1, did); \ - deallocOpenMPDeviceData(out2, did); \ - deallocOpenMPDeviceData(out3, did); \ - deallocOpenMPDeviceData(in1, did); \ - deallocOpenMPDeviceData(in2, did); - - -void INIT3::runKokkosOpenMPTargetVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - INIT3_DATA_SETUP; - - if ( vid == Base_OpenMPTarget ) { - - INIT3_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp target is_device_ptr(out1, out2, out3, in1, in2) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) - for (Index_type i = ibegin; i < iend; ++i ) { - INIT3_BODY; - } - - } - stopTimer(); - - INIT3_DATA_TEARDOWN_OMP_TARGET; - - } else if ( vid == RAJA_OpenMPTarget ) { - - INIT3_DATA_SETUP_OMP_TARGET; - - 
startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - INIT3_BODY; - }); - - } - stopTimer(); - - INIT3_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n INIT3 : Unknown OMP Target variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp similarity index 100% rename from src/basic-kokkos/INIT_VIEW1D-KokkosSeq.cpp rename to src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp deleted file mode 100644 index d0c958538..000000000 --- a/src/basic-kokkos/INIT_VIEW1D-KokkosCuda.cpp +++ /dev/null @@ -1,108 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT_VIEW1D.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_CUDA) - -#include "common/CudaDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define INIT_VIEW1D_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(a, m_a, getRunSize()); - -#define INIT_VIEW1D_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_a, a, getRunSize()); \ - deallocCudaDeviceData(a); - -//__global__ void initview1d(Real_ptr a, -// Real_type v, -// const Index_type iend) -//{ -// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i < iend) { -// INIT_VIEW1D_BODY; -// } -//} - - -void INIT_VIEW1D::runKokkosCudaVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - INIT_VIEW1D_DATA_SETUP; - -#if defined(RUN_KOKKOS) - - if ( vid == Base_CUDA ) { - - INIT_VIEW1D_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - //initview1d<<>>( a, - // v, - // iend ); - - } - stopTimer(); - - INIT_VIEW1D_DATA_TEARDOWN_CUDA; - -// AJP modified lines below - } else if ( vid == Kokkos_Lambda_CUDA ) { - - INIT_VIEW1D_DATA_SETUP_CUDA; - - INIT_VIEW1D_VIEW_RAJA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - -// RAJA::forall< RAJA::cuda_exec >( -// RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { -// INIT_VIEW1D_BODY_RAJA; -// }); - - Kokkos::parallel_for("INIT_VIEW1D-KokkosCuda Kokkos-Lambda", Kokkos::RangePolicy(ibegin, iend), - // Here, the function executes on the device / GPU - [=] __device__ (Index_type i) {INIT_VIEW1D_BODY_RAJA}); - - } - stopTimer(); - - INIT_VIEW1D_DATA_TEARDOWN_CUDA; - - } 
else { - std::cout << "\n INIT_VIEW1D : Unknown Cuda variant id = " << vid << std::endl; - } -#endif //RUN_KOKKOS -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosOMP.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosOMP.cpp deleted file mode 100644 index 0596813a2..000000000 --- a/src/basic-kokkos/INIT_VIEW1D-KokkosOMP.cpp +++ /dev/null @@ -1,99 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT_VIEW1D.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - -void INIT_VIEW1D::runKokkosOpenMPVariant(VariantID vid) -{ -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - INIT_VIEW1D_DATA_SETUP; - - switch ( vid ) { - - case Base_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp parallel for - for (Index_type i = ibegin; i < iend; ++i ) { - INIT_VIEW1D_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_OpenMP : { - - auto initview1d_base_lam = [=](Index_type i) { - INIT_VIEW1D_BODY; - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp parallel for - for (Index_type i = ibegin; i < iend; ++i ) { - initview1d_base_lam(i); - } - - } - stopTimer(); - - break; - } - - case RAJA_OpenMP : { - - INIT_VIEW1D_VIEW_RAJA; - - auto initview1d_lam = [=](Index_type i) { - INIT_VIEW1D_BODY_RAJA; - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - 
RAJA::forall( - RAJA::RangeSegment(ibegin, iend), initview1d_lam); - - } - stopTimer(); - - break; - } - - default : { - std::cout << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; - } - - } - -#endif -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/INIT_VIEW1D-KokkosOMPTarget.cpp b/src/basic-kokkos/INIT_VIEW1D-KokkosOMPTarget.cpp deleted file mode 100644 index ffe170c77..000000000 --- a/src/basic-kokkos/INIT_VIEW1D-KokkosOMPTarget.cpp +++ /dev/null @@ -1,93 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT_VIEW1D.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - -#include "common/OpenMPTargetDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define threads per team for target execution - // - const size_t threads_per_team = 256; - -#define INIT_VIEW1D_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(a, m_a, getRunSize(), did, hid); - -#define INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_a, a, getRunSize(), hid, did); \ - deallocOpenMPDeviceData(a, did); - - -void INIT_VIEW1D::runKokkosOpenMPTargetVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - INIT_VIEW1D_DATA_SETUP; - - if ( vid == Base_OpenMPTarget ) { - - INIT_VIEW1D_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp target is_device_ptr(a) device( did ) - #pragma omp teams distribute 
parallel for thread_limit(threads_per_team) schedule(static, 1) - for (Index_type i = ibegin; i < iend; ++i ) { - INIT_VIEW1D_BODY; - } - - } - stopTimer(); - - INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET; - - } else if ( vid == RAJA_OpenMPTarget ) { - - INIT_VIEW1D_DATA_SETUP_OMP_TARGET; - - INIT_VIEW1D_VIEW_RAJA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - INIT_VIEW1D_BODY_RAJA; - }); - - } - stopTimer(); - - INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n INIT_VIEW1D : Unknown OMP Targetvariant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp similarity index 100% rename from src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosSeq.cpp rename to src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp deleted file mode 100644 index dec7483bc..000000000 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosCuda.cpp +++ /dev/null @@ -1,107 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT_VIEW1D_OFFSET.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_CUDA) - -#include "common/CudaDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(a, m_a, getRunSize()); - -#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_a, a, getRunSize()); \ - deallocCudaDeviceData(a); - -//__global__ void initview1d_offset(Real_ptr a, -// Real_type v, -// const Index_type ibegin, -// const Index_type iend) -//{ -// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i >= ibegin && i < iend) { -// INIT_VIEW1D_OFFSET_BODY; -// } -//} - - -void INIT_VIEW1D_OFFSET::runKokkosCudaVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 1; - const Index_type iend = getRunSize()+1; - - INIT_VIEW1D_OFFSET_DATA_SETUP; - -#if defined (RUN_KOKKOS) - - if ( vid == Base_CUDA ) { - - INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - //initview1d_offset<<>>( a, v, - // ibegin, - // iend ); - - } - stopTimer(); - - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; - - } else if ( vid == Kokkos_Lambda_CUDA ) { - - INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA; - - INIT_VIEW1D_OFFSET_VIEW_RAJA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - - Kokkos::parallel_for("INIT_VIEW1D_OFFSET-KokkosCuda Kokkos_Lambda", - Kokkos::RangePolicy(ibegin, iend), - [=] __device__ (Index_type i) { - INIT_VIEW1D_OFFSET_BODY_RAJA; - } -); - - } - stopTimer(); - - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; - - } else { - std::cout << "\n INIT_VIEW1D_OFFSET : 
Unknown Cuda variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_CUDA -#endif //RUN_KOKKOS diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMP.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMP.cpp deleted file mode 100644 index ac0577b96..000000000 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMP.cpp +++ /dev/null @@ -1,99 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT_VIEW1D_OFFSET.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - -void INIT_VIEW1D_OFFSET::runKokkosOpenMPVariant(VariantID vid) -{ -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 1; - const Index_type iend = getRunSize()+1; - - INIT_VIEW1D_OFFSET_DATA_SETUP; - - switch ( vid ) { - - case Base_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp parallel for - for (Index_type i = ibegin; i < iend; ++i ) { - INIT_VIEW1D_OFFSET_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_OpenMP : { - - auto initview1doffset_base_lam = [=](Index_type i) { - INIT_VIEW1D_OFFSET_BODY; - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp parallel for - for (Index_type i = ibegin; i < iend; ++i ) { - initview1doffset_base_lam(i); - } - - } - stopTimer(); - - break; - } - - case RAJA_OpenMP : { - - INIT_VIEW1D_OFFSET_VIEW_RAJA; - - auto initview1doffset_lam = [=](Index_type i) { - INIT_VIEW1D_OFFSET_BODY_RAJA; - }; - - startTimer(); - for (RepIndex_type 
irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), initview1doffset_lam); - - } - stopTimer(); - - break; - } - - default : { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; - } - - } - -#endif -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMPTarget.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMPTarget.cpp deleted file mode 100644 index 285b3b69d..000000000 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-KokkosOMPTarget.cpp +++ /dev/null @@ -1,94 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "INIT_VIEW1D_OFFSET.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - -#include "common/OpenMPTargetDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define threads per team for target execution - // - const size_t threads_per_team = 256; - -#define INIT_VIEW1D_OFFSET_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(a, m_a, getRunSize(), did, hid); - -#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_a, a, getRunSize(), hid, did); \ - deallocOpenMPDeviceData(a, did); - - -void INIT_VIEW1D_OFFSET::runKokkosOpenMPTargetVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 1; - const Index_type iend = getRunSize()+1; - - INIT_VIEW1D_OFFSET_DATA_SETUP; - - if ( vid == Base_OpenMPTarget ) { - - INIT_VIEW1D_OFFSET_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type 
irep = 0; irep < run_reps; ++irep) { - - #pragma omp target is_device_ptr(a) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) - for (Index_type i = ibegin; i < iend; ++i ) { - INIT_VIEW1D_OFFSET_BODY; - } - - } - stopTimer(); - - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET; - - } else if ( vid == RAJA_OpenMPTarget ) { - - INIT_VIEW1D_OFFSET_DATA_SETUP_OMP_TARGET; - - INIT_VIEW1D_OFFSET_VIEW_RAJA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - INIT_VIEW1D_OFFSET_BODY_RAJA; - }); - - } - stopTimer(); - - INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n INIT_VIEW1D_OFFSET : Unknown OMP Targetvariant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_TARGET_OPENMP - diff --git a/src/basic-kokkos/MULADDSUB-KokkosSeq.cpp b/src/basic-kokkos/MULADDSUB-Kokkos.cpp similarity index 100% rename from src/basic-kokkos/MULADDSUB-KokkosSeq.cpp rename to src/basic-kokkos/MULADDSUB-Kokkos.cpp diff --git a/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp b/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp deleted file mode 100644 index 905e98a8f..000000000 --- a/src/basic-kokkos/MULADDSUB-KokkosCuda.cpp +++ /dev/null @@ -1,112 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "MULADDSUB.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_CUDA) - -#include "common/CudaDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define MULADDSUB_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(out1, m_out1, iend); \ - allocAndInitCudaDeviceData(out2, m_out2, iend); \ - allocAndInitCudaDeviceData(out3, m_out3, iend); \ - allocAndInitCudaDeviceData(in1, m_in1, iend); \ - allocAndInitCudaDeviceData(in2, m_in2, iend); - -#define MULADDSUB_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_out1, out1, iend); \ - getCudaDeviceData(m_out2, out2, iend); \ - getCudaDeviceData(m_out3, out3, iend); \ - deallocCudaDeviceData(out1); \ - deallocCudaDeviceData(out2); \ - deallocCudaDeviceData(out3); \ - deallocCudaDeviceData(in1); \ - deallocCudaDeviceData(in2); - -//__global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, -// Real_ptr in1, Real_ptr in2, -// Index_type iend) -//{ -// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i < iend) { -// MULADDSUB_BODY; -// } -//} - - -void MULADDSUB::runKokkosCudaVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - MULADDSUB_DATA_SETUP; - -#if defined RUN_KOKKOS - - if ( vid == Base_CUDA ) { - - MULADDSUB_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - //muladdsub<<>>( out1, out2, out3, in1, in2, - // iend ); - - } - stopTimer(); - - MULADDSUB_DATA_TEARDOWN_CUDA; - - } else if ( vid == Kokkos_Lambda_CUDA ) { - - MULADDSUB_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - 
Kokkos::parallel_for("MULTISUB-KokkosCuda Kokkos_Lambda_CUDA", - Kokkos::RangePolicy(ibegin, iend), - [=] __device__ (Index_type i) { - MULADDSUB_BODY - } -); - - } - stopTimer(); - - MULADDSUB_DATA_TEARDOWN_CUDA; - - } else { - std::cout << "\n MULADDSUB : Unknown Cuda variant id = " << vid << std::endl; - } -#endif //RUN_KOKKOS -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/MULADDSUB-KokkosOMP.cpp b/src/basic-kokkos/MULADDSUB-KokkosOMP.cpp deleted file mode 100644 index 9df7b0129..000000000 --- a/src/basic-kokkos/MULADDSUB-KokkosOMP.cpp +++ /dev/null @@ -1,93 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "MULADDSUB.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - -void MULADDSUB::runKokkosOpenMPVariant(VariantID vid) -{ -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - MULADDSUB_DATA_SETUP; - - auto mas_lam = [=](Index_type i) { - MULADDSUB_BODY; - }; - - switch ( vid ) { - - case Base_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp parallel for - for (Index_type i = ibegin; i < iend; ++i ) { - MULADDSUB_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp parallel for - for (Index_type i = ibegin; i < iend; ++i ) { - mas_lam(i); - } - - } - stopTimer(); - - break; - } - - case RAJA_OpenMP : { - - startTimer(); 
- for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), mas_lam); - - } - stopTimer(); - - break; - } - - default : { - std::cout << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; - } - - } - -#endif -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/MULADDSUB-KokkosOMPTarget.cpp b/src/basic-kokkos/MULADDSUB-KokkosOMPTarget.cpp deleted file mode 100644 index ca664f0e8..000000000 --- a/src/basic-kokkos/MULADDSUB-KokkosOMPTarget.cpp +++ /dev/null @@ -1,101 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "MULADDSUB.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - -#include "common/OpenMPTargetDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define threads per team for target execution - // - const size_t threads_per_team = 256; - -#define MULADDSUB_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(out1, m_out1, iend, did, hid); \ - allocAndInitOpenMPDeviceData(out2, m_out2, iend, did, hid); \ - allocAndInitOpenMPDeviceData(out3, m_out3, iend, did, hid); \ - allocAndInitOpenMPDeviceData(in1, m_in1, iend, did, hid); \ - allocAndInitOpenMPDeviceData(in2, m_in2, iend, did, hid); - -#define MULADDSUB_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_out1, out1, iend, hid, did); \ - getOpenMPDeviceData(m_out2, out2, iend, hid, did); \ - getOpenMPDeviceData(m_out3, out3, iend, hid, did); \ - deallocOpenMPDeviceData(out1, did); \ - deallocOpenMPDeviceData(out2, did); \ - 
deallocOpenMPDeviceData(out3, did); \ - deallocOpenMPDeviceData(in1, did); \ - deallocOpenMPDeviceData(in2, did); - - -void MULADDSUB::runKokkosOpenMPTargetVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - MULADDSUB_DATA_SETUP; - - if ( vid == Base_OpenMPTarget ) { - - MULADDSUB_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp target is_device_ptr(out1, out2, out3, in1, in2) device( did ) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static, 1) - for (Index_type i = ibegin; i < iend; ++i ) { - MULADDSUB_BODY; - } - - } - stopTimer(); - - MULADDSUB_DATA_TEARDOWN_OMP_TARGET; - - } else if ( vid == RAJA_OpenMPTarget ) { - - MULADDSUB_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - MULADDSUB_BODY; - }); - - } - stopTimer(); - - MULADDSUB_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n MULADDSUB : Unknown OMP Target variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp similarity index 100% rename from src/basic-kokkos/NESTED_INIT-KokkosSeq.cpp rename to src/basic-kokkos/NESTED_INIT-Kokkos.cpp diff --git a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp b/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp deleted file mode 100644 index 25f81cb55..000000000 --- a/src/basic-kokkos/NESTED_INIT-KokkosCuda.cpp +++ /dev/null @@ -1,123 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. 
-// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "NESTED_INIT.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_CUDA) - -#include "common/CudaDataUtils.hpp" - -#include - -namespace rajaperf { -namespace basic { - -#define NESTED_INIT_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(array, m_array, m_array_length); - -#define NESTED_INIT_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_array, array, m_array_length); \ - deallocCudaDeviceData(array); - -//__global__ void nested_init(Real_ptr array, -// Index_type ni, Index_type nj) -//{ -// Index_type i = threadIdx.x; -// Index_type j = blockIdx.y; -// Index_type k = blockIdx.z; -// -// NESTED_INIT_BODY; -//} - -void NESTED_INIT::runKokkosCudaVariant(VariantID vid) { - const Index_type run_reps = getRunReps(); - - NESTED_INIT_DATA_SETUP; - - - -#if defined RUN_KOKKOS - - if (vid == Base_CUDA) { - - NESTED_INIT_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - dim3 nthreads_per_block(ni, 1, 1); - dim3 nblocks(1, nj, nk); - - // nested_init<<>>(array, - // ni, nj); - } - stopTimer(); - - NESTED_INIT_DATA_TEARDOWN_CUDA; - - } else if (vid == Kokkos_Lambda_CUDA) { - - NESTED_INIT_DATA_SETUP_CUDA; - /* - using EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::CudaKernelAsync< - RAJA::statement::For<2, RAJA::cuda_block_z_loop, // k - RAJA::statement::For<1, RAJA::cuda_block_y_loop, // j - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i - RAJA::statement::Lambda<0> - > - > - > - > - >; - - */ - - startTimer(); - - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - // RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, - // ni), - // RAJA::RangeSegment(0, - // nj), - // RAJA::RangeSegment(0, - // nk)), - // [=] __device__ (Index_type i, Index_type j, Index_type k) { - // NESTED_INIT_BODY; - - Kokkos::parallel_for( - 
"NESTED_INIT Kokkos_Lambda_Cuda", - Kokkos::MDRangePolicy< - Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, - Kokkos::Cuda>({0, 0, 0}, {ni, nj, nk}), - - KOKKOS_LAMBDA(Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - }); - } - stopTimer(); - // Checks for errors - - NESTED_INIT_DATA_TEARDOWN_CUDA; - - } else { - std::cout << "\n NESTED_INIT : Unknown Cuda variant id = " << vid - << std::endl; - } - -#endif // RUN_KOKKOS -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/NESTED_INIT-KokkosOMP.cpp b/src/basic-kokkos/NESTED_INIT-KokkosOMP.cpp deleted file mode 100644 index bfdaca2cd..000000000 --- a/src/basic-kokkos/NESTED_INIT-KokkosOMP.cpp +++ /dev/null @@ -1,134 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "NESTED_INIT.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - -//#define USE_OMP_COLLAPSE -#undef USE_OMP_COLLAPSE - - -void NESTED_INIT::runKokkosOpenMPVariant(VariantID vid) -{ -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - - const Index_type run_reps = getRunReps(); - - NESTED_INIT_DATA_SETUP; - - auto nestedinit_lam = [=](Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - }; - - switch ( vid ) { - - case Base_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - -#if defined(USE_OMP_COLLAPSE) - #pragma omp parallel for collapse(3) -#else - #pragma omp parallel for -#endif - for (Index_type k = 0; k < nk; ++k ) { - for (Index_type j = 0; j < nj; ++j ) { - for (Index_type i = 0; i < ni; ++i ) { - NESTED_INIT_BODY; - } - } - } - - } - stopTimer(); - - break; - } - - case Lambda_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - -#if defined(USE_OMP_COLLAPSE) - #pragma omp parallel for collapse(3) -#else - #pragma omp parallel for -#endif - for (Index_type k = 0; k < nk; ++k ) { - for (Index_type j = 0; j < nj; ++j ) { - for (Index_type i = 0; i < ni; ++i ) { - nestedinit_lam(i, j, k); - } - } - } - - } - stopTimer(); - - break; - } - - case RAJA_OpenMP : { - -#if defined(USE_OMP_COLLAPSE) - using EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::Collapse, // k, j, i - RAJA::statement::Lambda<0> - > - >; -#else - using EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k - RAJA::statement::For<1, RAJA::loop_exec, // j - RAJA::statement::For<0, RAJA::loop_exec, // i - RAJA::statement::Lambda<0> - > - > - > - >; -#endif - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::kernel( 
RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - nestedinit_lam - ); - - } - stopTimer(); - - break; - } - - default : { - std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; - } - - } - -#endif -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/NESTED_INIT-KokkosOMPTarget.cpp b/src/basic-kokkos/NESTED_INIT-KokkosOMPTarget.cpp deleted file mode 100644 index d61173a47..000000000 --- a/src/basic-kokkos/NESTED_INIT-KokkosOMPTarget.cpp +++ /dev/null @@ -1,98 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "NESTED_INIT.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - -#include "common/OpenMPTargetDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - -#define NESTED_INIT_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(array, m_array, m_array_length, did, hid); - -#define NESTED_INIT_DATA_TEARDOWN_OMP_TARGET \ - getOpenMPDeviceData(m_array, array, m_array_length, hid, did); \ - deallocOpenMPDeviceData(array, did); - - -void NESTED_INIT::runKokkosOpenMPTargetVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - - NESTED_INIT_DATA_SETUP; - - if ( vid == Base_OpenMPTarget ) { - - NESTED_INIT_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp target is_device_ptr(array) device( did ) - #pragma omp teams distribute parallel for schedule(static, 1) collapse(3) - for (Index_type k = 0; k < nk; ++k ) { 
- for (Index_type j = 0; j < nj; ++j ) { - for (Index_type i = 0; i < ni; ++i ) { - NESTED_INIT_BODY; - } - } - } - - } - stopTimer(); - - NESTED_INIT_DATA_TEARDOWN_OMP_TARGET; - - } else if ( vid == RAJA_OpenMPTarget ) { - - NESTED_INIT_DATA_SETUP_OMP_TARGET; - - using EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::Collapse, // k, j, i - RAJA::statement::Lambda<0> - > - >; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - [=](Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - }); - - } - stopTimer(); - - NESTED_INIT_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp similarity index 100% rename from src/basic-kokkos/REDUCE3_INT-KokkosSeq.cpp rename to src/basic-kokkos/REDUCE3_INT-Kokkos.cpp diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp deleted file mode 100644 index 0c47f70b8..000000000 --- a/src/basic-kokkos/REDUCE3_INT-KokkosCuda.cpp +++ /dev/null @@ -1,196 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "REDUCE3_INT.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_CUDA) - -#include "common/CudaDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define REDUCE3_INT_DATA_SETUP_CUDA \ - allocAndInitCudaDeviceData(vec, m_vec, iend); - -#define REDUCE3_INT_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(vec); - - -//__global__ void reduce3int(Int_ptr vec, -// Int_ptr vsum, Int_type vsum_init, -// Int_ptr vmin, Int_type vmin_init, -// Int_ptr vmax, Int_type vmax_init, -// Index_type iend) -//{ -// extern __shared__ Int_type psum[ ]; -// Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; -// Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; -// -// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; -// -// psum[ threadIdx.x ] = vsum_init; -// pmin[ threadIdx.x ] = vmin_init; -// pmax[ threadIdx.x ] = vmax_init; -// -// for ( ; i < iend ; i += gridDim.x * blockDim.x ) { -// psum[ threadIdx.x ] += vec[ i ]; -// pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); -// pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); -// } -// __syncthreads(); -// -// for ( i = blockDim.x / 2; i > 0; i /= 2 ) { -// if ( threadIdx.x < i ) { -// psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; -// pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); -// pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], pmax[ threadIdx.x + i ] ); -// } -// __syncthreads(); -// } -// -//#if 1 // serialized access to shared data; -// if ( threadIdx.x == 0 ) { -// RAJA::atomicAdd( vsum, psum[ 0 ] ); -// RAJA::atomicMin( vmin, pmin[ 0 ] ); -// RAJA::atomicMax( vmax, pmax[ 0 ] ); -// } -//#else // this doesn't work due to data races -// if ( threadIdx.x == 0 ) { -// *vsum += psum[ 0 ]; -// 
*vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); -// *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); -// } -//#endif -//} - - -void REDUCE3_INT::runKokkosCudaVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - REDUCE3_INT_DATA_SETUP; - -#if defined RUN_KOKKOS - - if ( vid == Base_CUDA ) { - - REDUCE3_INT_DATA_SETUP_CUDA; - - Int_ptr vsum; - allocAndInitCudaDeviceData(vsum, &m_vsum_init, 1); - Int_ptr vmin; - allocAndInitCudaDeviceData(vmin, &m_vmin_init, 1); - Int_ptr vmax; - allocAndInitCudaDeviceData(vmax, &m_vmax_init, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initCudaDeviceData(vsum, &m_vsum_init, 1); - initCudaDeviceData(vmin, &m_vmin_init, 1); - initCudaDeviceData(vmax, &m_vmax_init, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - //reduce3int<<>>(vec, - // vsum, m_vsum_init, - // vmin, m_vmin_init, - // vmax, m_vmax_init, - // iend ); - - Int_type lsum; - Int_ptr plsum = &lsum; - getCudaDeviceData(plsum, vsum, 1); - m_vsum += lsum; - - Int_type lmin; - Int_ptr plmin = &lmin; - getCudaDeviceData(plmin, vmin, 1); - m_vmin = RAJA_MIN(m_vmin, lmin); - - Int_type lmax; - Int_ptr plmax = &lmax; - getCudaDeviceData(plmax, vmax, 1); - m_vmax = RAJA_MAX(m_vmax, lmax); - - } - stopTimer(); - - REDUCE3_INT_DATA_TEARDOWN_CUDA; - - deallocCudaDeviceData(vsum); - deallocCudaDeviceData(vmin); - deallocCudaDeviceData(vmax); - - } else if ( vid == Kokkos_Lambda_CUDA ) { - - REDUCE3_INT_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - -/* - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); -*/ - - Int_type max_value = m_vmax_init; - Int_type min_value = m_vmin_init; - Int_type sum = m_vsum_init; - - // KOKKOS_LAMBDA IS A PRE-PROCESSOR DIRECTIVE - // It makes the capture clause on the lambda work for Host and Device - - 
parallel_reduce("REDUCE3-KokkosCuda Kokkos_Lambda_Seq", - Kokkos::RangePolicy(ibegin, iend), - [=] __device__ (const int64_t i, Int_type& tl_max, Int_type& tl_min, Int_type& tl_sum) { - Int_type vec_i = vec[i]; - if (vec_i > tl_max) tl_max = vec_i; - if (vec_i < tl_min) tl_min= vec_i; - tl_sum += vec_i; - }, - Kokkos::Max(max_value), - Kokkos::Min(min_value), - sum); - - - m_vsum += static_cast(sum); - m_vmin = RAJA_MIN(m_vmin, static_cast(min_value)); - m_vmax = RAJA_MAX(m_vmax, static_cast(max_value)); - - } - stopTimer(); - - REDUCE3_INT_DATA_TEARDOWN_CUDA; - - } else { - std::cout << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; - } -#endif //RUN_KOKKOS -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosOMP.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosOMP.cpp deleted file mode 100644 index 07cc5a2b6..000000000 --- a/src/basic-kokkos/REDUCE3_INT-KokkosOMP.cpp +++ /dev/null @@ -1,126 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "REDUCE3_INT.hpp" - -#include "RAJA/RAJA.hpp" - -#include -#include - -namespace rajaperf -{ -namespace basic -{ - - -void REDUCE3_INT::runKokkosOpenMPVariant(VariantID vid) -{ -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - REDUCE3_INT_DATA_SETUP; - - switch ( vid ) { - - case Base_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Int_type vsum = m_vsum_init; - Int_type vmin = m_vmin_init; - Int_type vmax = m_vmax_init; - - #pragma omp parallel for reduction(+:vsum), \ - reduction(min:vmin), \ - reduction(max:vmax) - for (Index_type i = ibegin; i < iend; ++i ) { - REDUCE3_INT_BODY; - } - - m_vsum += vsum; - m_vmin = RAJA_MIN(m_vmin, vmin); - m_vmax = RAJA_MAX(m_vmax, vmax); - - } - stopTimer(); - - break; - } - - case Lambda_OpenMP : { - - auto reduce3int_base_lam = [=](Index_type i) -> Int_type { - return vec[i]; - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Int_type vsum = m_vsum_init; - Int_type vmin = m_vmin_init; - Int_type vmax = m_vmax_init; - - #pragma omp parallel for reduction(+:vsum), \ - reduction(min:vmin), \ - reduction(max:vmax) - for (Index_type i = ibegin; i < iend; ++i ) { - vsum += reduce3int_base_lam(i); - vmin = RAJA_MIN(vmin, reduce3int_base_lam(i)); - vmax = RAJA_MAX(vmax, reduce3int_base_lam(i)); - } - - m_vsum += vsum; - m_vmin = RAJA_MIN(m_vmin, vmin); - m_vmax = RAJA_MAX(m_vmax, vmax); - - } - stopTimer(); - - break; - } - - case RAJA_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), 
[=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); - - } - stopTimer(); - - break; - } - - default : { - std::cout << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; - } - - } - -#endif -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/REDUCE3_INT-KokkosOMPTarget.cpp b/src/basic-kokkos/REDUCE3_INT-KokkosOMPTarget.cpp deleted file mode 100644 index b96b05794..000000000 --- a/src/basic-kokkos/REDUCE3_INT-KokkosOMPTarget.cpp +++ /dev/null @@ -1,110 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "REDUCE3_INT.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - -#include "common/OpenMPTargetDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - - // - // Define threads per team for target execution - // - const size_t threads_per_team = 256; - -#define REDUCE3_INT_DATA_SETUP_OMP_TARGET \ - int hid = omp_get_initial_device(); \ - int did = omp_get_default_device(); \ -\ - allocAndInitOpenMPDeviceData(vec, m_vec, iend, did, hid); - -#define REDUCE3_INT_DATA_TEARDOWN_OMP_TARGET \ - deallocOpenMPDeviceData(vec, did); \ - - -void REDUCE3_INT::runKokkosOpenMPTargetVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - REDUCE3_INT_DATA_SETUP; - - if ( vid == Base_OpenMPTarget ) { - - REDUCE3_INT_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - 
Int_type vsum = m_vsum_init; - Int_type vmin = m_vmin_init; - Int_type vmax = m_vmax_init; - - #pragma omp target is_device_ptr(vec) device( did ) map(tofrom:vsum, vmin, vmax) - #pragma omp teams distribute parallel for thread_limit(threads_per_team) schedule(static,1) \ - reduction(+:vsum) \ - reduction(min:vmin) \ - reduction(max:vmax) - for (Index_type i = ibegin; i < iend; ++i ) { - REDUCE3_INT_BODY; - } - - m_vsum += vsum; - m_vmin = RAJA_MIN(m_vmin, vmin); - m_vmax = RAJA_MAX(m_vmax, vmax); - - } - stopTimer(); - - REDUCE3_INT_DATA_TEARDOWN_OMP_TARGET; - - } else if ( vid == RAJA_OpenMPTarget ) { - - REDUCE3_INT_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); - - } - stopTimer(); - - REDUCE3_INT_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n REDUCE3_INT : Unknown OMP Target variant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic-kokkos/TRAP_INT-KokkosSeq.cpp b/src/basic-kokkos/TRAP_INT-Kokkos.cpp similarity index 100% rename from src/basic-kokkos/TRAP_INT-KokkosSeq.cpp rename to src/basic-kokkos/TRAP_INT-Kokkos.cpp diff --git a/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp b/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp deleted file mode 100644 index 02edfc1c3..000000000 --- a/src/basic-kokkos/TRAP_INT-KokkosCuda.cpp +++ /dev/null @@ -1,174 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project 
contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "TRAP_INT.hpp" - -#include "RAJA/RAJA.hpp" -#include -#if defined(RAJA_ENABLE_CUDA) - -#include "common/CudaDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - -// -// Function used in TRAP_INT loop. -// -KOKKOS_INLINE_FUNCTION -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define TRAP_INT_DATA_SETUP_CUDA // nothing to do here... - -#define TRAP_INT_DATA_TEARDOWN_CUDA // nothing to do here... - - -//__global__ void trapint(Real_type x0, Real_type xp, -// Real_type y, Real_type yp, -// Real_type h, -// Real_ptr sumx, -// Index_type iend) -//{ -// extern __shared__ Real_type psumx[ ]; -// -// Index_type i = blockIdx.x * blockDim.x + threadIdx.x; -// -// psumx[ threadIdx.x ] = 0.0; -// for ( ; i < iend ; i += gridDim.x * blockDim.x ) { -// Real_type x = x0 + i*h; -// Real_type val = trap_int_func(x, y, xp, yp); -// psumx[ threadIdx.x ] += val; -// } -// __syncthreads(); -// -// for ( i = blockDim.x / 2; i > 0; i /= 2 ) { -// if ( threadIdx.x < i ) { -// psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; -// } -// __syncthreads(); -// } -// -//#if 1 // serialized access to shared data; -// if ( threadIdx.x == 0 ) { -// RAJA::atomicAdd( sumx, psumx[ 0 ] ); -// } -//#else // this doesn't work due to data races -// if ( threadIdx.x == 0 ) { -// *sumx += psumx[ 0 ]; -// } -//#endif -// -//} - - -void TRAP_INT::runKokkosCudaVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - TRAP_INT_DATA_SETUP; - -#if defined 
RUN_KOKKOS - - if ( vid == Base_CUDA ) { - - TRAP_INT_DATA_SETUP_CUDA; - - Real_ptr sumx; - allocAndInitCudaDeviceData(sumx, &m_sumx_init, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initCudaDeviceData(sumx, &m_sumx_init, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - //trapint<<>>(x0, xp, - // y, yp, - // h, - // sumx, - // iend); - - Real_type lsumx; - Real_ptr plsumx = &lsumx; - getCudaDeviceData(plsumx, sumx, 1); - m_sumx += lsumx * h; - - } - stopTimer(); - - deallocCudaDeviceData(sumx); - - TRAP_INT_DATA_TEARDOWN_CUDA; - - } else if ( vid == Kokkos_Lambda_CUDA ) { - - TRAP_INT_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - - // Begin Kokkos translation - // A RAJA reduce translates into a - // Kokkoss::parallel_reduce - // To perform the translation: - // Declare and initialize variables - // To perform a reduction, you need: - // 1) an initial value; - // 2) iterate over an iterable; - // 3) to be able to extract the result at the end of the reduction (in this case, trap_integral_val) - - - Real_type trap_integral_val = m_sumx_init; - - parallel_reduce("TRAP_INT_KokkosCuda Kokkos_Lambda_Seq", - Kokkos::RangePolicy(ibegin, iend), - KOKKOS_LAMBDA (const int64_t i, Real_type& sumx) { - TRAP_INT_BODY}, - trap_integral_val - ); - - m_sumx += static_cast(trap_integral_val) * h; - - } - stopTimer(); - - TRAP_INT_DATA_TEARDOWN_CUDA; - - } else { - std::cout << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; - } -#endif //RUN_KOKKOS -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_CUDA diff --git a/src/basic-kokkos/TRAP_INT-KokkosOMP.cpp b/src/basic-kokkos/TRAP_INT-KokkosOMP.cpp deleted file mode 100644 index 94f8f2f3f..000000000 --- a/src/basic-kokkos/TRAP_INT-KokkosOMP.cpp +++ /dev/null @@ -1,122 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// 
Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "TRAP_INT.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - - -void TRAP_INT::runKokkosOpenMPVariant(VariantID vid) -{ -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - TRAP_INT_DATA_SETUP; - - switch ( vid ) { - - case Base_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Real_type sumx = m_sumx_init; - - #pragma omp parallel for reduction(+:sumx) - for (Index_type i = ibegin; i < iend; ++i ) { - TRAP_INT_BODY; - } - - m_sumx += sumx * h; - - } - stopTimer(); - - break; - } - - case Lambda_OpenMP : { - - auto trapint_base_lam = [=](Index_type i) -> Real_type { - Real_type x = x0 + i*h; - return trap_int_func(x, y, xp, yp); - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Real_type sumx = m_sumx_init; - - #pragma omp parallel for reduction(+:sumx) - for (Index_type i = ibegin; i < iend; ++i ) { - sumx += trapint_base_lam(i); - } - - m_sumx += sumx * h; - - } - stopTimer(); - - break; - } - - case RAJA_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sumx(m_sumx_init); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); - - m_sumx += static_cast(sumx.get()) * h; - - } - 
stopTimer(); - - break; - } - - default : { - std::cout << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; - } - - } - -#endif -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/TRAP_INT-KokkosOMPTarget.cpp b/src/basic-kokkos/TRAP_INT-KokkosOMPTarget.cpp deleted file mode 100644 index 7ac80bdbb..000000000 --- a/src/basic-kokkos/TRAP_INT-KokkosOMPTarget.cpp +++ /dev/null @@ -1,111 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "TRAP_INT.hpp" - -#include "RAJA/RAJA.hpp" - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - -#include "common/OpenMPTargetDataUtils.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ - -// -// Function used in TRAP_INT loop. -// -RAJA_INLINE -Real_type trap_int_func(Real_type x, - Real_type y, - Real_type xp, - Real_type yp) -{ - Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); - denom = 1.0/sqrt(denom); - return denom; -} - - // - // Define threads per team for target execution - // - const size_t threads_per_team = 256; - - -#define TRAP_INT_DATA_SETUP_OMP_TARGET // nothing to do here... - -#define TRAP_INT_DATA_TEARDOWN_OMP_TARGET // nothing to do here... 
- - -void TRAP_INT::runKokkosOpenMPTargetVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - TRAP_INT_DATA_SETUP; - - if ( vid == Base_OpenMPTarget ) { - - TRAP_INT_DATA_SETUP_OMP_TARGET; - - #pragma omp target enter data map(to:x0,xp,y,yp,h) - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Real_type sumx = m_sumx_init; - - #pragma omp target teams distribute parallel for map(tofrom: sumx) reduction(+:sumx) \ - thread_limit(threads_per_team) schedule(static, 1) - - for (Index_type i = ibegin; i < iend; ++i ) { - TRAP_INT_BODY; - } - - m_sumx += sumx * h; - - } - stopTimer(); - - #pragma omp target exit data map(delete: x0,xp,y,yp,h) - - } else if ( vid == RAJA_OpenMPTarget ) { - - TRAP_INT_DATA_SETUP_OMP_TARGET; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum sumx(m_sumx_init); - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); - - m_sumx += static_cast(sumx.get()) * h; - - } - stopTimer(); - - TRAP_INT_DATA_TEARDOWN_OMP_TARGET; - - } else { - std::cout << "\n TRAP_INT : Unknown OMP Targetvariant id = " << vid << std::endl; - } -} - -} // end namespace basic -} // end namespace rajaperf - -#endif // RAJA_ENABLE_TARGET_OPENMP From 8dc5c01ddde3a29e474d7e3cbcc7ab54bc33e781 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 2 Feb 2021 14:36:27 -0800 Subject: [PATCH 049/124] Sed surgery 2 --- src/basic-kokkos/ATOMIC_PI-Kokkos.cpp | 6 ++-- src/basic-kokkos/DAXPY-Kokkos.cpp | 6 ++-- src/basic-kokkos/IF_QUAD-Kokkos.cpp | 6 ++-- src/basic-kokkos/INIT3-Kokkos.cpp | 6 ++-- src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp | 6 ++-- .../INIT_VIEW1D_OFFSET-Kokkos.cpp | 6 ++-- src/basic-kokkos/MULADDSUB-Kokkos.cpp | 6 ++-- src/basic-kokkos/NESTED_INIT-Kokkos.cpp | 6 ++-- src/basic-kokkos/REDUCE3_INT-Kokkos.cpp | 6 ++-- src/basic-kokkos/TRAP_INT-Kokkos.cpp | 6 ++-- 
src/basic/ATOMIC_PI.hpp | 8 ++--- src/basic/DAXPY.hpp | 8 ++--- src/basic/IF_QUAD.hpp | 8 ++--- src/basic/INIT3.hpp | 8 ++--- src/basic/INIT_VIEW1D.hpp | 8 ++--- src/basic/INIT_VIEW1D_OFFSET.hpp | 8 ++--- src/basic/MULADDSUB.hpp | 8 ++--- src/basic/NESTED_INIT.hpp | 8 ++--- src/basic/REDUCE3_INT.hpp | 8 ++--- src/basic/TRAP_INT.hpp | 8 ++--- src/common/KernelBase.cpp | 33 ++----------------- src/common/KernelBase.hpp | 11 +------ src/common/RAJAPerfSuite.cpp | 18 +++------- src/common/RAJAPerfSuite.hpp | 13 ++------ 24 files changed, 80 insertions(+), 135 deletions(-) diff --git a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp b/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp index dce51b6f4..63e97ead2 100644 --- a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp +++ b/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp @@ -18,7 +18,7 @@ namespace basic { -void ATOMIC_PI::runKokkosSeqVariant(VariantID vid) +void ATOMIC_PI::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -71,7 +71,7 @@ void ATOMIC_PI::runKokkosSeqVariant(VariantID vid) break; } - case Kokkos_Lambda_Seq : { + case Kokkos_Lambda : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -83,7 +83,7 @@ void ATOMIC_PI::runKokkosSeqVariant(VariantID vid) // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); // }); - Kokkos::parallel_for("ATOMIC_PI-KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("ATOMIC_PI-KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), [=] (Index_type i) { double x = (double(i) + 0.5) * dx; Kokkos::atomic_add(pi, dx / (1.0 + x * x)); diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp index 659258eb0..cd36ed017 100644 --- a/src/basic-kokkos/DAXPY-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY-Kokkos.cpp @@ -25,7 +25,7 @@ struct DaxpyFunctor { void operator()(Index_type i) const { DAXPY_BODY; } }; -void DAXPY::runKokkosSeqVariant(VariantID vid) +void DAXPY::runKokkosVariant(VariantID vid) 
{ const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -42,10 +42,10 @@ void DAXPY::runKokkosSeqVariant(VariantID vid) switch ( vid ) { #if defined(RUN_RAJA_SEQ) - case Kokkos_Lambda_Seq: { + case Kokkos_Lambda: { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("DAXPY-KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("DAXPY-KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), [=](Index_type i) { DAXPY_BODY; }); } stopTimer(); diff --git a/src/basic-kokkos/IF_QUAD-Kokkos.cpp b/src/basic-kokkos/IF_QUAD-Kokkos.cpp index 68adde515..0837ab577 100644 --- a/src/basic-kokkos/IF_QUAD-Kokkos.cpp +++ b/src/basic-kokkos/IF_QUAD-Kokkos.cpp @@ -20,7 +20,7 @@ namespace basic // Kokkos-ify here -void IF_QUAD::runKokkosSeqVariant(VariantID vid) +void IF_QUAD::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -41,7 +41,7 @@ void IF_QUAD::runKokkosSeqVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) - case Kokkos_Lambda_Seq : { + case Kokkos_Lambda : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -50,7 +50,7 @@ void IF_QUAD::runKokkosSeqVariant(VariantID vid) RAJA::RangeSegment(ibegin, iend), ifquad_lam); */ // Translation - Kokkos::parallel_for("IF_QUAD_KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("IF_QUAD_KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), [=] (Index_type i) {IF_QUAD_BODY}); diff --git a/src/basic-kokkos/INIT3-Kokkos.cpp b/src/basic-kokkos/INIT3-Kokkos.cpp index e3fee117b..8bacd7adc 100644 --- a/src/basic-kokkos/INIT3-Kokkos.cpp +++ b/src/basic-kokkos/INIT3-Kokkos.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT3::runKokkosSeqVariant(VariantID vid) +void INIT3::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -68,7 +68,7 @@ void 
INIT3::runKokkosSeqVariant(VariantID vid) } // Nota bene -- Conversion of Raja code begins here - case Kokkos_Lambda_Seq : { + case Kokkos_Lambda : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -77,7 +77,7 @@ void INIT3::runKokkosSeqVariant(VariantID vid) // RAJA::RangeSegment(ibegin, iend), init3_lam); // Kokkos translation - Kokkos::parallel_for("INIT3-KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("INIT3-KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), [=] (Index_type i) {INIT3_BODY}); } stopTimer(); diff --git a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp index 38284ec76..3725f9ab4 100644 --- a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp @@ -18,7 +18,7 @@ namespace basic { -void INIT_VIEW1D::runKokkosSeqVariant(VariantID vid) +void INIT_VIEW1D::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -66,7 +66,7 @@ void INIT_VIEW1D::runKokkosSeqVariant(VariantID vid) } // AJP began modificaiton here - case Kokkos_Lambda_Seq : { + case Kokkos_Lambda : { INIT_VIEW1D_VIEW_RAJA; @@ -80,7 +80,7 @@ void INIT_VIEW1D::runKokkosSeqVariant(VariantID vid) // RAJA::forall( // RAJA::RangeSegment(ibegin, iend), initview1d_lam); //Kokkos translation - Kokkos::parallel_for("INIT_VIEW1D_KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin,iend), + Kokkos::parallel_for("INIT_VIEW1D_KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin,iend), [=] (Index_type i) {INIT_VIEW1D_BODY_RAJA}); } diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp index 844697a7c..5d55c51c4 100644 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp @@ -19,7 +19,7 @@ namespace basic -void INIT_VIEW1D_OFFSET::runKokkosSeqVariant(VariantID vid) +void 
INIT_VIEW1D_OFFSET::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; @@ -70,7 +70,7 @@ void INIT_VIEW1D_OFFSET::runKokkosSeqVariant(VariantID vid) // Conversion of Raja code to Kokkos starts here // - case Kokkos_Lambda_Seq : { + case Kokkos_Lambda : { INIT_VIEW1D_OFFSET_VIEW_RAJA; @@ -83,7 +83,7 @@ void INIT_VIEW1D_OFFSET::runKokkosSeqVariant(VariantID vid) // RAJA::forall( // RAJA::RangeSegment(ibegin, iend), initview1doffset_lam); - Kokkos::parallel_for("INIT_VIEW1D_OFFSET_KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), [=] (Index_type i) {INIT_VIEW1D_OFFSET_BODY_RAJA}); + Kokkos::parallel_for("INIT_VIEW1D_OFFSET_KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), [=] (Index_type i) {INIT_VIEW1D_OFFSET_BODY_RAJA}); } diff --git a/src/basic-kokkos/MULADDSUB-Kokkos.cpp b/src/basic-kokkos/MULADDSUB-Kokkos.cpp index 60f5231ac..2605901a9 100644 --- a/src/basic-kokkos/MULADDSUB-Kokkos.cpp +++ b/src/basic-kokkos/MULADDSUB-Kokkos.cpp @@ -18,7 +18,7 @@ namespace basic { -void MULADDSUB::runKokkosSeqVariant(VariantID vid) +void MULADDSUB::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -66,7 +66,7 @@ void MULADDSUB::runKokkosSeqVariant(VariantID vid) break; } - case Kokkos_Lambda_Seq : { + case Kokkos_Lambda : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -76,7 +76,7 @@ void MULADDSUB::runKokkosSeqVariant(VariantID vid) // // Kokkos translation // If SIMD really matters , consider using Kokkos SIMD - Kokkos::parallel_for("MULTISUB-KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("MULTISUB-KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), [=] (Index_type i) {MULADDSUB_BODY}); } diff --git a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp index 3f75f0517..c3c3f1fdf 100644 --- a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp +++ 
b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp @@ -231,7 +231,7 @@ void moveDataToHostFromKokkosView(PointedAt* kokkos_ptr, ExistingView my_view, B ////////////////////////////////////////////////////////////////////////////// -void NESTED_INIT::runKokkosSeqVariant(VariantID vid) +void NESTED_INIT::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -285,9 +285,9 @@ void NESTED_INIT::runKokkosSeqVariant(VariantID vid) break; } -// Kokkos_Lambda_Seq variant +// Kokkos_Lambda variant - case Kokkos_Lambda_Seq : { + case Kokkos_Lambda : { // Wrap the nested init array pointer in a Kokkos View // In a Kokkos View, array arguments for array boundaries go from outmost diff --git a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp index 0d7bd2c58..3253b9191 100644 --- a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp +++ b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp @@ -19,7 +19,7 @@ namespace basic { -void REDUCE3_INT::runKokkosSeqVariant(VariantID vid) +void REDUCE3_INT::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -84,7 +84,7 @@ void REDUCE3_INT::runKokkosSeqVariant(VariantID vid) break; } - case Kokkos_Lambda_Seq : { + case Kokkos_Lambda : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -110,7 +110,7 @@ void REDUCE3_INT::runKokkosSeqVariant(VariantID vid) // KOKKOS_LAMBDA IS A PRE-PROCESSOR DIRECTIVE; // It makes the capture clause on the lambda work for Host and Device - parallel_reduce("REDUCE3-KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + parallel_reduce("REDUCE3-KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), [=](const int64_t i, Int_type& tl_max, Int_type& tl_min, Int_type& tl_sum){ Int_type vec_i = vec[i]; diff --git a/src/basic-kokkos/TRAP_INT-Kokkos.cpp b/src/basic-kokkos/TRAP_INT-Kokkos.cpp index 3cfde60ce..49ba77c2a 100644 --- a/src/basic-kokkos/TRAP_INT-Kokkos.cpp +++ 
b/src/basic-kokkos/TRAP_INT-Kokkos.cpp @@ -32,7 +32,7 @@ Real_type trap_int_func(Real_type x, } -void TRAP_INT::runKokkosSeqVariant(VariantID vid) +void TRAP_INT::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -88,7 +88,7 @@ void TRAP_INT::runKokkosSeqVariant(VariantID vid) break; } - case Kokkos_Lambda_Seq : { + case Kokkos_Lambda : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -109,7 +109,7 @@ void TRAP_INT::runKokkosSeqVariant(VariantID vid) Real_type trap_integral_val = m_sumx_init; - Kokkos::parallel_reduce("TRAP_INT_KokkosSeq Kokkos_Lambda_Seq", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_reduce("TRAP_INT_KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), [=] (const int64_t i, Real_type& sumx) {TRAP_INT_BODY}, trap_integral_val ); diff --git a/src/basic/ATOMIC_PI.hpp b/src/basic/ATOMIC_PI.hpp index 4987e5a12..9a542c4d3 100644 --- a/src/basic/ATOMIC_PI.hpp +++ b/src/basic/ATOMIC_PI.hpp @@ -58,10 +58,10 @@ class ATOMIC_PI : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); #ifdef RUN_KOKKOS - void runKokkosSeqVariant(VariantID vid); - void runKokkosOpenMPVariant(VariantID vid); - void runKokkosCudaVariant(VariantID vid); - void runKokkosOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + + + #endif private: diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index f30890ea0..1484c907f 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -58,10 +58,10 @@ class DAXPY : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosSeqVariant(VariantID vid); - void runKokkosOpenMPVariant(VariantID vid); - void runKokkosCudaVariant(VariantID vid); - void runKokkosOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + + + private: Real_ptr m_x; Real_ptr m_y; diff --git a/src/basic/IF_QUAD.hpp 
b/src/basic/IF_QUAD.hpp index c110d208c..e646c5374 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -70,10 +70,10 @@ class IF_QUAD : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosSeqVariant(VariantID vid); - void runKokkosOpenMPVariant(VariantID vid); - void runKokkosCudaVariant(VariantID vid); - void runKokkosOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + + + private: Real_ptr m_a; diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index b518db649..daa3dfe4a 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -56,10 +56,10 @@ class INIT3 : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosSeqVariant(VariantID vid); - void runKokkosOpenMPVariant(VariantID vid); - void runKokkosCudaVariant(VariantID vid); - void runKokkosOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + + + private: Real_ptr m_out1; diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index b1807a168..b25397cc0 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -66,10 +66,10 @@ class INIT_VIEW1D : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosSeqVariant(VariantID vid); - void runKokkosOpenMPVariant(VariantID vid); - void runKokkosCudaVariant(VariantID vid); - void runKokkosOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + + + private: Real_ptr m_a; Real_type m_val; diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index 421f03b10..392fa61c0 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -65,10 +65,10 @@ class INIT_VIEW1D_OFFSET : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void 
runOpenMPTargetVariant(VariantID vid); - void runKokkosSeqVariant(VariantID vid); - void runKokkosOpenMPVariant(VariantID vid); - void runKokkosCudaVariant(VariantID vid); - void runKokkosOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + + + private: Real_ptr m_a; Real_type m_val; diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index a280d1b8f..b32ed4e21 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -58,10 +58,10 @@ class MULADDSUB : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosSeqVariant(VariantID vid); - void runKokkosOpenMPVariant(VariantID vid); - void runKokkosCudaVariant(VariantID vid); - void runKokkosOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + + + private: Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 20232cc99..745e9f6dd 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -58,10 +58,10 @@ class NESTED_INIT : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosSeqVariant(VariantID vid); - void runKokkosOpenMPVariant(VariantID vid); - void runKokkosCudaVariant(VariantID vid); - void runKokkosOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + + + private: Index_type m_array_length; diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index c119f727a..19b24be8d 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -70,10 +70,10 @@ class REDUCE3_INT : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosSeqVariant(VariantID vid); - void runKokkosOpenMPVariant(VariantID vid); - void runKokkosCudaVariant(VariantID vid); - void 
runKokkosOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + + + private: Int_ptr m_vec; Int_type m_vsum; diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 1240f5e2b..811c915d3 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -67,10 +67,10 @@ class TRAP_INT : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosSeqVariant(VariantID vid); - void runKokkosOpenMPVariant(VariantID vid); - void runKokkosCudaVariant(VariantID vid); - void runKokkosOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + + + private: Real_type m_x0; Real_type m_xp; diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 45127a4f8..5c4637d23 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -148,39 +148,12 @@ void KernelBase::runKernel(VariantID vid) } #if defined(RUN_KOKKOS) - case Kokkos_Lambda_Seq : - case Kokkos_Functor_Seq : + case Kokkos_Lambda : + case Kokkos_Functor : { - runKokkosSeqVariant(vid); + runKokkosVariant(vid); break; } - -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - case Kokkos_Lambda_OpenMP : - case Kokkos_Functor_OpenMP : - { - runKokkosOpenMPVariant(vid); - break; - } -#endif - -#if defined(RAJA_ENABLE_TARGET_OPENMP) - case Kokkos_Lambda_OpenMPTarget : - case Kokkos_Functor_OpenMPTarget : - { - runKokkosOpenMPTargetVariant(vid); - break; - } -#endif - -#if defined(RAJA_ENABLE_CUDA) - case Kokkos_Lambda_CUDA : - case Kokkos_Functor_CUDA : - { - runKokkosCudaVariant(vid); - break; - } -#endif #endif // RUN_KOKKOS default : { #if 0 diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index e2afc314a..4c27f09df 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -126,16 +126,7 @@ class KernelBase #endif #if defined(RUN_KOKKOS) - virtual void runKokkosSeqVariant(VariantID vid) = 0; -#if 
defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - virtual void runKokkosOpenMPVariant(VariantID vid) = 0; -#endif -#if defined(RAJA_ENABLE_CUDA) - virtual void runKokkosCudaVariant(VariantID vid) = 0; -#endif -#if defined(RAJA_ENABLE_TARGET_OPENMP) - virtual void runKokkosOpenMPTargetVariant(VariantID vid) = 0; -#endif + virtual void runKokkosVariant(VariantID vid) = 0; #endif // RUN_KOKKOS protected: diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index cfa043226..ab9a458dd 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -233,18 +233,8 @@ static const std::string VariantNames [] = std::string("Base_HIP"), std::string("RAJA_HIP"), - std::string("Kokkos_Lambda_Seq"), - std::string("Kokkos_Functor_Seq"), - - std::string("Kokkos_Lambda_OpenMP"), - std::string("Kokkos_Functor_OpenMP"), - - - std::string("Kokkos_Lambda_OpenMPTarget"), - std::string("Kokkos_Functor_OpenMPTarget"), - - std::string("Kokkos_Lambda_CUDA"), - std::string("Kokkos_Functor_CUDA"), + std::string("Kokkos_Lambda"), + std::string("Kokkos_Functor"), std::string("Unknown Variant") // Keep this at the end and DO NOT remove.... @@ -325,8 +315,8 @@ bool isVariantAvailable(VariantID vid) ret_val = true; } #if defined(RUN_KOKKOS) - if ( vid == Kokkos_Lambda_Seq || - vid == Kokkos_Functor_Seq ) { + if ( vid == Kokkos_Lambda || + vid == Kokkos_Functor ) { ret_val = true; } #endif // RUN_KOKKOS diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 44042d33c..fdd0fb973 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -206,17 +206,8 @@ enum VariantID { Base_HIP, RAJA_HIP, - Kokkos_Lambda_Seq, - Kokkos_Functor_Seq, - - Kokkos_Lambda_OpenMP, - Kokkos_Functor_OpenMP, - - Kokkos_Lambda_OpenMPTarget, - Kokkos_Functor_OpenMPTarget, - - Kokkos_Lambda_CUDA, - Kokkos_Functor_CUDA, + Kokkos_Lambda, + Kokkos_Functor, NumVariants // Keep this one last and NEVER comment out (!!) 
From 944e58a4f148b0c6dc4c3f3ee30c116e57625f15 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 2 Feb 2021 14:48:35 -0800 Subject: [PATCH 050/124] Removed all mentions of Kokkos specific backend implementations from RAJAPerf plumbing. To be done: remove the kernel implementations that still mention specific backends --- src/CMakeLists.txt | 4 ++-- src/basic-kokkos/DAXPY-Kokkos.cpp | 2 +- src/basic/ATOMIC_PI.cpp | 6 +++--- src/basic/DAXPY.cpp | 6 +++--- src/basic/IF_QUAD.cpp | 6 +++--- src/basic/INIT3.cpp | 6 +++--- src/basic/INIT_VIEW1D.cpp | 6 +++--- src/basic/INIT_VIEW1D_OFFSET.cpp | 6 +++--- src/basic/MULADDSUB.cpp | 6 +++--- src/basic/NESTED_INIT.cpp | 6 +++--- src/basic/REDUCE3_INT.cpp | 6 +++--- src/basic/TRAP_INT.cpp | 6 +++--- src/common/RAJAPerfSuite.cpp | 32 ------------------------------- src/common/RAJAPerfSuite.hpp | 4 ---- tpl/kokkos | 2 +- 15 files changed, 34 insertions(+), 70 deletions(-) mode change 160000 => 120000 tpl/kokkos diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1bae44838..dd18ad6fd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -12,7 +12,7 @@ add_subdirectory(common) #add_subdirectory(apps) add_subdirectory(basic) add_subdirectory(basic-kokkos) -add_subdirectory(kokkos-mechanics) +#add_subdirectory(kokkos-mechanics) #add_subdirectory(lcals) #add_subdirectory(polybench) #add_subdirectory(stream) @@ -22,7 +22,7 @@ set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS #apps basic basic-kokkos - kokkos-mechanics + #kokkos-mechanics #lcals #polybench #stream diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp index cd36ed017..0e8bf3cf5 100644 --- a/src/basic-kokkos/DAXPY-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY-Kokkos.cpp @@ -52,7 +52,7 @@ void DAXPY::runKokkosVariant(VariantID vid) break; } - case Kokkos_Functor_Seq: { + case Kokkos_Functor: { DaxpyFunctor daxpy_functor_instance(y,x,a); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { diff --git a/src/basic/ATOMIC_PI.cpp 
b/src/basic/ATOMIC_PI.cpp index c876147ba..83010da16 100644 --- a/src/basic/ATOMIC_PI.cpp +++ b/src/basic/ATOMIC_PI.cpp @@ -24,9 +24,9 @@ ATOMIC_PI::ATOMIC_PI(const RunParams& params) setDefaultSize(3000); setDefaultReps(10000); - setVariantDefined( Kokkos_Lambda_Seq ); - setVariantDefined( Kokkos_Lambda_OpenMP ); - setVariantDefined( Kokkos_Lambda_CUDA ); + setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 52988d2d9..47f3e40a6 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -41,9 +41,9 @@ DAXPY::DAXPY(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda_Seq ); - setVariantDefined( Kokkos_Lambda_OpenMP ); - setVariantDefined( Kokkos_Lambda_CUDA ); + setVariantDefined( Kokkos_Lambda ); + + diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index b9836a175..328ad2cb6 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -25,9 +25,9 @@ IF_QUAD::IF_QUAD(const RunParams& params) setDefaultReps(1800); - setVariantDefined( Kokkos_Lambda_Seq ); - setVariantDefined( Kokkos_Lambda_OpenMP ); - setVariantDefined( Kokkos_Lambda_CUDA ); + setVariantDefined( Kokkos_Lambda ); + + setVariantDefined( Base_Seq ); diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 20d8ff0f2..c3f9d6eaf 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -27,19 +27,19 @@ INIT3::INIT3(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); - setVariantDefined( Kokkos_Lambda_Seq ); + setVariantDefined( Kokkos_Lambda ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); setVariantDefined( RAJA_OpenMP ); - setVariantDefined( Kokkos_Lambda_OpenMP ); + setVariantDefined( Base_OpenMPTarget ); setVariantDefined( RAJA_OpenMPTarget ); setVariantDefined( Base_CUDA ); setVariantDefined( RAJA_CUDA ); 
- setVariantDefined( Kokkos_Lambda_CUDA); + setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index 42b2588ac..31ed82a04 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -41,9 +41,9 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda_Seq ); - setVariantDefined( Kokkos_Lambda_OpenMP ); - setVariantDefined( Kokkos_Lambda_CUDA ); + setVariantDefined( Kokkos_Lambda ); + + } diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 0f810547b..4f5c686c3 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -41,9 +41,9 @@ INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda_Seq ); - setVariantDefined( Kokkos_Lambda_OpenMP ); - setVariantDefined( Kokkos_Lambda_CUDA ); + setVariantDefined( Kokkos_Lambda ); + + } diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index f3c2b6191..7cfaca709 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -41,9 +41,9 @@ MULADDSUB::MULADDSUB(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda_Seq ); - setVariantDefined( Kokkos_Lambda_OpenMP ); - setVariantDefined( Kokkos_Lambda_CUDA ); + setVariantDefined( Kokkos_Lambda ); + + } diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index 3a697e257..c038ea05f 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -48,9 +48,9 @@ NESTED_INIT::NESTED_INIT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda_Seq ); - setVariantDefined( Kokkos_Lambda_OpenMP ); - setVariantDefined( Kokkos_Lambda_CUDA ); + setVariantDefined( 
Kokkos_Lambda ); + + } NESTED_INIT::~NESTED_INIT() diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 14e13a6ef..41b6050f8 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -46,9 +46,9 @@ REDUCE3_INT::REDUCE3_INT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda_Seq ); - setVariantDefined( Kokkos_Lambda_OpenMP ); - setVariantDefined( Kokkos_Lambda_CUDA ); + setVariantDefined( Kokkos_Lambda ); + + } diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index 3dde1e237..f28286919 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -41,9 +41,9 @@ TRAP_INT::TRAP_INT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined( Kokkos_Lambda_Seq ); - setVariantDefined( Kokkos_Lambda_OpenMP ); - setVariantDefined( Kokkos_Lambda_CUDA ); + setVariantDefined( Kokkos_Lambda ); + + } diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index ab9a458dd..64b93d282 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -77,8 +77,6 @@ #include "apps/PRESSURE.hpp" #include "apps/VOL3D.hpp" -#include "kokkos-mechanics/ViewAllocate.hpp" -#include "kokkos-mechanics/ViewStreamAdd.hpp" #include @@ -193,9 +191,6 @@ static const std::string KernelNames [] = // std::string("Apps_PRESSURE"), // std::string("Apps_VOL3D"), - std::string("KokkosMechanics_ViewAllocate"), - std::string("KokkosMechanics_ViewStreamAdd"), - std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... 
}; // END KernelNames @@ -329,12 +324,6 @@ bool isVariantAvailable(VariantID vid) vid == RAJA_OpenMP ) { ret_val = true; } -#if defined(RUN_KOKKOS) - if ( vid == Kokkos_Lambda_OpenMP || - vid == Kokkos_Functor_OpenMP ) { - ret_val = true; - } -#endif // RUN_KOKKOS #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) @@ -342,12 +331,6 @@ bool isVariantAvailable(VariantID vid) vid == RAJA_OpenMPTarget ) { ret_val = true; } -#if defined(RUN_KOKKOS) - if ( vid == Kokkos_Lambda_OpenMPTarget || - vid == Kokkos_Functor_OpenMPTarget ) { - ret_val = true; - } -#endif // RUN_KOKKOS #endif #if defined(RAJA_ENABLE_CUDA) @@ -355,12 +338,6 @@ bool isVariantAvailable(VariantID vid) vid == RAJA_CUDA ) { ret_val = true; } -#if defined(RUN_KOKKOS) - if ( vid == Kokkos_Lambda_CUDA || - vid == Kokkos_Functor_CUDA ) { - ret_val = true; - } -#endif // RUN_KOKKOS #endif #if defined(RAJA_ENABLE_HIP) @@ -599,15 +576,6 @@ KernelBase* getKernelObject(KernelID kid, } */ - case KokkosMechanics_ViewAllocate : { - kernel = new kokkos_mechanics::ViewAllocate(run_params); - break; - } - - case KokkosMechanics_ViewStreamAdd: { - kernel = new kokkos_mechanics::ViewStreamAdd(run_params); - break; - } default: { std::cout << "\n Unknown Kernel ID = " << kid << std::endl; } diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index fdd0fb973..312f738ff 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -166,10 +166,6 @@ enum KernelID { //Apps_PRESSURE, //Apps_VOL3D, - // Kokkos Mechanics Tests - KokkosMechanics_ViewAllocate, - KokkosMechanics_ViewStreamAdd, - NumKernels // Keep this one last and NEVER comment out (!!) 
}; diff --git a/tpl/kokkos b/tpl/kokkos deleted file mode 160000 index d680eabdb..000000000 --- a/tpl/kokkos +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d680eabdbccc9b30ce2708b1446507cd860d94e9 diff --git a/tpl/kokkos b/tpl/kokkos new file mode 120000 index 000000000..412bf0abb --- /dev/null +++ b/tpl/kokkos @@ -0,0 +1 @@ +/ascldap/users/dzpolia/src/kokkos \ No newline at end of file From a3d406155234856df058e43bb6dfd2eb412cb7d9 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 9 Feb 2021 13:46:23 -0800 Subject: [PATCH 051/124] tpl/kokkos: unlinking sym link & cloning kokkos --- tpl/kokkos | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 120000 => 160000 tpl/kokkos diff --git a/tpl/kokkos b/tpl/kokkos deleted file mode 120000 index 412bf0abb..000000000 --- a/tpl/kokkos +++ /dev/null @@ -1 +0,0 @@ -/ascldap/users/dzpolia/src/kokkos \ No newline at end of file diff --git a/tpl/kokkos b/tpl/kokkos new file mode 160000 index 000000000..4af934941 --- /dev/null +++ b/tpl/kokkos @@ -0,0 +1 @@ +Subproject commit 4af9349419d10d14fa42f0eb1bbcf8d8054f29ff From 4f738c8c2049601078c4105fb4e49e7b92bcc552 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 15 Feb 2021 16:03:03 -0800 Subject: [PATCH 052/124] DAXPY kernel with KokkosViews for pointer x, pointer y --- src/basic-kokkos/DAXPY-Kokkos.cpp | 29 +- src/basic-kokkos/NESTED_INIT-Kokkos.cpp | 378 +++++------------------- src/common/RAJAPerfSuite.hpp | 210 +++++++++++++ 3 files changed, 315 insertions(+), 302 deletions(-) diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp index 0e8bf3cf5..3ff7a56d7 100644 --- a/src/basic-kokkos/DAXPY-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY-Kokkos.cpp @@ -33,10 +33,23 @@ void DAXPY::runKokkosVariant(VariantID vid) DAXPY_DATA_SETUP; + // Declare KokkosViews for the pointers that will be wrapped. 
Find out + // which pointers in the KERNEL_NAME.hpp files + // Wrap pointers x and y in separate KokkosViews + // This is for one dimension; one dimensional things are indexed to iend. + + auto x_view = getViewFromPointer(x, iend); + + auto y_view = getViewFromPointer(y, iend); + + + auto daxpy_lam = [=](Index_type i) { DAXPY_BODY; }; + + #if defined(RUN_KOKKOS) switch ( vid ) { @@ -44,9 +57,11 @@ void DAXPY::runKokkosVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Kokkos_Lambda: { startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("DAXPY-KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - [=](Index_type i) { DAXPY_BODY; }); + Kokkos::parallel_for("DAXPY-Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { y_view[i] += a * x_view[i];} + ); } stopTimer(); @@ -70,6 +85,16 @@ void DAXPY::runKokkosVariant(VariantID vid) } + // Moving data back to the host + + moveDataToHostFromKokkosView(x, x_view, iend); + + moveDataToHostFromKokkosView(y, y_view, iend); + + + + + #endif // RUN_KOKKOS } diff --git a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp index c3c3f1fdf..1d874472e 100644 --- a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp +++ b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp @@ -12,334 +12,112 @@ #include -namespace rajaperf -{ -namespace basic -{ - -//Kokkos Design Spirit: -//WE NEED: -//1) Use KokkosViews --> a wrapper around pointers for host and device memory -//management -//2) Use default execution space -// -// -// -// -// NEW FUNCTION WILL: -// 1) Take in a raw pointer (e.g., float*, int*, etc.) -// 2) From this pointer, return a Kokkos::View -// -// Return type : Kokkos::View -// Kokkos::View takes tempalted arguments -// To write "generically" implies templated arguments -// https://eli.thegreenplace.net/2014/variadic-templates-in-c/ -// -template - - -// This is a TEMPLATED STRUCT. 
This struct will contain the type of a pointer of n dimensions -// This struct is templated on the template that immediately precedes the struct declaration. -struct PointerOfNdimensions; - -// This template block declares a specialization, which means that you say the -// template arguments that you're NOT specializing -template - -// Here, we are specialising a template according to the type of argument that -// is passed. In this example, we've specialized the PointedAt template -// argument for the case that the number of dimensions is 0. All we will do in -// this struct is to define a type. - -// This struct is a specialization of : -// template -struct PointerOfNdimensions { - // "using" is a type alias - // if you derefernce a pointer, you're just left with an object, the value - // of that pointer - using type = PointedAt; -}; - -// NO SPECIALIZATION, i.e., we fix no templated arguments -template - -struct PointerOfNdimensions { - // PointerOfNdimensions is a type - // My type is a pointer to the type of myself, decremented - using type = typename PointerOfNdimensions::type*; - -}; - - -template - -// FUNCTION THAT GETS A VIEW FROM A POINTER WITH RETURN TYPE KOKKOS::VIEW -// -auto getViewFromPointer(PointedAt* kokkos_ptr, Boundaries... 
boundaries) - // Recall: PointerOfNdimensions is struct that exists solely to hold a - // type - // -> connotes "return type after the arrow" - -> typename Kokkos::View< - typename PointerOfNdimensions ::type, - //typename Kokkos::DefaultHostExecutionSpace::memory_space> - //This more generic expression allow moving the - //View-wrapped pointer b/w - //Host and GPU - typename Kokkos::DefaultExecutionSpace::memory_space> - - -{ - // This says construct the pointer_holder variable from arguments passed to - // the template block - // - using host_view_type = typename Kokkos::View< - typename PointerOfNdimensions ::type, - typename Kokkos::DefaultHostExecutionSpace::memory_space>; - - // FYI - Device can be GPU, OpenMPTarget, HIP (for targeting an AMD GPU), SYCL (library in Intel - // Compiler) - // - using device_view_type = typename Kokkos::View< - typename PointerOfNdimensions ::type, - typename Kokkos::DefaultExecutionSpace::memory_space>; - - - - // When copying data, we can either change the Layout or the memory_space - // (host or device), but we cannot change both! - // Here, we are mirroring data on the host to the device, i.e., Layout is - // as if on the device, but the data is actually on the host. The host - // mirror will be Layout Left (optimal for the device), but data are - // actually on the HOST! - - // Here, "using" is type alias; in this example,its our gpu Layout on cpu - using mirror_view_type = typename device_view_type::HostMirror; - - // Assignment statement; we are constructing a host_view_type with the name pointer_holder. The value of kokkos_ptr - // is the pointer we're wrapping on the Host, and the Boundaries parameter - // pack values, boundaries, will also be part of this this host_view_type - // object. 
- - host_view_type pointer_holder (kokkos_ptr, boundaries...); - - // boundaries will contain the array dimenions; an allocation is implicitly made here - device_view_type device_data_copy ( "StringName", boundaries...); - - mirror_view_type cpu_to_gpu_mirror = Kokkos::create_mirror_view(device_data_copy); - - // We need to deep_copy our existing data, the contents of - // pointer_holder, into the mirror_view; - // Copying from Host to Device has two steps: 1) Change the layout, 2) - // change the memory_space (host or device). Step 1 is to change the - // layout to enable sending data from CPU to GPU. Step 2 is actually - // sending the optimal data layout to the GPU - - // This step changes the Layout to be optimal for the gpu - Kokkos::deep_copy(cpu_to_gpu_mirror, pointer_holder); - - - // The mirror view data layout on the HOST is like the layout for the GPU. GPU-optimized layouts are LayoutLeft, - // i.e., column-major - // This deep_copy copy GPU-layout data on the HOST to the Device - - // Actual copying of the data from the host to the gpu - Kokkos::deep_copy(device_data_copy, cpu_to_gpu_mirror); - - - // Kokkos::View return type - - return device_data_copy; - -} - -/////////////////////////////////////////////////////////////////////////////// -//THIS FUNCTION WILL MOVE DATA IN A KOKKOS::VIEW BACK TO HOST FROM DEVICE, AND -//STORE IN AN EXISTING POINTER -/////////////////////////////////////////////////////////////////////////////// - - - -template - -// DEFINING FUNCTION THAT GETS A VIEW FROM A POINTER WITH RETURN TYPE KOKKOS::VIEW -//"my_view" parameter is equivalent to device_data_copy -// -void moveDataToHostFromKokkosView(PointedAt* kokkos_ptr, ExistingView my_view, Boundaries... 
boundaries) - -{ - // This says construct the pointer_holder variable from arguments passed to - // the template block - // - using host_view_type = typename Kokkos::View< - typename PointerOfNdimensions ::type, - typename Kokkos::DefaultHostExecutionSpace::memory_space>; - - // FYI - Device can be GPU, OpenMPTarget, HIP (for targeting an AMD GPU), SYCL (library in Intel - // Compiler) - // - using device_view_type = typename Kokkos::View< - typename PointerOfNdimensions ::type, - typename Kokkos::DefaultExecutionSpace::memory_space>; - - - - // When copying data, we can either change the Layout or the memory_space - // (host or device), but we cannot change both! - // Here, we are mirroring data on the host to the device, i.e., Layout is - // as if on the device, but the data is actually on the host. The host - // mirror will be Layout Left (optimal for the device), but data are - // actually on the HOST! - - // Here, "using" is type alias; in this example,its our gpu Layout on cpu - using mirror_view_type = typename device_view_type::HostMirror; - - // Assignment statement; we are constructing a host_view_type with the name pointer_holder. The value of kokkos_ptr - // is the pointer we're wrapping on the Host, and the Boundaries parameter - // pack values, boundaries, will also be part of this this host_view_type - // object. - - host_view_type pointer_holder (kokkos_ptr, boundaries...); - - // Layout is optimal for gpu, but located on CPU - mirror_view_type cpu_to_gpu_mirror = Kokkos::create_mirror_view(my_view); - - - - // We need to deep_copy our existing data, the contents of - // pointer_holder, into the mirror_view; - // Copying from Host to Device has two steps: 1) Change the layout, 2) - // change the memory_space (host or device). Step 1 is to change the - // layout to enable sending data from CPU to GPU. 
Step 2 is actually - // sending the optimal data layout to the GPU - - // This step changes the Layout to be optimal for the gpu - - - // The mirror view data layout on the HOST is like the layout for the GPU. GPU-optimized layouts are LayoutLeft, - // i.e., column-major - // This deep_copy copy GPU-layout data on the HOST to the Device - - // Actual copying of the data from the gpu to the cpu - Kokkos::deep_copy(cpu_to_gpu_mirror, my_view); - - //This copies from the mirror on the cpu - Kokkos::deep_copy(pointer_holder, cpu_to_gpu_mirror); - -} - - -////////////////////////////////////////////////////////////////////////////// - -void NESTED_INIT::runKokkosVariant(VariantID vid) -{ +namespace rajaperf { +namespace basic { +//////////////////////////////////////////////////////////// +void NESTED_INIT::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); - NESTED_INIT_DATA_SETUP; auto nestedinit_lam = [=](Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - }; + NESTED_INIT_BODY; + }; #if defined RUN_KOKKOS - switch ( vid ) { + switch (vid) { - case Base_Seq : { + case Base_Seq: { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type k = 0; k < nk; ++k ) { - for (Index_type j = 0; j < nj; ++j ) { - for (Index_type i = 0; i < ni; ++i ) { - NESTED_INIT_BODY; - } + for (Index_type k = 0; k < nk; ++k) { + for (Index_type j = 0; j < nj; ++j) { + for (Index_type i = 0; i < ni; ++i) { + NESTED_INIT_BODY; } } - } - stopTimer(); - - break; } + stopTimer(); + + break; + } #if defined(RUN_RAJA_SEQ) - case Lambda_Seq : { + case Lambda_Seq: { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type k = 0; k < nk; ++k ) { - for (Index_type j = 0; j < nj; ++j ) { - for (Index_type i = 0; i < ni; ++i ) { - nestedinit_lam(i, j, 
k); - } - } + for (Index_type k = 0; k < nk; ++k) { + for (Index_type j = 0; j < nj; ++j) { + for (Index_type i = 0; i < ni; ++i) { + nestedinit_lam(i, j, k); } - + } } - stopTimer(); - - break; } + stopTimer(); -// Kokkos_Lambda variant - - case Kokkos_Lambda : { - - // Wrap the nested init array pointer in a Kokkos View - // In a Kokkos View, array arguments for array boundaries go from outmost - // to innermost dimension sizes - // See the basic NESTED_INIT.hpp file for defnition of NESTED_INIT - - auto array_kokkos_view = getViewFromPointer(array, nk, nj, ni); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - - // MDRange can be optimized - - Kokkos::parallel_for("NESTED_INIT KokkosSeq", - // Range policy - Kokkos::MDRangePolicy, - // Execution space - Kokkos::DefaultExecutionSpace>({0,0,0}, {nk,nj,ni}), - // Loop body - KOKKOS_LAMBDA(Index_type k, Index_type j, Index_type i) { - //NESTED_INIT_BODY no longer useful, because we're not - //operating on the array, but on the Kokkos::View - // array_kokkos_view created to hold value for - // getViewFromPointer(array, nk, nj, ni) - // MD Views are index'ed via "()" - // - // KOKKOS-FIED translation of NESTED_INIT_BODY: - // #define NESTED_INIT_BODY - // array[i+ni*(j+nj*k)] = 0.00000001 * i * j * k ; - // - array_kokkos_view(k, j, i) = 0.00000001 * i * j * k; - } -); - - } - stopTimer(); - // "Moves" mirror data from GPU to CPU (void, i.e., no retrun type). In - // this moving of data back to Host, the layout is changed back to Layout - // Right, vs. 
the LayoutLeft of the GPU - moveDataToHostFromKokkosView(array, array_kokkos_view, nk, nj, ni); + break; + } - break; + // Kokkos_Lambda variant + + case Kokkos_Lambda: { + + // Wrap the nested init array pointer in a Kokkos View + // In a Kokkos View, array arguments for array boundaries go from outmost + // to innermost dimension sizes + // See the basic NESTED_INIT.hpp file for defnition of NESTED_INIT + + auto array_kokkos_view = getViewFromPointer(array, nk, nj, ni); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // MDRange can be optimized + + Kokkos::parallel_for( + "NESTED_INIT KokkosSeq", + // Range policy + Kokkos::MDRangePolicy, + // Execution space + Kokkos::DefaultExecutionSpace>({0, 0, 0}, + {nk, nj, ni}), + // Loop body + KOKKOS_LAMBDA(Index_type k, Index_type j, Index_type i) { + // NESTED_INIT_BODY no longer useful, because we're not + // operating on the array, but on the Kokkos::View + // array_kokkos_view created to hold value for + // getViewFromPointer(array, nk, nj, ni) + // MD Views are index'ed via "()" + // + // KOKKOS-FIED translation of NESTED_INIT_BODY: + // #define NESTED_INIT_BODY + // array[i+ni*(j+nj*k)] = 0.00000001 * i * j * k ; + // + array_kokkos_view(k, j, i) = 0.00000001 * i * j * k; + }); } -#endif // RUN_RAJA_SEQ + stopTimer(); + // "Moves" mirror data from GPU to CPU (void, i.e., no retrun type). In + // this moving of data back to Host, the layout is changed back to Layout + // Right, vs. 
the LayoutLeft of the GPU + moveDataToHostFromKokkosView(array, array_kokkos_view, nk, nj, ni); - default : { - std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; - } + break; + } +#endif // RUN_RAJA_SEQ + default: { + std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; + } } -#endif //RUN_KOKKOS +#endif // RUN_KOKKOS } } // end namespace basic diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 312f738ff..eaad2efbb 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -24,6 +24,216 @@ namespace rajaperf { +#if defined(RUN_KOKKOS) + +// Kokkos Design Spirit: +// WE NEED: +// 1) Use KokkosViews --> a wrapper around pointers for host and device memory +// management +// 2) Use default execution space +// +// +// +// +// NEW FUNCTION WILL: +// 1) Take in a raw pointer (e.g., float*, int*, etc.) +// 2) From this pointer, return a Kokkos::View +// +// Return type : Kokkos::View +// Kokkos::View takes tempalted arguments +// To write "generically" implies templated arguments +// https://eli.thegreenplace.net/2014/variadic-templates-in-c/ +// +template + +// This is a TEMPLATED STRUCT. This struct will contain the type of a pointer +// of n dimensions This struct is templated on the template that immediately precedes the struct declaration. +struct PointerOfNdimensions; + +// This template block declares a specialization, which means that you say the +// template arguments that you're NOT specializing +template + +// Here, we are specialising a template according to the type of argument that +// is passed. In this example, we've specialized the PointedAt template +// argument for the case that the number of dimensions is 0. All we will do in +// this struct is to define a type. 
+ +// This struct is a specialization of : +// template +struct PointerOfNdimensions { + // "using" is a type alias + // if you derefernce a pointer, you're just left with an object, the value + // of that pointer + using type = PointedAt; +}; + +// NO SPECIALIZATION, i.e., we fix no templated arguments +template + +struct PointerOfNdimensions { + // PointerOfNdimensions is a type + // My type is a pointer to the type of myself, decremented + using type = + typename PointerOfNdimensions::type *; +}; + +template + +// FUNCTION THAT GETS A VIEW FROM A POINTER WITH RETURN TYPE KOKKOS::VIEW +// + + +auto getViewFromPointer(PointedAt *kokkos_ptr, Boundaries... boundaries) + // Recall: PointerOfNdimensions is struct that exists solely to hold a + // type + // -> connotes "return type after the arrow" + -> typename Kokkos::View< + typename PointerOfNdimensions::type, + // typename Kokkos::DefaultHostExecutionSpace::memory_space> + // This more generic expression allow moving the + // View-wrapped pointer b/w + // Host and GPU + typename Kokkos::DefaultExecutionSpace::memory_space> + +{ + // This says construct the pointer_holder variable from arguments passed to + // the template block + // Declaration of a type alias, host_view_type + + using host_view_type = typename Kokkos::View< + // in the line below , you are using the type alias that is the memeber + // of a struct + + typename PointerOfNdimensions::type, + typename Kokkos::DefaultHostExecutionSpace::memory_space>; + + // FYI - Device can be GPU, OpenMPTarget, HIP (for targeting an AMD GPU), SYCL + // (library in Intel Compiler) + // + using device_view_type = typename Kokkos::View< + typename PointerOfNdimensions::type, + typename Kokkos::DefaultExecutionSpace::memory_space>; + + // When copying data, we can either change the Layout or the memory_space + // (host or device), but we cannot change both! 
+ // Here, we are mirroring data on the host to the device, i.e., Layout is + // as if on the device, but the data is actually on the host. The host + // mirror will be Layout Left (optimal for the device), but data are + // actually on the HOST! + + // Here, "using" is type alias; in this example,its our gpu Layout on cpu + using mirror_view_type = typename device_view_type::HostMirror; + + // Assignment statement; we are constructing a host_view_type with the name + // pointer_holder. The value of kokkos_ptr is the pointer we're wrapping on + // the Host, and the Boundaries parameter pack values, boundaries, will also + // be part of this this host_view_type object. + + host_view_type pointer_holder(kokkos_ptr, boundaries...); + + // boundaries will contain the array dimenions; an allocation is implicitly + // made here + device_view_type device_data_copy("StringName", boundaries...); + + mirror_view_type cpu_to_gpu_mirror = + Kokkos::create_mirror_view(device_data_copy); + + // We need to deep_copy our existing data, the contents of + // pointer_holder, into the mirror_view; + // Copying from Host to Device has two steps: 1) Change the layout, 2) + // change the memory_space (host or device). Step 1 is to change the + // layout to enable sending data from CPU to GPU. Step 2 is actually + // sending the optimal data layout to the GPU + + // This step changes the Layout to be optimal for the gpu + Kokkos::deep_copy(cpu_to_gpu_mirror, pointer_holder); + + // The mirror view data layout on the HOST is like the layout for the GPU. 
+ // GPU-optimized layouts are LayoutLeft, i.e., column-major This deep_copy + // copy GPU-layout data on the HOST to the Device + + // Actual copying of the data from the host to the gpu + Kokkos::deep_copy(device_data_copy, cpu_to_gpu_mirror); + + // Kokkos::View return type + + return device_data_copy; +} + +/////////////////////////////////////////////////////////////////////////////// +// THIS FUNCTION WILL MOVE DATA IN A KOKKOS::VIEW BACK TO HOST FROM DEVICE, AND +// STORE IN AN EXISTING POINTER +/////////////////////////////////////////////////////////////////////////////// + +template + +// DEFINING FUNCTION THAT GETS A VIEW FROM A POINTER WITH RETURN TYPE +// KOKKOS::VIEW +//"my_view" parameter is equivalent to device_data_copy +// +void moveDataToHostFromKokkosView(PointedAt *kokkos_ptr, ExistingView my_view, + Boundaries... boundaries) + +{ + // This says construct the pointer_holder variable from arguments passed to + // the template block + // + using host_view_type = typename Kokkos::View< + typename PointerOfNdimensions::type, + typename Kokkos::DefaultHostExecutionSpace::memory_space>; + + // FYI - Device can be GPU, OpenMPTarget, HIP (for targeting an AMD GPU), SYCL + // (library in Intel Compiler) + // + using device_view_type = typename Kokkos::View< + typename PointerOfNdimensions::type, + typename Kokkos::DefaultExecutionSpace::memory_space>; + + // When copying data, we can either change the Layout or the memory_space + // (host or device), but we cannot change both! + // Here, we are mirroring data on the host to the device, i.e., Layout is + // as if on the device, but the data is actually on the host. The host + // mirror will be Layout Left (optimal for the device), but data are + // actually on the HOST! 
+ + // Here, "using" is type alias; in this example,its our gpu Layout on cpu + using mirror_view_type = typename device_view_type::HostMirror; + + // Assignment statement; we are constructing a host_view_type with the name + // pointer_holder. The value of kokkos_ptr is the pointer we're wrapping on + // the Host, and the Boundaries parameter pack values, boundaries, will also + // be part of this this host_view_type object. + + host_view_type pointer_holder(kokkos_ptr, boundaries...); + + // Layout is optimal for gpu, but located on CPU + mirror_view_type cpu_to_gpu_mirror = Kokkos::create_mirror_view(my_view); + + // We need to deep_copy our existing data, the contents of + // pointer_holder, into the mirror_view; + // Copying from Host to Device has two steps: 1) Change the layout, 2) + // change the memory_space (host or device). Step 1 is to change the + // layout to enable sending data from CPU to GPU. Step 2 is actually + // sending the optimal data layout to the GPU + + // This step changes the Layout to be optimal for the gpu + + // The mirror view data layout on the HOST is like the layout for the GPU. 
+ // GPU-optimized layouts are LayoutLeft, i.e., column-major This deep_copy + // copy GPU-layout data on the HOST to the Device + + // Actual copying of the data from the gpu to the cpu + Kokkos::deep_copy(cpu_to_gpu_mirror, my_view); + + // This copies from the mirror on the cpu + Kokkos::deep_copy(pointer_holder, cpu_to_gpu_mirror); +} + + +#endif // RUN_KOKKOS + class KernelBase; class RunParams; From ce4079e1d7b72a440e673b06886f98c01780b345 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 16 Feb 2021 14:54:01 -0800 Subject: [PATCH 053/124] IF_QUAD.cpp: re-write basic kernels using KokkosView --- src/basic-kokkos/IF_QUAD-Kokkos.cpp | 35 +++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/src/basic-kokkos/IF_QUAD-Kokkos.cpp b/src/basic-kokkos/IF_QUAD-Kokkos.cpp index 0837ab577..673e9f64f 100644 --- a/src/basic-kokkos/IF_QUAD-Kokkos.cpp +++ b/src/basic-kokkos/IF_QUAD-Kokkos.cpp @@ -18,6 +18,7 @@ namespace basic { + // Kokkos-ify here void IF_QUAD::runKokkosVariant(VariantID vid) @@ -28,11 +29,22 @@ void IF_QUAD::runKokkosVariant(VariantID vid) IF_QUAD_DATA_SETUP; + // Instantiating views using getViewFromPointer for the IF_QUAD definition + + auto a_view = getViewFromPointer(a, iend); + auto b_view = getViewFromPointer(b, iend); + auto c_view = getViewFromPointer(c, iend); + auto x1_view = getViewFromPointer(x1, iend); + auto x2_view = getViewFromPointer(x2, iend); + + auto ifquad_lam = [=](Index_type i) { IF_QUAD_BODY; }; + + #if defined(RUN_KOKKOS) switch ( vid ) { @@ -50,14 +62,27 @@ void IF_QUAD::runKokkosVariant(VariantID vid) RAJA::RangeSegment(ibegin, iend), ifquad_lam); */ // Translation - Kokkos::parallel_for("IF_QUAD_KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("IF_QUAD_Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - [=] (Index_type i) {IF_QUAD_BODY}); + KOKKOS_LAMBDA (Index_type i) { + + Real_type s = b_view[i]*b_view[i] - 4.0*a_view[i]*c_view[i]; + if ( s >= 
0 ) { + s = sqrt(s); + x2_view[i] = (-b_view[i]+s)/(2.0*a_view[i]); + x1_view[i] = (-b_view[i]-s)/(2.0*a_view[i]); + } else { + x2_view[i] = 0.0; + x1_view[i] = 0.0; + + + }}); } stopTimer(); break; + } #endif // RUN_RAJA_SEQ @@ -69,6 +94,12 @@ void IF_QUAD::runKokkosVariant(VariantID vid) #endif // RUN_KOKKOS + moveDataToHostFromKokkosView(a, a_view, iend); + moveDataToHostFromKokkosView(b, b_view, iend); + moveDataToHostFromKokkosView(c, c_view, iend); + moveDataToHostFromKokkosView(x1, x1_view, iend); + moveDataToHostFromKokkosView(x2, x2_view, iend); + } From fc4bb07ffa9fea5b6f0f8be4479886c0aeadcba1 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 17 Feb 2021 13:18:15 -0800 Subject: [PATCH 054/124] add Kokkos::fence() to kernel test --- src/basic-kokkos/DAXPY-Kokkos.cpp | 26 +++++++++++++++++++------ src/basic-kokkos/IF_QUAD-Kokkos.cpp | 4 ++++ src/basic-kokkos/NESTED_INIT-Kokkos.cpp | 5 +++++ 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp index 3ff7a56d7..f9b424afd 100644 --- a/src/basic-kokkos/DAXPY-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY-Kokkos.cpp @@ -33,10 +33,12 @@ void DAXPY::runKokkosVariant(VariantID vid) DAXPY_DATA_SETUP; - // Declare KokkosViews for the pointers that will be wrapped. Find out - // which pointers in the KERNEL_NAME.hpp files + // Declare KokkosViews for the pointers that will be wrapped. + // Get pointer names in the KERNEL_NAME.hpp file // Wrap pointers x and y in separate KokkosViews - // This is for one dimension; one dimensional things are indexed to iend. 
+ // This is a one dimension array + // One dimensional arrays are indexed to iend (RAJAPerfSuite convention) + // New template-based machinery in /rajaperf/src/common/RAJAPerfSuite.hpp auto x_view = getViewFromPointer(x, iend); @@ -56,13 +58,23 @@ void DAXPY::runKokkosVariant(VariantID vid) #if defined(RUN_RAJA_SEQ) case Kokkos_Lambda: { + + Kokkos::fence(); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Kokkos::parallel_for("DAXPY-Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - KOKKOS_LAMBDA(Index_type i) { y_view[i] += a * x_view[i];} + // Increment y_view (pointer wrapped in KokksView) + // by product of a and ith entry of x_view + // DAXPY_BODY substituted with the + // calculation defined in DAXPY.hpp + KOKKOS_LAMBDA(Index_type i) { y_view[i] += a * x_view[i];} ); } + // Kokkos fence + Kokkos::fence(); + stopTimer(); break; @@ -75,7 +87,9 @@ void DAXPY::runKokkosVariant(VariantID vid) daxpy_functor_instance); } stopTimer(); - + + + break; } #endif // RUN_RAJA_SEQ @@ -85,7 +99,7 @@ void DAXPY::runKokkosVariant(VariantID vid) } - // Moving data back to the host + // Moving all data (i.e., pointer, KokkosView-wrapped ponter) back to the host from the device moveDataToHostFromKokkosView(x, x_view, iend); diff --git a/src/basic-kokkos/IF_QUAD-Kokkos.cpp b/src/basic-kokkos/IF_QUAD-Kokkos.cpp index 673e9f64f..2377d2599 100644 --- a/src/basic-kokkos/IF_QUAD-Kokkos.cpp +++ b/src/basic-kokkos/IF_QUAD-Kokkos.cpp @@ -55,7 +55,9 @@ void IF_QUAD::runKokkosVariant(VariantID vid) case Kokkos_Lambda : { + Kokkos::fence(); startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { /* RAJA::forall( @@ -79,6 +81,8 @@ void IF_QUAD::runKokkosVariant(VariantID vid) }}); } + + Kokkos::fence(); stopTimer(); break; diff --git a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp index 1d874472e..da92d320f 100644 --- a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp +++ b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp @@ 
-75,6 +75,8 @@ void NESTED_INIT::runKokkosVariant(VariantID vid) { // See the basic NESTED_INIT.hpp file for defnition of NESTED_INIT auto array_kokkos_view = getViewFromPointer(array, nk, nj, ni); + + Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -103,6 +105,9 @@ void NESTED_INIT::runKokkosVariant(VariantID vid) { array_kokkos_view(k, j, i) = 0.00000001 * i * j * k; }); } + + Kokkos::fence(); + stopTimer(); // "Moves" mirror data from GPU to CPU (void, i.e., no retrun type). In // this moving of data back to Host, the layout is changed back to Layout From 1ca2a3caeae7be179b3281ba45b02cfa71f5f7c1 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 2 Mar 2021 14:32:51 -0800 Subject: [PATCH 055/124] INIT3 with Kokkos Views and custom types --- src/basic-kokkos/INIT3-Kokkos.cpp | 36 ++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/src/basic-kokkos/INIT3-Kokkos.cpp b/src/basic-kokkos/INIT3-Kokkos.cpp index 8bacd7adc..4111b36f2 100644 --- a/src/basic-kokkos/INIT3-Kokkos.cpp +++ b/src/basic-kokkos/INIT3-Kokkos.cpp @@ -26,6 +26,20 @@ void INIT3::runKokkosVariant(VariantID vid) INIT3_DATA_SETUP; + // Instantiating Views using getViewFromPointer for the INIT3 definition + // (i.e., INIT3.hpp) + + // The pointer is the first argument, and the last index, denoted by iend, is + // your second argument + // + auto out1_view = getViewFromPointer(out1, iend); + auto out2_view = getViewFromPointer(out2, iend); + auto out3_view = getViewFromPointer(out3, iend); + auto in1_view = getViewFromPointer(in1, iend); + auto in2_view = getViewFromPointer(in2, iend); + + // Next step, integrate the INIT3_BODY into the Kokkos parallel expression + auto init3_lam = [=](Index_type i) { INIT3_BODY; }; @@ -70,16 +84,23 @@ void INIT3::runKokkosVariant(VariantID vid) // Nota bene -- Conversion of Raja code begins here case Kokkos_Lambda : { + Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < 
run_reps; ++irep) { // RAJA::forall( // RAJA::RangeSegment(ibegin, iend), init3_lam); - // Kokkos translation - Kokkos::parallel_for("INIT3-KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - [=] (Index_type i) {INIT3_BODY}); + // Kokkos translation making INIT3_BODY explicit + Kokkos::parallel_for("INIT3-Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + //INIT3_BODY definition: + // out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ; + out1_view[i] = out2_view[i] = out3_view[i] = - in1_view[i] - in2_view[i]; + }); } + Kokkos::fence(); stopTimer(); break; @@ -94,6 +115,15 @@ void INIT3::runKokkosVariant(VariantID vid) #endif // RUN_KOKKOS + moveDataToHostFromKokkosView(out1, out1_view, iend); + moveDataToHostFromKokkosView(out2, out2_view, iend); + moveDataToHostFromKokkosView(out3, out3_view, iend); + moveDataToHostFromKokkosView(in1, in1_view, iend); + moveDataToHostFromKokkosView(in2, in2_view, iend); + + + + } } // end namespace basic From 86c12837238b18112bfe78f2b3fc360adde59877 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 4 Mar 2021 12:50:17 -0800 Subject: [PATCH 056/124] Re-write of basic kernels using KokkosViews --- src/basic-kokkos/ATOMIC_PI-Kokkos.cpp | 37 ++++++++++++++----- src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp | 36 +++++++++++++++--- .../INIT_VIEW1D_OFFSET-Kokkos.cpp | 27 ++++++++++++-- src/basic-kokkos/MULADDSUB-Kokkos.cpp | 37 ++++++++++++++++++- src/basic-kokkos/REDUCE3_INT-Kokkos.cpp | 4 ++ src/basic-kokkos/TRAP_INT-Kokkos.cpp | 5 +++ 6 files changed, 125 insertions(+), 21 deletions(-) diff --git a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp b/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp index 63e97ead2..45a8efa8e 100644 --- a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp +++ b/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp @@ -26,6 +26,11 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) ATOMIC_PI_DATA_SETUP; + // Declare Kokkos View that will wrap the pointer defined in ATOMIC_PI.hpp + auto pi_view = 
getViewFromPointer(pi ); + + + #if defined(RUN_KOKKOS) switch ( vid ) { @@ -72,26 +77,36 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) } case Kokkos_Lambda : { - + // Ensure all upstream calculations have been completed before starting + // the timer + Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - *pi = m_pi_init; + + // Here, making a pointer of pi defined in ATOMIC_PI.hpp; we will use a + // KokkosView instead + // *pi = m_pi_init; // RAJA::forall( RAJA::RangeSegment(ibegin, iend), // [=](Index_type i) { // double x = (double(i) + 0.5) * dx; // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); // }); - Kokkos::parallel_for("ATOMIC_PI-KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - [=] (Index_type i) { - double x = (double(i) + 0.5) * dx; - Kokkos::atomic_add(pi, dx / (1.0 + x * x)); - }); - - *pi *= 4.0; + Kokkos::parallel_for("ATOMIC_PI-Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + // Original ATOMIC_PI kernel reference implementation + // defined in ATOMIC_PI.hpp + double x = (double(i) + 0.5) * dx; + Kokkos::atomic_add(pi_view, dx / (1.0 + x * x)); + }); + + //*pi *= 4.0; + pi_view *= 4.0; } + + Kokkos::fence(); stopTimer(); break; @@ -104,6 +119,8 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) } #endif //RUN_KOKKOS + + moveDataToHostFromKokkosView(pi, pi_view, iend); } } // end namespace basic diff --git a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp index 3725f9ab4..2036476b4 100644 --- a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp @@ -26,6 +26,12 @@ void INIT_VIEW1D::runKokkosVariant(VariantID vid) INIT_VIEW1D_DATA_SETUP; + // Declare a Kokkos View that will be used to wrap a pointer + auto a_view = getViewFromPointer(a, iend); + + + + #if defined(RUN_KOKKOS) switch ( vid ) { @@ -68,22 +74,36 @@ void INIT_VIEW1D::runKokkosVariant(VariantID vid) // AJP began modificaiton 
here case Kokkos_Lambda : { - INIT_VIEW1D_VIEW_RAJA; + //INIT_VIEW1D_VIEW_RAJA; - auto initview1d_lam = [=](Index_type i) { + /* auto initview1d_lam = [=](Index_type i) { INIT_VIEW1D_BODY_RAJA; - }; - + + }; +*/ + // fence needed to ensure upstream operations are complete before timer + // start + Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { // RAJA::forall( // RAJA::RangeSegment(ibegin, iend), initview1d_lam); //Kokkos translation - Kokkos::parallel_for("INIT_VIEW1D_KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin,iend), - [=] (Index_type i) {INIT_VIEW1D_BODY_RAJA}); + Kokkos::parallel_for("INIT_VIEW1D_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin,iend), + KOKKOS_LAMBDA (Index_type i) { + //INIT_VIEW1D_BODY_RAJA + //Instead, use the INIT_VIEW1D_BODY definition + //with Kokkos View + //a[i] = (i+1) * v; + a_view[i] = (i + 1) * v; + + }); } + + Kokkos::fence(); stopTimer(); break; @@ -97,6 +117,10 @@ void INIT_VIEW1D::runKokkosVariant(VariantID vid) } #endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(a, a_view, iend); + + } } // end namespace basic diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp index 5d55c51c4..4e9de109c 100644 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp @@ -27,6 +27,8 @@ void INIT_VIEW1D_OFFSET::runKokkosVariant(VariantID vid) INIT_VIEW1D_OFFSET_DATA_SETUP; + auto a_view = getViewFromPointer(a, iend); + #if defined(RUN_KOKKOS) @@ -72,21 +74,34 @@ void INIT_VIEW1D_OFFSET::runKokkosVariant(VariantID vid) // case Kokkos_Lambda : { - INIT_VIEW1D_OFFSET_VIEW_RAJA; + //INIT_VIEW1D_OFFSET_VIEW_RAJA; - auto initview1doffset_lam = [=](Index_type i) { + /*auto initview1doffset_lam = [=](Index_type i) { INIT_VIEW1D_OFFSET_BODY_RAJA; }; +*/ + + // Set a fence + Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { // RAJA::forall( // 
RAJA::RangeSegment(ibegin, iend), initview1doffset_lam); - Kokkos::parallel_for("INIT_VIEW1D_OFFSET_KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), [=] (Index_type i) {INIT_VIEW1D_OFFSET_BODY_RAJA}); + Kokkos::parallel_for("INIT_VIEW1D_OFFSET_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA (Index_type i) { + //INIT_VIEW1D_OFFSET_BODY_RAJA + //Instead, use the INIT_VIEW1D_OFFSET_BODY + //definition: + //a[i-ibegin] = i * v; + a_view[i-ibegin] = i * v; + }); } + Kokkos::fence(); stopTimer(); break; @@ -100,6 +115,12 @@ void INIT_VIEW1D_OFFSET::runKokkosVariant(VariantID vid) } #endif // RUN_KOKKOS + + // Move data from Kokkos View back to Host + moveDataToHostFromKokkosView(a, a_view, iend); + + + } } // end namespace basic diff --git a/src/basic-kokkos/MULADDSUB-Kokkos.cpp b/src/basic-kokkos/MULADDSUB-Kokkos.cpp index 2605901a9..00e6b47e8 100644 --- a/src/basic-kokkos/MULADDSUB-Kokkos.cpp +++ b/src/basic-kokkos/MULADDSUB-Kokkos.cpp @@ -26,6 +26,17 @@ void MULADDSUB::runKokkosVariant(VariantID vid) MULADDSUB_DATA_SETUP; + + // Define Kokkos Views that will wrap pointers defined in MULADDSUB.hpp + auto out1_view = getViewFromPointer(out1, iend); + auto out2_view = getViewFromPointer(out2, iend); + auto out3_view = getViewFromPointer(out3, iend); + auto in1_view = getViewFromPointer(in1, iend); + auto in2_view = getViewFromPointer(in2, iend); + + + + auto mas_lam = [=](Index_type i) { MULADDSUB_BODY; }; @@ -68,6 +79,8 @@ void MULADDSUB::runKokkosVariant(VariantID vid) case Kokkos_Lambda : { + // Set fence to ensure upstream calculations have completed + Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -76,10 +89,21 @@ void MULADDSUB::runKokkosVariant(VariantID vid) // // Kokkos translation // If SIMD really matters , consider using Kokkos SIMD - Kokkos::parallel_for("MULTISUB-KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - [=] (Index_type i) {MULADDSUB_BODY}); + 
Kokkos::parallel_for("MULTISUB-KokkosSeq Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + //MULADDSUB_BODY definition: + //out1[i] = in1[i] * in2[i] ; + //out2[i] = in1[i] + in2[i] ; + //out3[i] = in1[i] - in2[i] ; + // WITH KOKKOS VIEWS + out1_view[i] = in1_view[i] * in2_view[i] ; + out2_view[i] = in1_view[i] + in2_view[i] ; + out3_view[i] = in1_view[i] - in2_view[i] ; + }); } + Kokkos::fence(); stopTimer(); break; @@ -92,6 +116,15 @@ void MULADDSUB::runKokkosVariant(VariantID vid) } #endif // RUN_KOKKOS + moveDataToHostFromKokkosView(out1, out1_view, iend); + moveDataToHostFromKokkosView(out2, out2_view, iend); + moveDataToHostFromKokkosView(out3, out3_view, iend); + moveDataToHostFromKokkosView(out3, out3_view, iend); + moveDataToHostFromKokkosView(in1, in1_view, iend); + moveDataToHostFromKokkosView(in2, in2_view, iend); + + + } } // end namespace basic diff --git a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp index 3253b9191..d44c6c396 100644 --- a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp +++ b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp @@ -27,6 +27,10 @@ void REDUCE3_INT::runKokkosVariant(VariantID vid) REDUCE3_INT_DATA_SETUP; + //Declare KokkosView that will wrap the pointer + + //auto + #if defined(RUN_KOKKOS) switch ( vid ) { diff --git a/src/basic-kokkos/TRAP_INT-Kokkos.cpp b/src/basic-kokkos/TRAP_INT-Kokkos.cpp index 49ba77c2a..e708f3a78 100644 --- a/src/basic-kokkos/TRAP_INT-Kokkos.cpp +++ b/src/basic-kokkos/TRAP_INT-Kokkos.cpp @@ -40,6 +40,11 @@ void TRAP_INT::runKokkosVariant(VariantID vid) TRAP_INT_DATA_SETUP; +// Declare KokkosViews that will wrap a pointer - not relevant in this case +// ...? 
+ + + #if defined(RUN_KOKKOS) switch ( vid ) { From 5c32d68ce438883ffc9d4065e52e7e92959bfc7d Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 4 Mar 2021 13:22:09 -0800 Subject: [PATCH 057/124] Rewrite of REDUCE3_INT using KokkosViews --- src/basic-kokkos/REDUCE3_INT-Kokkos.cpp | 34 +++++++++++++++---------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp index d44c6c396..65f62f821 100644 --- a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp +++ b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp @@ -29,7 +29,7 @@ void REDUCE3_INT::runKokkosVariant(VariantID vid) //Declare KokkosView that will wrap the pointer - //auto + auto vec_view = getViewFromPointer(vec, iend); #if defined(RUN_KOKKOS) @@ -90,6 +90,7 @@ void REDUCE3_INT::runKokkosVariant(VariantID vid) case Kokkos_Lambda : { + Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { /* @@ -107,6 +108,8 @@ void REDUCE3_INT::runKokkosVariant(VariantID vid) m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); */ // These values are initilized elsewhere by RPS + // These variables were declared to Kokkos-ify the parallel_reduce + // construct: Int_type max_value = m_vmax_init; Int_type min_value = m_vmin_init; Int_type sum = m_vsum_init; @@ -114,20 +117,23 @@ void REDUCE3_INT::runKokkosVariant(VariantID vid) // KOKKOS_LAMBDA IS A PRE-PROCESSOR DIRECTIVE; // It makes the capture clause on the lambda work for Host and Device - parallel_reduce("REDUCE3-KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - - [=](const int64_t i, Int_type& tl_max, Int_type& tl_min, Int_type& tl_sum){ - Int_type vec_i = vec[i]; - if (vec_i > tl_max) tl_max = vec_i; - if (vec_i < tl_min) tl_min = vec_i; - tl_sum += vec_i; - }, Kokkos::Max(max_value), Kokkos::Min(min_value), sum); - - m_vsum += static_cast(sum); - m_vmin = RAJA_MIN(m_vmin, static_cast(min_value)); - m_vmax = RAJA_MAX(m_vmax, static_cast(max_value)); + 
parallel_reduce("REDUCE3-Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(const int64_t i, Int_type& tl_max, Int_type& tl_min, Int_type& tl_sum){ + Int_type vec_i = vec_view[i]; + if (vec_i > tl_max) tl_max = vec_i; + if (vec_i < tl_min) tl_min = vec_i; + tl_sum += vec_i; + }, + Kokkos::Max(max_value), + Kokkos::Min(min_value), + sum); + m_vsum += static_cast(sum); + m_vmin = RAJA_MIN(m_vmin, static_cast(min_value)); + m_vmax = RAJA_MAX(m_vmax, static_cast(max_value)); } + Kokkos::fence(); stopTimer(); break; @@ -140,6 +146,8 @@ void REDUCE3_INT::runKokkosVariant(VariantID vid) } #endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(vec, vec_view, iend); } } // end namespace basic From 497741c985db9b57752ad9a2b655adc6aa871e7c Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 4 Mar 2021 14:52:44 -0800 Subject: [PATCH 058/124] Re-write of TRAP_INT with Kokkos::fences(); --- src/basic-kokkos/TRAP_INT-Kokkos.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/basic-kokkos/TRAP_INT-Kokkos.cpp b/src/basic-kokkos/TRAP_INT-Kokkos.cpp index e708f3a78..6dcacc809 100644 --- a/src/basic-kokkos/TRAP_INT-Kokkos.cpp +++ b/src/basic-kokkos/TRAP_INT-Kokkos.cpp @@ -95,6 +95,7 @@ void TRAP_INT::runKokkosVariant(VariantID vid) case Kokkos_Lambda : { + Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -114,13 +115,16 @@ void TRAP_INT::runKokkosVariant(VariantID vid) Real_type trap_integral_val = m_sumx_init; - Kokkos::parallel_reduce("TRAP_INT_KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - [=] (const int64_t i, Real_type& sumx) {TRAP_INT_BODY}, trap_integral_val - ); + Kokkos::parallel_reduce("TRAP_INT_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(const int64_t i, Real_type& sumx) {TRAP_INT_BODY}, + trap_integral_val + ); m_sumx += static_cast(trap_integral_val) * h; } + Kokkos::fence(); stopTimer(); break; From 
5ef80af3145e8929a4ccb0965a0a0aed96df2f82 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Mon, 8 Mar 2021 13:16:03 -0800 Subject: [PATCH 059/124] At least functional version of this --- src/basic-kokkos/ATOMIC_PI-Kokkos.cpp | 154 +++++++++++++------------- 1 file changed, 75 insertions(+), 79 deletions(-) diff --git a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp b/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp index 45a8efa8e..51feaacf0 100644 --- a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp +++ b/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp @@ -12,14 +12,10 @@ #include -namespace rajaperf -{ -namespace basic -{ +namespace rajaperf { +namespace basic { - -void ATOMIC_PI::runKokkosVariant(VariantID vid) -{ +void ATOMIC_PI::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getRunSize(); @@ -27,100 +23,100 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) ATOMIC_PI_DATA_SETUP; // Declare Kokkos View that will wrap the pointer defined in ATOMIC_PI.hpp - auto pi_view = getViewFromPointer(pi ); - - + auto pi_view = getViewFromPointer(pi, 1); #if defined(RUN_KOKKOS) - switch ( vid ) { + switch (vid) { - case Base_Seq : { + case Base_Seq: { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - *pi = m_pi_init; - for (Index_type i = ibegin; i < iend; ++i ) { - double x = (double(i) + 0.5) * dx; - *pi += dx / (1.0 + x * x); - } - *pi *= 4.0; + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + *pi = m_pi_init; + for (Index_type i = ibegin; i < iend; ++i) { + double x = (double(i) + 0.5) * dx; + *pi += dx / (1.0 + x * x); } - stopTimer(); - - break; + *pi *= 4.0; } + stopTimer(); -#if defined(RUN_RAJA_SEQ) - case Lambda_Seq : { + break; + } - auto atomicpi_base_lam = [=](Index_type i) { - double x = (double(i) + 0.5) * dx; - *pi += dx / (1.0 + x * x); - }; +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq: { - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) 
{ + auto atomicpi_base_lam = [=](Index_type i) { + double x = (double(i) + 0.5) * dx; + *pi += dx / (1.0 + x * x); + }; - *pi = m_pi_init; - for (Index_type i = ibegin; i < iend; ++i ) { - atomicpi_base_lam(i); - } - *pi *= 4.0; + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + *pi = m_pi_init; + for (Index_type i = ibegin; i < iend; ++i) { + atomicpi_base_lam(i); } - stopTimer(); - - break; + *pi *= 4.0; } + stopTimer(); - case Kokkos_Lambda : { - // Ensure all upstream calculations have been completed before starting - // the timer - Kokkos::fence(); - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - // Here, making a pointer of pi defined in ATOMIC_PI.hpp; we will use a - // KokkosView instead - // *pi = m_pi_init; -// RAJA::forall( RAJA::RangeSegment(ibegin, iend), -// [=](Index_type i) { -// double x = (double(i) + 0.5) * dx; -// RAJA::atomicAdd(pi, dx / (1.0 + x * x)); -// }); - - Kokkos::parallel_for("ATOMIC_PI-Kokkos Kokkos_Lambda", - Kokkos::RangePolicy(ibegin, iend), - KOKKOS_LAMBDA(Index_type i) { - // Original ATOMIC_PI kernel reference implementation - // defined in ATOMIC_PI.hpp - double x = (double(i) + 0.5) * dx; - Kokkos::atomic_add(pi_view, dx / (1.0 + x * x)); - }); - - //*pi *= 4.0; - pi_view *= 4.0; - - } - - Kokkos::fence(); - stopTimer(); + break; + } - break; + case Kokkos_Lambda: { + // Ensure all upstream calculations have been completed before starting + // the timer + Kokkos::fence(); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Here, making a pointer of pi defined in ATOMIC_PI.hpp; we will use a + // KokkosView instead + // *pi = m_pi_init; + // RAJA::forall( RAJA::RangeSegment(ibegin, iend), + // [=](Index_type i) { + // double x = (double(i) + 0.5) * dx; + // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + // }); + *pi = m_pi_init; + auto pi_view = getViewFromPointer(pi, 1); + + Kokkos::parallel_for( + "ATOMIC_PI-Kokkos Kokkos_Lambda", + 
Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + // Original ATOMIC_PI kernel reference implementation + // defined in ATOMIC_PI.hpp + double x = (double(i) + 0.5) * dx; + Kokkos::atomic_add(&pi_view(0), dx / (1.0 + x * x)); + }); + + moveDataToHostFromKokkosView(pi, pi_view, 1); + *pi *= 4.0; + //*m_pi += *pi; + //*pi *= 4.0; + // pi_view *= 4.0; } -#endif //RUN_RAJA_SEQ - default : { - std::cout << "\n ATOMIC_PI : Unknown variant id = " << vid << std::endl; - } + Kokkos::fence(); + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + default: { + std::cout << "\n ATOMIC_PI : Unknown variant id = " << vid << std::endl; + } } -#endif //RUN_KOKKOS +#endif // RUN_KOKKOS - moveDataToHostFromKokkosView(pi, pi_view, iend); + // moveDataToHostFromKokkosView(pi, pi_view, 1); } } // end namespace basic From 0be15e484df6a2559c139ef72112b8a7623e6a74 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 11 Mar 2021 07:38:00 -0800 Subject: [PATCH 060/124] OpenMPTarget build --- CMakeLists.txt | 13 +- src/CMakeLists.txt | 231 +++++++++--------- src/basic-kokkos/ATOMIC_PI-Kokkos.cpp | 4 +- src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp.bck | 85 ------- src/basic-kokkos/DAXPY-Kokkos.cpp | 2 - src/basic-kokkos/IF_QUAD-Kokkos.cpp | 2 - src/basic-kokkos/INIT3-Kokkos.cpp | 2 - src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp | 2 - .../INIT_VIEW1D_OFFSET-Kokkos.cpp | 2 - src/basic-kokkos/MULADDSUB-Kokkos.cpp | 2 - src/basic-kokkos/NESTED_INIT-Kokkos.cpp | 2 - src/basic-kokkos/REDUCE3_INT-Kokkos.cpp | 7 +- src/basic-kokkos/TRAP_INT-Kokkos.cpp | 2 - src/basic/ATOMIC_PI-OMPTarget.cpp | 10 +- src/basic/ATOMIC_PI.cpp | 1 + src/basic/DAXPY-OMPTarget.cpp | 8 +- src/basic/IF_QUAD-OMPTarget.cpp | 8 +- src/basic/INIT3-OMPTarget.cpp | 8 +- src/basic/INIT_VIEW1D-OMPTarget.cpp | 8 +- src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp | 8 +- src/basic/MULADDSUB-OMPTarget.cpp | 8 +- src/basic/NESTED_INIT-OMPTarget.cpp | 26 +- src/basic/REDUCE3_INT-OMPTarget.cpp | 26 +- 
src/basic/TRAP_INT-OMPTarget.cpp | 12 +- src/common/RAJAPerfSuite.cpp | 4 +- src/common/RAJAPerfSuite.hpp | 3 +- 26 files changed, 202 insertions(+), 284 deletions(-) delete mode 100644 src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp.bck diff --git a/CMakeLists.txt b/CMakeLists.txt index c3aa9d8dc..6ae05b0d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,7 +55,6 @@ add_subdirectory(tpl/RAJA) get_property(RAJA_INCLUDE_DIRS DIRECTORY tpl/RAJA PROPERTY INCLUDE_DIRECTORIES) include_directories(${RAJA_INCLUDE_DIRS}) - # # Setup variables to pass to Perf suite # @@ -118,6 +117,18 @@ set (HIP_HIPCC_FLAGS ${RAJA_HIPCC_FLAGS}) # ENABLE KOKKOS IS A RAJA PERFSUITE OPTION if(ENABLE_KOKKOS) add_definitions(-DRUN_KOKKOS) + if(ENABLE_TARGET_OPENMP) + set(Kokkos_ENABLE_OPENMPTARGET ON CACHE BOOL "Docstring") + set(Kokkos_ARCH_VOLTA70 ON CACHE BOOL "Docstring") #TODO: better + set(CMAKE_CXX_STANDARD 17) + set(RAJA_ENABLE_TARGET_OPENMP ON CACHE BOOL "Docstring") + if(NOT CMAKE_BUILD_TYPE MATCHES Debug) + if(NOT EXPERIMENTAL_BUILD) + message(FATAL_ERROR "Kokkos builds with OpenMPTarget require a Debug build to succeed at the moment. Rebuild with CMAKE_BUILD_TYPE=Debug. 
If you're a compiler developer, rebuild with -DEXPERIMENTAL_BUILD=ON") + endif() + endif() + #add_definitions(-DRAJA_ENABLE_TARGET_OPENMP) + endif() # ENABLE_CUDA IS A RAJA PERFSUITE OPTION if(ENABLE_CUDA) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dd18ad6fd..ac19adb6a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,33 +31,34 @@ list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) if(ENABLE_TARGET_OPENMP) remove_definitions(-DRUN_RAJA_SEQ -DRUN_OPENMP ) + include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/basic) blt_add_executable( NAME raja-perf-omptarget.exe SOURCES RAJAPerfSuiteDriver.cpp - apps/AppsData.cpp - apps/DEL_DOT_VEC_2D.cpp - apps/DEL_DOT_VEC_2D-Seq.cpp - apps/DEL_DOT_VEC_2D-OMPTarget.cpp - apps/ENERGY.cpp - apps/ENERGY-Seq.cpp - apps/ENERGY-OMPTarget.cpp - apps/FIR.cpp - apps/FIR-Seq.cpp - apps/FIR-OMPTarget.cpp - apps/PRESSURE.cpp - apps/PRESSURE-Seq.cpp - apps/PRESSURE-OMPTarget.cpp - apps/LTIMES.cpp - apps/LTIMES-Seq.cpp - apps/LTIMES-OMPTarget.cpp - apps/LTIMES_NOVIEW.cpp - apps/LTIMES_NOVIEW-Seq.cpp - apps/LTIMES_NOVIEW-OMPTarget.cpp - apps/VOL3D.cpp - apps/VOL3D-Seq.cpp - apps/VOL3D-OMPTarget.cpp - apps/WIP-COUPLE.cpp + #apps/AppsData.cpp + #apps/DEL_DOT_VEC_2D.cpp + #apps/DEL_DOT_VEC_2D-Seq.cpp + #apps/DEL_DOT_VEC_2D-OMPTarget.cpp + #apps/ENERGY.cpp + #apps/ENERGY-Seq.cpp + #apps/ENERGY-OMPTarget.cpp + #apps/FIR.cpp + #apps/FIR-Seq.cpp + #apps/FIR-OMPTarget.cpp + #apps/PRESSURE.cpp + #apps/PRESSURE-Seq.cpp + #apps/PRESSURE-OMPTarget.cpp + #apps/LTIMES.cpp + #apps/LTIMES-Seq.cpp + #apps/LTIMES-OMPTarget.cpp + #apps/LTIMES_NOVIEW.cpp + #apps/LTIMES_NOVIEW-Seq.cpp + #apps/LTIMES_NOVIEW-OMPTarget.cpp + #apps/VOL3D.cpp + #apps/VOL3D-Seq.cpp + #apps/VOL3D-OMPTarget.cpp + #apps/WIP-COUPLE.cpp basic/ATOMIC_PI.cpp basic/ATOMIC_PI-Seq.cpp basic/ATOMIC_PI-OMPTarget.cpp @@ -88,93 +89,103 @@ blt_add_executable( basic/TRAP_INT.cpp basic/TRAP_INT-Seq.cpp basic/TRAP_INT-OMPTarget.cpp - 
lcals/DIFF_PREDICT.cpp - lcals/DIFF_PREDICT-Seq.cpp - lcals/DIFF_PREDICT-OMPTarget.cpp - lcals/EOS.cpp - lcals/EOS-Seq.cpp - lcals/EOS-OMPTarget.cpp - lcals/FIRST_DIFF.cpp - lcals/FIRST_DIFF-Seq.cpp - lcals/FIRST_DIFF-OMPTarget.cpp - lcals/FIRST_MIN.cpp - lcals/FIRST_MIN-Seq.cpp - lcals/FIRST_MIN-OMPTarget.cpp - lcals/FIRST_SUM.cpp - lcals/FIRST_SUM-Seq.cpp - lcals/FIRST_SUM-OMPTarget.cpp - lcals/GEN_LIN_RECUR.cpp - lcals/GEN_LIN_RECUR-Seq.cpp - lcals/GEN_LIN_RECUR-OMPTarget.cpp - lcals/HYDRO_1D.cpp - lcals/HYDRO_1D-Seq.cpp - lcals/HYDRO_1D-OMPTarget.cpp - lcals/HYDRO_2D.cpp - lcals/HYDRO_2D-Seq.cpp - lcals/HYDRO_2D-OMPTarget.cpp - lcals/INT_PREDICT.cpp - lcals/INT_PREDICT-Seq.cpp - lcals/INT_PREDICT-OMPTarget.cpp - lcals/PLANCKIAN.cpp - lcals/PLANCKIAN-Seq.cpp - lcals/PLANCKIAN-OMPTarget.cpp - lcals/TRIDIAG_ELIM.cpp - lcals/TRIDIAG_ELIM-Seq.cpp - lcals/TRIDIAG_ELIM-OMPTarget.cpp - polybench/POLYBENCH_2MM.cpp - polybench/POLYBENCH_2MM-Seq.cpp - polybench/POLYBENCH_2MM-OMPTarget.cpp - polybench/POLYBENCH_3MM.cpp - polybench/POLYBENCH_3MM-Seq.cpp - polybench/POLYBENCH_3MM-OMPTarget.cpp - polybench/POLYBENCH_ADI.cpp - polybench/POLYBENCH_ADI-Seq.cpp - polybench/POLYBENCH_ADI-OMPTarget.cpp - polybench/POLYBENCH_ATAX.cpp - polybench/POLYBENCH_ATAX-Seq.cpp - polybench/POLYBENCH_ATAX-OMPTarget.cpp - polybench/POLYBENCH_FDTD_2D.cpp - polybench/POLYBENCH_FDTD_2D-Seq.cpp - polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp - polybench/POLYBENCH_FLOYD_WARSHALL.cpp - polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp - polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp - polybench/POLYBENCH_GEMM.cpp - polybench/POLYBENCH_GEMM-Seq.cpp - polybench/POLYBENCH_GEMM-OMPTarget.cpp - polybench/POLYBENCH_GEMVER.cpp - polybench/POLYBENCH_GEMVER-Seq.cpp - polybench/POLYBENCH_GEMVER-OMPTarget.cpp - polybench/POLYBENCH_GESUMMV.cpp - polybench/POLYBENCH_GESUMMV-Seq.cpp - polybench/POLYBENCH_GESUMMV-OMPTarget.cpp - polybench/POLYBENCH_HEAT_3D.cpp - polybench/POLYBENCH_HEAT_3D-Seq.cpp - 
polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp - polybench/POLYBENCH_JACOBI_1D.cpp - polybench/POLYBENCH_JACOBI_1D-Seq.cpp - polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp - polybench/POLYBENCH_JACOBI_2D.cpp - polybench/POLYBENCH_JACOBI_2D-Seq.cpp - polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp - polybench/POLYBENCH_MVT.cpp - polybench/POLYBENCH_MVT-Seq.cpp - polybench/POLYBENCH_MVT-OMPTarget.cpp - stream/ADD.cpp - stream/ADD-Seq.cpp - stream/ADD-OMPTarget.cpp - stream/COPY.cpp - stream/COPY-Seq.cpp - stream/COPY-OMPTarget.cpp - stream/DOT.cpp - stream/DOT-Seq.cpp - stream/DOT-OMPTarget.cpp - stream/MUL.cpp - stream/MUL-Seq.cpp - stream/MUL-OMPTarget.cpp - stream/TRIAD.cpp - stream/TRIAD-Seq.cpp - stream/TRIAD-OMPTarget.cpp + basic-kokkos/ATOMIC_PI-Kokkos.cpp + basic-kokkos/DAXPY-Kokkos.cpp + basic-kokkos/IF_QUAD-Kokkos.cpp + basic-kokkos/INIT3-Kokkos.cpp + basic-kokkos/INIT_VIEW1D-Kokkos.cpp + basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp + basic-kokkos/MULADDSUB-Kokkos.cpp + basic-kokkos/NESTED_INIT-Kokkos.cpp + basic-kokkos/REDUCE3_INT-Kokkos.cpp + basic-kokkos/TRAP_INT-Kokkos.cpp + #lcals/DIFF_PREDICT.cpp + #lcals/DIFF_PREDICT-Seq.cpp + #lcals/DIFF_PREDICT-OMPTarget.cpp + #lcals/EOS.cpp + #lcals/EOS-Seq.cpp + #lcals/EOS-OMPTarget.cpp + #lcals/FIRST_DIFF.cpp + #lcals/FIRST_DIFF-Seq.cpp + #lcals/FIRST_DIFF-OMPTarget.cpp + #lcals/FIRST_MIN.cpp + #lcals/FIRST_MIN-Seq.cpp + #lcals/FIRST_MIN-OMPTarget.cpp + #lcals/FIRST_SUM.cpp + #lcals/FIRST_SUM-Seq.cpp + #lcals/FIRST_SUM-OMPTarget.cpp + #lcals/GEN_LIN_RECUR.cpp + #lcals/GEN_LIN_RECUR-Seq.cpp + #lcals/GEN_LIN_RECUR-OMPTarget.cpp + #lcals/HYDRO_1D.cpp + #lcals/HYDRO_1D-Seq.cpp + #lcals/HYDRO_1D-OMPTarget.cpp + #lcals/HYDRO_2D.cpp + #lcals/HYDRO_2D-Seq.cpp + #lcals/HYDRO_2D-OMPTarget.cpp + #lcals/INT_PREDICT.cpp + #lcals/INT_PREDICT-Seq.cpp + #lcals/INT_PREDICT-OMPTarget.cpp + #lcals/PLANCKIAN.cpp + #lcals/PLANCKIAN-Seq.cpp + #lcals/PLANCKIAN-OMPTarget.cpp + #lcals/TRIDIAG_ELIM.cpp + #lcals/TRIDIAG_ELIM-Seq.cpp + 
#lcals/TRIDIAG_ELIM-OMPTarget.cpp + #polybench/POLYBENCH_2MM.cpp + #polybench/POLYBENCH_2MM-Seq.cpp + #polybench/POLYBENCH_2MM-OMPTarget.cpp + #polybench/POLYBENCH_3MM.cpp + #polybench/POLYBENCH_3MM-Seq.cpp + #polybench/POLYBENCH_3MM-OMPTarget.cpp + #polybench/POLYBENCH_ADI.cpp + #polybench/POLYBENCH_ADI-Seq.cpp + #polybench/POLYBENCH_ADI-OMPTarget.cpp + #polybench/POLYBENCH_ATAX.cpp + #polybench/POLYBENCH_ATAX-Seq.cpp + #polybench/POLYBENCH_ATAX-OMPTarget.cpp + #polybench/POLYBENCH_FDTD_2D.cpp + #polybench/POLYBENCH_FDTD_2D-Seq.cpp + #polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp + #polybench/POLYBENCH_FLOYD_WARSHALL.cpp + #polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp + #polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp + #polybench/POLYBENCH_GEMM.cpp + #polybench/POLYBENCH_GEMM-Seq.cpp + #polybench/POLYBENCH_GEMM-OMPTarget.cpp + #polybench/POLYBENCH_GEMVER.cpp + #polybench/POLYBENCH_GEMVER-Seq.cpp + #polybench/POLYBENCH_GEMVER-OMPTarget.cpp + #polybench/POLYBENCH_GESUMMV.cpp + #polybench/POLYBENCH_GESUMMV-Seq.cpp + #polybench/POLYBENCH_GESUMMV-OMPTarget.cpp + #polybench/POLYBENCH_HEAT_3D.cpp + #polybench/POLYBENCH_HEAT_3D-Seq.cpp + #polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp + #polybench/POLYBENCH_JACOBI_1D.cpp + #polybench/POLYBENCH_JACOBI_1D-Seq.cpp + #polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp + #polybench/POLYBENCH_JACOBI_2D.cpp + #polybench/POLYBENCH_JACOBI_2D-Seq.cpp + #polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp + #polybench/POLYBENCH_MVT.cpp + #polybench/POLYBENCH_MVT-Seq.cpp + #polybench/POLYBENCH_MVT-OMPTarget.cpp + #stream/ADD.cpp + #stream/ADD-Seq.cpp + #stream/ADD-OMPTarget.cpp + #stream/COPY.cpp + #stream/COPY-Seq.cpp + #stream/COPY-OMPTarget.cpp + #stream/DOT.cpp + #stream/DOT-Seq.cpp + #stream/DOT-OMPTarget.cpp + #stream/MUL.cpp + #stream/MUL-Seq.cpp + #stream/MUL-OMPTarget.cpp + #stream/TRIAD.cpp + #stream/TRIAD-Seq.cpp + #stream/TRIAD-OMPTarget.cpp common/DataUtils.cpp common/Executor.cpp common/KernelBase.cpp diff --git 
a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp b/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp index 51feaacf0..da2e4ee90 100644 --- a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp +++ b/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp @@ -46,7 +46,6 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) { break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq: { auto atomicpi_base_lam = [=](Index_type i) { @@ -84,7 +83,7 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) { // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); // }); *pi = m_pi_init; - auto pi_view = getViewFromPointer(pi, 1); + pi_view = getViewFromPointer(pi, 1); Kokkos::parallel_for( "ATOMIC_PI-Kokkos Kokkos_Lambda", @@ -108,7 +107,6 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) { break; } -#endif // RUN_RAJA_SEQ default: { std::cout << "\n ATOMIC_PI : Unknown variant id = " << vid << std::endl; diff --git a/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp.bck b/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp.bck deleted file mode 100644 index d6cd9a1e6..000000000 --- a/src/basic-kokkos/ATOMIC_PI-KokkosOMP.cpp.bck +++ /dev/null @@ -1,85 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "ATOMIC_PI.hpp" - -#include "RAJA/RAJA.hpp" - -#include - -namespace rajaperf -{ -namespace basic -{ -struct AtomicPIFunctor { - Real_type dx; - Real_ptr pi; - - AtomicPIFunctor(Real_type m_dx, Real_ptr m_pi) : ATOMIC_PI_FUNCTOR_CONSTRUCT {} -}; - - -void ATOMIC_PI::runKokkosOpenMPVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getRunSize(); - - ATOMIC_PI_DATA_SETUP; - -#if defined(RUN_KOKKOS) && defined(RUN_OPENMP) - switch ( vid ) { - - case Kokkos_Functor_OpenMP : { - - startTimer(); - //for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - // *pi = m_pi_init; - // RAJA::forall( - // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - // double x = (double(i) + 0.5) * dx; - // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - // }); - // *pi *= 4.0; - - //} - stopTimer(); - - break; - } - case Kokkos_Lambda_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - *pi = m_pi_init; - - Kokkos::parallel_for("name",Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(Index_type i){ - double x = ((double(i) + 0.5) * dx); - Kokkos::atomic_add(pi, dx / (1.0 + x * x)); - }); - *pi *= 4.0; - } - stopTimer(); - - break; - } - - - default : { - std::cout << "\n ATOMIC_PI : Unknown variant id = " << vid << std::endl; - } - - } - -#endif -} - -} // end namespace basic -} // end namespace rajaperf diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp index f9b424afd..754700d1c 100644 --- a/src/basic-kokkos/DAXPY-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY-Kokkos.cpp @@ -56,7 +56,6 @@ void DAXPY::runKokkosVariant(VariantID vid) switch ( vid ) { -#if defined(RUN_RAJA_SEQ) case Kokkos_Lambda: { Kokkos::fence(); @@ -92,7 +91,6 @@ void DAXPY::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ 
default : { std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; } diff --git a/src/basic-kokkos/IF_QUAD-Kokkos.cpp b/src/basic-kokkos/IF_QUAD-Kokkos.cpp index 2377d2599..29b2a2e25 100644 --- a/src/basic-kokkos/IF_QUAD-Kokkos.cpp +++ b/src/basic-kokkos/IF_QUAD-Kokkos.cpp @@ -51,7 +51,6 @@ void IF_QUAD::runKokkosVariant(VariantID vid) -#if defined(RUN_RAJA_SEQ) case Kokkos_Lambda : { @@ -88,7 +87,6 @@ void IF_QUAD::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ default : { std::cout << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; diff --git a/src/basic-kokkos/INIT3-Kokkos.cpp b/src/basic-kokkos/INIT3-Kokkos.cpp index 4111b36f2..08b67eb57 100644 --- a/src/basic-kokkos/INIT3-Kokkos.cpp +++ b/src/basic-kokkos/INIT3-Kokkos.cpp @@ -63,7 +63,6 @@ void INIT3::runKokkosVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { @@ -105,7 +104,6 @@ void INIT3::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ default : { std::cout << "\n INIT3 : Unknown variant id = " << vid << std::endl; diff --git a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp index 2036476b4..efba110c1 100644 --- a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp @@ -51,7 +51,6 @@ void INIT_VIEW1D::runKokkosVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { auto initview1d_base_lam = [=](Index_type i) { @@ -108,7 +107,6 @@ void INIT_VIEW1D::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ default : { std::cout << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp index 4e9de109c..5f010597b 100644 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp @@ -50,7 +50,6 @@ void INIT_VIEW1D_OFFSET::runKokkosVariant(VariantID vid) break; } -#if 
defined(RUN_RAJA_SEQ) case Lambda_Seq : { auto initview1doffset_base_lam = [=](Index_type i) { @@ -106,7 +105,6 @@ void INIT_VIEW1D_OFFSET::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ default : { std::cout << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; diff --git a/src/basic-kokkos/MULADDSUB-Kokkos.cpp b/src/basic-kokkos/MULADDSUB-Kokkos.cpp index 00e6b47e8..9efcc2c39 100644 --- a/src/basic-kokkos/MULADDSUB-Kokkos.cpp +++ b/src/basic-kokkos/MULADDSUB-Kokkos.cpp @@ -61,7 +61,6 @@ void MULADDSUB::runKokkosVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); @@ -108,7 +107,6 @@ void MULADDSUB::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ default : { std::cout << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; diff --git a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp index da92d320f..4dee1a560 100644 --- a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp +++ b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp @@ -46,7 +46,6 @@ void NESTED_INIT::runKokkosVariant(VariantID vid) { break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq: { startTimer(); @@ -116,7 +115,6 @@ void NESTED_INIT::runKokkosVariant(VariantID vid) { break; } -#endif // RUN_RAJA_SEQ default: { std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; diff --git a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp index 65f62f821..ef15833ee 100644 --- a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp +++ b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp @@ -58,7 +58,6 @@ void REDUCE3_INT::runKokkosVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { auto init3_base_lam = [=](Index_type i) -> Int_type { @@ -110,13 +109,12 @@ void REDUCE3_INT::runKokkosVariant(VariantID vid) // These values are initilized elsewhere by RPS // These variables were declared to Kokkos-ify the parallel_reduce // construct: +#ifndef 
RAJA_ENABLE_TARGET_OPENMP Int_type max_value = m_vmax_init; Int_type min_value = m_vmin_init; Int_type sum = m_vsum_init; - // KOKKOS_LAMBDA IS A PRE-PROCESSOR DIRECTIVE; - // It makes the capture clause on the lambda work for Host and Device parallel_reduce("REDUCE3-Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(const int64_t i, Int_type& tl_max, Int_type& tl_min, Int_type& tl_sum){ @@ -131,14 +129,13 @@ void REDUCE3_INT::runKokkosVariant(VariantID vid) m_vsum += static_cast(sum); m_vmin = RAJA_MIN(m_vmin, static_cast(min_value)); m_vmax = RAJA_MAX(m_vmax, static_cast(max_value)); - +#endif } Kokkos::fence(); stopTimer(); break; } -#endif // RUN_RAJA_SEQ default : { std::cout << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; diff --git a/src/basic-kokkos/TRAP_INT-Kokkos.cpp b/src/basic-kokkos/TRAP_INT-Kokkos.cpp index e708f3a78..edcc27a89 100644 --- a/src/basic-kokkos/TRAP_INT-Kokkos.cpp +++ b/src/basic-kokkos/TRAP_INT-Kokkos.cpp @@ -68,7 +68,6 @@ void TRAP_INT::runKokkosVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { auto trapint_base_lam = [=](Index_type i) -> Real_type { @@ -125,7 +124,6 @@ void TRAP_INT::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ default : { std::cout << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; diff --git a/src/basic/ATOMIC_PI-OMPTarget.cpp b/src/basic/ATOMIC_PI-OMPTarget.cpp index 578b5ed99..d901395d3 100644 --- a/src/basic/ATOMIC_PI-OMPTarget.cpp +++ b/src/basic/ATOMIC_PI-OMPTarget.cpp @@ -78,11 +78,11 @@ void ATOMIC_PI::runOpenMPTargetVariant(VariantID vid) initOpenMPDeviceData(pi, &m_pi_init, 1, did, hid); - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - double x = (double(i) + 0.5) * dx; - RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - }); + //RAJA::forall>( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // double x = (double(i) + 0.5) * dx; + // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); + //}); 
getOpenMPDeviceData(m_pi, pi, 1, hid, did); *m_pi *= 4.0; diff --git a/src/basic/ATOMIC_PI.cpp b/src/basic/ATOMIC_PI.cpp index 83010da16..ea907f8e1 100644 --- a/src/basic/ATOMIC_PI.cpp +++ b/src/basic/ATOMIC_PI.cpp @@ -59,6 +59,7 @@ void ATOMIC_PI::setUp(VariantID vid) void ATOMIC_PI::updateChecksum(VariantID vid) { + std::cout << "Value is "<<*m_pi<>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - DAXPY_BODY; - }); + //RAJA::forall>( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // DAXPY_BODY; + //}); } stopTimer(); diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp index 5c6a261d9..1e1b951e9 100644 --- a/src/basic/IF_QUAD-OMPTarget.cpp +++ b/src/basic/IF_QUAD-OMPTarget.cpp @@ -78,10 +78,10 @@ void IF_QUAD::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - IF_QUAD_BODY; - }); + //RAJA::forall>( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // IF_QUAD_BODY; + //}); } stopTimer(); diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp index 0c31a1806..0d8f4e8bf 100644 --- a/src/basic/INIT3-OMPTarget.cpp +++ b/src/basic/INIT3-OMPTarget.cpp @@ -80,10 +80,10 @@ void INIT3::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - INIT3_BODY; - }); + //RAJA::forall>( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // INIT3_BODY; + //}); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp index 2426b71e4..e0c80e416 100644 --- a/src/basic/INIT_VIEW1D-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp @@ -72,10 +72,10 @@ void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - 
RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - INIT_VIEW1D_BODY_RAJA; - }); + //RAJA::forall>( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // INIT_VIEW1D_BODY_RAJA; + //}); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp index 9468ce57e..1fa9ecb81 100644 --- a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp @@ -72,10 +72,10 @@ void INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - INIT_VIEW1D_OFFSET_BODY_RAJA; - }); + //RAJA::forall>( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // INIT_VIEW1D_OFFSET_BODY_RAJA; + //}); } stopTimer(); diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp index 02e1a1c59..55b449aea 100644 --- a/src/basic/MULADDSUB-OMPTarget.cpp +++ b/src/basic/MULADDSUB-OMPTarget.cpp @@ -80,10 +80,10 @@ void MULADDSUB::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - MULADDSUB_BODY; - }); + //RAJA::forall>( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // MULADDSUB_BODY; + //}); } stopTimer(); diff --git a/src/basic/NESTED_INIT-OMPTarget.cpp b/src/basic/NESTED_INIT-OMPTarget.cpp index 00cb56755..403c68d75 100644 --- a/src/basic/NESTED_INIT-OMPTarget.cpp +++ b/src/basic/NESTED_INIT-OMPTarget.cpp @@ -64,23 +64,23 @@ void NESTED_INIT::runOpenMPTargetVariant(VariantID vid) NESTED_INIT_DATA_SETUP_OMP_TARGET; - using EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::Collapse, // k, j, i - RAJA::statement::Lambda<0> - > - >; + //using EXEC_POL = + // RAJA::KernelPolicy< + // RAJA::statement::Collapse, // k, j, i + // RAJA::statement::Lambda<0> + // > + // >; startTimer(); for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk)), - [=](Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - }); + // RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, ni), + // RAJA::RangeSegment(0, nj), + // RAJA::RangeSegment(0, nk)), + // [=](Index_type i, Index_type j, Index_type k) { + // NESTED_INIT_BODY; + // }); } stopTimer(); diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp index ec8325737..1de222ee3 100644 --- a/src/basic/REDUCE3_INT-OMPTarget.cpp +++ b/src/basic/REDUCE3_INT-OMPTarget.cpp @@ -80,19 +80,19 @@ void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum vsum(m_vsum_init); - RAJA::ReduceMin vmin(m_vmin_init); - RAJA::ReduceMax vmax(m_vmax_init); - - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), - [=](Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + // RAJA::ReduceSum vsum(m_vsum_init); + // RAJA::ReduceMin vmin(m_vmin_init); + // RAJA::ReduceMax vmax(m_vmax_init); + + // RAJA::forall>( + // RAJA::RangeSegment(ibegin, iend), + // [=](Index_type i) { + // REDUCE3_INT_BODY_RAJA; + // }); + + // m_vsum += static_cast(vsum.get()); + // m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + // m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); } stopTimer(); diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp index 2ce375669..fc4a2e116 100644 --- a/src/basic/TRAP_INT-OMPTarget.cpp +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -86,14 +86,14 @@ void TRAP_INT::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::ReduceSum sumx(m_sumx_init); + //RAJA::ReduceSum 
sumx(m_sumx_init); - RAJA::forall>( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - TRAP_INT_BODY; - }); + //RAJA::forall>( + // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + // TRAP_INT_BODY; + //}); - m_sumx += static_cast(sumx.get()) * h; + //m_sumx += static_cast(sumx.get()) * h; } stopTimer(); diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 64b93d282..5b6feba6d 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -309,6 +309,8 @@ bool isVariantAvailable(VariantID vid) vid == RAJA_Seq ) { ret_val = true; } +#endif + #if defined(RUN_KOKKOS) if ( vid == Kokkos_Lambda || vid == Kokkos_Functor ) { @@ -316,8 +318,6 @@ bool isVariantAvailable(VariantID vid) } #endif // RUN_KOKKOS -#endif - #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) if ( vid == Base_OpenMP || vid == Lambda_OpenMP || diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index eaad2efbb..7c8d232fa 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -190,7 +190,7 @@ void moveDataToHostFromKokkosView(PointedAt *kokkos_ptr, ExistingView my_view, using device_view_type = typename Kokkos::View< typename PointerOfNdimensions::type, typename Kokkos::DefaultExecutionSpace::memory_space>; - + // When copying data, we can either change the Layout or the memory_space // (host or device), but we cannot change both! 
// Here, we are mirroring data on the host to the device, i.e., Layout is @@ -210,6 +210,7 @@ void moveDataToHostFromKokkosView(PointedAt *kokkos_ptr, ExistingView my_view, // Layout is optimal for gpu, but located on CPU mirror_view_type cpu_to_gpu_mirror = Kokkos::create_mirror_view(my_view); + //auto mirror_view_type cpu_to_gpu_mirror = Kokkos::create_mirror_view(my_view); // We need to deep_copy our existing data, the contents of // pointer_holder, into the mirror_view; From 5af2f061311c7077c877ea5d7eb9ebd6e503c24f Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 11 Mar 2021 09:22:05 -0800 Subject: [PATCH 061/124] ATOMIC_PI notes added to explain changes --- src/basic-kokkos/ATOMIC_PI-Kokkos.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp b/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp index 51feaacf0..d6a761250 100644 --- a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp +++ b/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp @@ -83,8 +83,16 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) { // double x = (double(i) + 0.5) * dx; // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); // }); + // + // Initializing a value, pi, on the host *pi = m_pi_init; - auto pi_view = getViewFromPointer(pi, 1); + // This is an assignment statement! Not a declaration. + // David made this assignment because of the structure of the + // computation. 
+ // We're moving the data in the pointer to the device (GPU) + // IT IS IMPORTANT TO REALISE WHEN YOUR VARIABLE / DATA ARE BEING + // REINITIALIZED + pi_view = getViewFromPointer(pi, 1); Kokkos::parallel_for( "ATOMIC_PI-Kokkos Kokkos_Lambda", @@ -93,9 +101,14 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) { // Original ATOMIC_PI kernel reference implementation // defined in ATOMIC_PI.hpp double x = (double(i) + 0.5) * dx; + // Make a reference to the 0th element of a 1D view with one + // element + // Atomic operation is an uninterruptable, single operation; e.g., + // addition, multiplication, division, etc. All of these atomic operations are architecture dependent. + // Atomics are advantageous from a correctness point of view Kokkos::atomic_add(&pi_view(0), dx / (1.0 + x * x)); }); - + // Moving the data on the device (held in the KokkosView) BACK to the pointer, pi. moveDataToHostFromKokkosView(pi, pi_view, 1); *pi *= 4.0; //*m_pi += *pi; From a4f2ef0ec8bd5141333aed1be01c77bd3813e74e Mon Sep 17 00:00:00 2001 From: Amy Jo Powell Date: Thu, 25 Mar 2021 08:16:10 -0600 Subject: [PATCH 062/124] changes needed for diverse builds --- CMakeLists.txt | 27 +++++++++-- .../snl-builds/caraway_rhel7_hipcc_4.0.0.sh | 48 +++++++++++++++++++ src/basic-kokkos/CMakeLists.txt | 14 ++++++ src/basic-kokkos/DAXPY-Kokkos.cpp | 2 +- src/basic-kokkos/TRAP_INT-Kokkos.cpp | 6 ++- 5 files changed, 90 insertions(+), 7 deletions(-) create mode 100755 scripts/snl-builds/caraway_rhel7_hipcc_4.0.0.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 6ae05b0d1..617f76452 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,8 +28,8 @@ set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) set(CMAKE_CUDA_HOST_LINK_LAUNCHER ${CMAKE_CUDA_COMPILER}) include(blt/SetupBLT.cmake) -set(CMAKE_CXX_STANDARD 14) -set(BLT_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) +set(BLT_CXX_STANDARD 17) include(blt/SetupBLT.cmake) # @@ -81,7 +81,8 @@ if (ENABLE_OPENMP) endif() if (ENABLE_CUDA) list(APPEND 
RAJA_PERFSUITE_DEPENDS cuda) -endif() +endif() +# HIP is used with AMD / VEGA GPU if (ENABLE_HIP) list(APPEND RAJA_PERFSUITE_DEPENDS hip) endif() @@ -114,9 +115,19 @@ configure_file(${CMAKE_SOURCE_DIR}/src/rajaperf_config.hpp.in set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) set (HIP_HIPCC_FLAGS ${RAJA_HIPCC_FLAGS}) +# HACKS TO FIX COMPILATION ISSUES +include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/tpl/RAJA/include/) + + + # ENABLE KOKKOS IS A RAJA PERFSUITE OPTION if(ENABLE_KOKKOS) add_definitions(-DRUN_KOKKOS) + if(ENABLE_HIP) + set(Kokkos_ENABLE_HIP ON CACHE BOOL "Kokkos builds with AMD HIP require a ... build...AJP FINISH") + set(Kokkos_ARCH_VEGA900 ON CACHE BOOL "Docstring") #TODO: better + #set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE /ascldap/users/ajpowel/RAJAPerf/amd_build/compiler_unscrewer) + endif() if(ENABLE_TARGET_OPENMP) set(Kokkos_ENABLE_OPENMPTARGET ON CACHE BOOL "Docstring") set(Kokkos_ARCH_VOLTA70 ON CACHE BOOL "Docstring") #TODO: better @@ -127,6 +138,13 @@ if(ENABLE_KOKKOS) message(FATAL_ERROR "Kokkos builds with OpenMPTarget require a Debug build to succeed at the moment. Rebuild with CMAKE_BUILD_TYPE=Debug. 
If you're a compiler developer, rebuild with -DEXPERIMENTAL_BUILD=ON") endif() endif() + + + + + + + #add_definitions(-DRAJA_ENABLE_TARGET_OPENMP) endif() @@ -138,7 +156,8 @@ if(ENABLE_KOKKOS) enable_language(CUDA) endif() if(ENABLE_OPENMP) - set(Kokkos_ENABLE_OPENMP CACHE BOOL ON) + #set(Kokkos_ENABLE_OPENMP CACHE BOOL ON) + set(Kokkos_ENABLE_OPENMP ON CACHE BOOL "Docstring") endif() add_subdirectory(tpl/kokkos) diff --git a/scripts/snl-builds/caraway_rhel7_hipcc_4.0.0.sh b/scripts/snl-builds/caraway_rhel7_hipcc_4.0.0.sh new file mode 100755 index 000000000..6ecde9d4f --- /dev/null +++ b/scripts/snl-builds/caraway_rhel7_hipcc_4.0.0.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +############################################################################### +# Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +################################################################################# + +BUILD_SUFFIX=snl_rhel7-hipcc-4.0.0 + +rm -rf build_${BUILD_SUFFIX} 2>/dev/null +mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX} + +################################## +#Caraway Build (AMD) +################################# +module purge + +module load cmake/3.19.3 + +module load git/2.9.4 + +################################## +# FOR COMPUTE NODE (caraway04 GPU): + +module load rocm/4.0.0 + +module load python/3.7.3 + +cmake \ +-DCMAKE_BUILD_TYPE=Release \ +-DENABLE_KOKKOS=ON \ +-DENABLE_HIP=ON \ +-DKokkos_ARCH_VEGA900=ON \ +-DCMAKE_CXX_FLAGS="--gcc-toolchain=/home/projects/x86-64/gcc/8.2.0/" \ +-DHIP_HIPCC_FLAGS="--gcc-toolchain=/home/projects/x86-64/gcc/8.2.0/ -std=c++17" \ +-DCMAKE_CXX_STANDARD=17 \ +-DCMAKE_CXX_COMPILER=hipcc .. 
\ + +make -j24;make + +cd bin/ +./raja-perf.exe + + + diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index 7f62c271f..9a0f9a7bd 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -24,3 +24,17 @@ blt_add_library( DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) +# Diagnostics +message (STATUS "${RAJA_PERFSUITE_DEPENDS}") +#blt_print_target_properties(TARGET basic-kokkos) +#blt_print_target_properties(TARGET kokkos) + +#kokkoscore;kokkoscontainers;kokkosalgorithms +#blt_print_target_properties(TARGET kokkoscore) +#blt_print_target_properties(TARGET kokkoscontainers) +#blt_print_target_properties(TARGET kokkosalgorithms) + +blt_print_target_properties(TARGET RAJA) + +get_source_file_property(blah ATOMIC_PI-Kokkos.cpp HIP_SOURCE_PROPERTY_FORMAT) +message (STATUS "DOGS1 - ${blah}") diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp index 754700d1c..d5d478032 100644 --- a/src/basic-kokkos/DAXPY-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY-Kokkos.cpp @@ -82,7 +82,7 @@ void DAXPY::runKokkosVariant(VariantID vid) DaxpyFunctor daxpy_functor_instance(y,x,a); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("DAXPY-KokkosSeq Kokkos_Functor_Seq", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("DAXPY-Kokkos Kokkos_Functor", Kokkos::RangePolicy(ibegin, iend), daxpy_functor_instance); } stopTimer(); diff --git a/src/basic-kokkos/TRAP_INT-Kokkos.cpp b/src/basic-kokkos/TRAP_INT-Kokkos.cpp index edcc27a89..63bd8fb0f 100644 --- a/src/basic-kokkos/TRAP_INT-Kokkos.cpp +++ b/src/basic-kokkos/TRAP_INT-Kokkos.cpp @@ -113,8 +113,10 @@ void TRAP_INT::runKokkosVariant(VariantID vid) Real_type trap_integral_val = m_sumx_init; - Kokkos::parallel_reduce("TRAP_INT_KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - [=] (const int64_t i, Real_type& sumx) {TRAP_INT_BODY}, trap_integral_val + Kokkos::parallel_reduce("TRAP_INT_Kokkos 
Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + [=] (const int64_t i, Real_type& sumx) {TRAP_INT_BODY}, + trap_integral_val ); m_sumx += static_cast(trap_integral_val) * h; From 210bd2ccfb3ae0a3a8fd55d7f45f30d89665382b Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Thu, 25 Mar 2021 12:21:49 -0600 Subject: [PATCH 063/124] Candidate fixes for builds --- CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 617f76452..39c7e707f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,8 +28,8 @@ set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) set(CMAKE_CUDA_HOST_LINK_LAUNCHER ${CMAKE_CUDA_COMPILER}) include(blt/SetupBLT.cmake) -set(CMAKE_CXX_STANDARD 17) -set(BLT_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 14) +set(BLT_CXX_STANDARD 14) include(blt/SetupBLT.cmake) # @@ -132,6 +132,7 @@ if(ENABLE_KOKKOS) set(Kokkos_ENABLE_OPENMPTARGET ON CACHE BOOL "Docstring") set(Kokkos_ARCH_VOLTA70 ON CACHE BOOL "Docstring") #TODO: better set(CMAKE_CXX_STANDARD 17) + set(BLT_CXX_STANDARD 17) set(RAJA_ENABLE_TARGET_OPENMP ON CACHE BOOL "Docstring") if(NOT CMAKE_BUILD_TYPE MATCHES Debug) if(NOT EXPERIMENTAL_BUILD) From 28c76e5fb496d7d38aa452396febb61828ff5a1e Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 25 Mar 2021 12:55:06 -0600 Subject: [PATCH 064/124] commented incomplete code to fix RPS builds --- src/common/Executor.cpp | 23 +++++++++++++++++++++++ src/common/Executor.hpp | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index e3d198ea2..e1578a298 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -49,6 +49,29 @@ Executor::~Executor() #endif } +// New functions for Kokkos to register new group and kernel IDs + +/* +groupID Executor::registerGroup(std::string) +{ + + + return getNewGroupID(); + +} + +kernelID Executor::registerKernel(std::string, groupID groupName, KernelBase*) +{ + + 
return getNewKernelID(); +} + +*/ + + + + + void Executor::setupSuite() { diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index e25800636..1485fff15 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -43,6 +43,18 @@ class Executor void outputRunData(); + // Interface for adding new Kokkos groups and kernels +/* + using groupID = int; + using kernelID = int; + + + groupID registerGroup(std::string groupName); + + kernelID registerKernel(std::string, groupID groupName, KernelBase*); +*/ + + private: Executor() = delete; @@ -71,6 +83,27 @@ class Executor void writeFOMReport(const std::string& filename); void getFOMGroups(std::vector& fom_groups); + // Kokkos add group and kernel ID functions + /* + inline groupID getNewGroupID() { + // The newGroupID will be shared amongst invocations of this + // function. + static groupID newGroupID; + + return newGroupID++; + + } + + inline kernelID getNewKernelID() { + + static kernelID newKernelID; + return newKernelID++; + + } + +*/ + + // Data members RunParams run_params; std::vector kernels; std::vector variant_ids; From 97f9e5dadec39175ccd236e34a15c871633f8da0 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 25 Mar 2021 14:03:33 -0600 Subject: [PATCH 065/124] DAXPY: comment out functor --- src/basic-kokkos/DAXPY-Kokkos.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp index d5d478032..fd6569200 100644 --- a/src/basic-kokkos/DAXPY-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY-Kokkos.cpp @@ -63,7 +63,8 @@ void DAXPY::runKokkosVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("DAXPY-Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), + Kokkos::parallel_for("DAXPY-Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), // Increment y_view (pointer wrapped in KokksView) // by product of a and ith entry of x_view // DAXPY_BODY 
substituted with the @@ -78,19 +79,21 @@ void DAXPY::runKokkosVariant(VariantID vid) break; } - case Kokkos_Functor: { +/* case Kokkos_Functor: { DaxpyFunctor daxpy_functor_instance(y,x,a); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { Kokkos::parallel_for("DAXPY-Kokkos Kokkos_Functor", Kokkos::RangePolicy(ibegin, iend), daxpy_functor_instance); } + stopTimer(); break; } +*/ default : { std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; } From 4dbf1d0d86d9188b3115f6da1e8847ce9712bd49 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 29 Mar 2021 12:26:36 -0600 Subject: [PATCH 066/124] interface prototype that builds --- src/common/Executor.cpp | 102 +++++++++++++++++++++++++++++++++++++--- src/common/Executor.hpp | 28 ++++++++--- 2 files changed, 118 insertions(+), 12 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index e1578a298..0d41439cc 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -51,22 +51,112 @@ Executor::~Executor() // New functions for Kokkos to register new group and kernel IDs -/* -groupID Executor::registerGroup(std::string) + +Executor::groupID Executor::registerGroup(std::string groupName) { - + auto checkIfGroupExists = kernelsPerGroup.find(groupName); + if (checkIfGroupExists == kernelsPerGroup.end()){ + kernelsPerGroup[groupName] = kernelSet(); +} + else { + // TODO: ERROR CONDITION + +} return getNewGroupID(); } -kernelID Executor::registerKernel(std::string, groupID groupName, KernelBase*) +// New function +Executor::kernelID Executor::registerKernel(std::string kernelName, std::string groupName, KernelBase* kernel) { + + + auto checkIfKernelExists = allKernels.find(kernelName); + if (checkIfKernelExists == allKernels.end()) { + allKernels[kernelName] = kernel; +} + else { + // TODO: ERROR CONDITION + return getNewKernelID(); +} + // Add the kernel to its group + auto checkIfGroupExists = kernelsPerGroup.find(groupName); + if (checkIfGroupExists == 
kernelsPerGroup.end()){ + // If group does not exist, ERROR CONDITION +} - return getNewKernelID(); +else { + // + checkIfGroupExists -> second.insert(kernel); + +} +return getNewKernelID(); } -*/ + +std::vector Executor::lookUpKernelByName(std::string kernelOrGroupName){ + + // The vector / list return type, std::vector will contain + // either all of the kernels with a given kernel name or group name + // We have two maps (defined in Executor.hpp): kernelMap allKernels, groupMap kernelsPerGroup, + // STEPS: + // 1) declare new vector that will contain the string data: + // 2) LOGIC: + // i) check to see if the kernel / group requested on the + // "./rajaperf.exe -k" line (you can pass either a specific kernel or a + // kernel group + + std::vector kernelsByNameVect ; + + // If kernelName is groupName , then add that set of kernels in the + // group to the vector + // else if kernelName is kernel, then add the kernel to the vector + // else if kernelName is horse stuff, then say so + // + // + // Declare iterator against which you can test equivalence + auto checkLookUpGroupNameIterator = kernelsPerGroup.find(kernelOrGroupName); + auto checkLookUpKernelNameIterator = allKernels.find(kernelOrGroupName); + + // Check to see if groupName NOT in kernelsPerGroup; + // end() iterates to the end + if (checkLookUpGroupNameIterator != kernelsPerGroup.end()) { + //cout << " STEP 1" << endl; + + // when using the arrow, you get a key, value pair. 
+ // YOu can access either member by "first" or "second" + + auto groupSetForTests = checkLookUpGroupNameIterator -> second; + + for (auto item: groupSetForTests) { + kernelsByNameVect.push_back(item); + } + } + + else if (checkLookUpKernelNameIterator != allKernels.end()) { + + auto kernelSetForTests = checkLookUpKernelNameIterator -> second; + + kernelsByNameVect.push_back(kernelSetForTests); + + + } + + else { + + //TODO: ERROR CASE; + + exit(1); + + + } + + +return kernelsByNameVect; + + +} diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 1485fff15..8c71279bb 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -44,16 +44,26 @@ class Executor void outputRunData(); // Interface for adding new Kokkos groups and kernels -/* + using groupID = int; + using kernelSet = std::set; + using kernelMap = std::map; + using groupMap = std::map; using kernelID = int; + /////////////////////////////////////////////////// + // + // Logic: + // Need the full set of kernels + // Associate group names (e.g., lcals, basic) with kernel sets + // Interface to add new kernels (e.g., DAXPY) and groups (basic) + // for Kokkos Performance Testing groupID registerGroup(std::string groupName); - kernelID registerKernel(std::string, groupID groupName, KernelBase*); -*/ + kernelID registerKernel(std::string, std::string groupName, KernelBase*); + std::vector lookUpKernelByName(std::string kernelOrGroupName); private: Executor() = delete; @@ -83,8 +93,9 @@ class Executor void writeFOMReport(const std::string& filename); void getFOMGroups(std::vector& fom_groups); - // Kokkos add group and kernel ID functions - /* + // Kokkos add group and kernel ID inline functions + // Provisional Design for Kokkos + inline groupID getNewGroupID() { // The newGroupID will be shared amongst invocations of this // function. 
@@ -101,7 +112,7 @@ class Executor } -*/ + // Data members RunParams run_params; @@ -109,6 +120,11 @@ class Executor std::vector variant_ids; VariantID reference_vid; + + kernelMap allKernels; + groupMap kernelsPerGroup; + + }; } // closing brace for rajaperf namespace From a3196d68e05f838608bcdc419a4558a374c205ef Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 6 Apr 2021 08:56:08 -0600 Subject: [PATCH 067/124] Added infrastructure for make_kernel_base, and support for it in the Executor class --- src/common/Executor.cpp | 3 +- src/common/Executor.hpp | 2 +- src/common/KernelBase.cpp | 19 ++++++- src/common/KernelBase.hpp | 2 + src/common/QuickKernelBase.hpp | 92 ++++++++++++++++++++++++++++++++++ 5 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 src/common/QuickKernelBase.hpp diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 0d41439cc..f9a0750f8 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -68,9 +68,10 @@ Executor::groupID Executor::registerGroup(std::string groupName) } // New function -Executor::kernelID Executor::registerKernel(std::string kernelName, std::string groupName, KernelBase* kernel) +Executor::kernelID Executor::registerKernel(std::string groupName, KernelBase* kernel) { + auto kernelName = kernel->getName(); auto checkIfKernelExists = allKernels.find(kernelName); if (checkIfKernelExists == allKernels.end()) { diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 8c71279bb..cefdc3077 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -61,7 +61,7 @@ class Executor groupID registerGroup(std::string groupName); - kernelID registerKernel(std::string, std::string groupName, KernelBase*); + kernelID registerKernel(std::string, KernelBase*); std::vector lookUpKernelByName(std::string kernelOrGroupName); diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 5c4637d23..83e4eb98e 100644 --- a/src/common/KernelBase.cpp +++ 
b/src/common/KernelBase.cpp @@ -32,6 +32,24 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) } } + KernelBase::KernelBase(std::string& name, const RunParams& params) + : run_params(params), + kernel_id(Basic_DAXPY), // TODO DZP: better + name(name), + default_size(0), + default_reps(0), + running_variant(NumVariants) + { + for (size_t ivar = 0; ivar < NumVariants; ++ivar) { + checksum[ivar] = 0.0; + num_exec[ivar] = 0; + min_time[ivar] = std::numeric_limits::max(); + max_time[ivar] = -std::numeric_limits::max(); + tot_time[ivar] = 0.0; + has_variant_to_run[ivar] = false; + } + } + KernelBase::~KernelBase() { @@ -68,7 +86,6 @@ void KernelBase::execute(VariantID vid) this->setUp(vid); #ifdef RUN_KOKKOS Kokkos::Tools::pushRegion(this->getName() + ":"+getVariantName(vid)); - #endif this->runKernel(vid); #ifdef RUN_KOKKOS diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 4c27f09df..69c05bfbd 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -33,11 +33,13 @@ class KernelBase public: KernelBase(KernelID kid, const RunParams& params); + KernelBase(std::string& name, const RunParams& params); virtual ~KernelBase(); KernelID getKernelID() const { return kernel_id; } const std::string& getName() const { return name; } + void setName(const std::string& new_name) { name = new_name; } Index_type getDefaultSize() const { return default_size; } Index_type getDefaultReps() const { return default_reps; } diff --git a/src/common/QuickKernelBase.hpp b/src/common/QuickKernelBase.hpp new file mode 100644 index 000000000..b8736e1c2 --- /dev/null +++ b/src/common/QuickKernelBase.hpp @@ -0,0 +1,92 @@ +#ifndef RAJAPERFSUITE_QUICKKERNELBASE_HPP +#define RAJAPERFSUITE_QUICKKERNELBASE_HPP + +#include "KernelBase.hpp" +#include + +namespace rajaperf { + + struct SureBuddyOkay { + bool validate_checksum(double reference, double variant) { + return true; + } + }; + + template + class QuickKernelBase : public rajaperf::KernelBase { + 
SetUp m_setup; + Execute m_execute; + Checksum m_checksum; + using runData = decltype(m_setup(0, 0)); + runData rd; + public: + QuickKernelBase(std::string &name, const RunParams ¶ms, SetUp se, Execute ex, Checksum ch) : KernelBase( + name, + params), + m_setup(se), + m_execute(ex), + m_checksum( + ch) {} + + QuickKernelBase(std::string &name, const RunParams ¶ms, SetUp se, Execute ex) : KernelBase(name, + params), + m_setup(se), + m_execute(ex), + m_checksum( + SureBuddyOkay()) {} + + Real_type m_y; + + void setUp(VariantID vid) override { rd = m_setup(0, 0); } + + void updateChecksum(VariantID vid) override { + checksum[vid] += m_y; + } + + void tearDown(VariantID vID) override {} + + void runSeqVariant(VariantID vID) override {} + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + void runOpenMPVariant(VariantID vid) override { + auto size = getRunSize(); + for(int x =0; x< getRunReps(); ++x){ + m_execute(x, size) + } + } +#endif +#if defined(RAJA_ENABLE_CUDA) + void runCudaVariant(VariantID vid) override {} +#endif +#if defined(RAJA_ENABLE_HIP) + void runHipVariant(VariantID vid) override {} +#endif +#if defined(RAJA_ENABLE_TARGET_OPENMP) + void runOpenMPTargetVariant(VariantID vid) override {} +#endif + +#if defined(RUN_KOKKOS) + using index_seq = std::make_index_sequence::value>; + + template + void rkv_helper(std::index_sequence) { + auto size = getRunSize(); + for (int x = 0; x < getRunReps(); ++x) { + m_execute(x, size, std::get(rd)...); + } + } + + void runKokkosVariant(VariantID vid) override { + rkv_helper(index_seq()); + } + +#endif // RUN_KOKKOS + }; + + template + KernelBase *make_kernel_base(std::string name, const RunParams ¶ms, Lambdas... 
lambdas) { + return new QuickKernelBase(name, params, lambdas...); + } + +} // end namespace rajaperf +#endif //RAJAPERFSUITE_QUICKKERNELBASE_HPP From ae6286dda5901df9852a87eef42f6e1a237f2ed2 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 6 Apr 2021 09:53:17 -0600 Subject: [PATCH 068/124] Added infrastructure for make_kernel_base, and support for it in the Executor class --- src/common/QuickKernelBase.hpp | 38 ++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/src/common/QuickKernelBase.hpp b/src/common/QuickKernelBase.hpp index b8736e1c2..dc0cc7cef 100644 --- a/src/common/QuickKernelBase.hpp +++ b/src/common/QuickKernelBase.hpp @@ -17,7 +17,11 @@ namespace rajaperf { SetUp m_setup; Execute m_execute; Checksum m_checksum; - using runData = decltype(m_setup(0, 0)); + struct empty { + }; + using runData_helper = decltype(m_setup(0, 0)); + using runData = typename std::conditional::value, empty, runData_helper>::type; + using is_empty = std::is_same; runData rd; public: QuickKernelBase(std::string &name, const RunParams ¶ms, SetUp se, Execute ex, Checksum ch) : KernelBase( @@ -37,7 +41,16 @@ namespace rajaperf { Real_type m_y; - void setUp(VariantID vid) override { rd = m_setup(0, 0); } + void setUpHelper(std::true_type) { + } + + void setUpHelper(std::false_type) { + rd = m_setup(0, 0); + } + + void setUp(VariantID vid) override { + setUpHelper(is_empty()); + } void updateChecksum(VariantID vid) override { checksum[vid] += m_y; @@ -66,7 +79,6 @@ namespace rajaperf { #endif #if defined(RUN_KOKKOS) - using index_seq = std::make_index_sequence::value>; template void rkv_helper(std::index_sequence) { @@ -76,10 +88,28 @@ namespace rajaperf { } } - void runKokkosVariant(VariantID vid) override { + void rkv_helper(empty em) { + auto size = getRunSize(); + for (int x = 0; x < getRunReps(); ++x) { + m_execute(x, size); + } + } + + void rkv_switch_on_empty(std::false_type) { + using index_seq = typename + 
std::make_index_sequence::value>; rkv_helper(index_seq()); } + void rkv_switch_on_empty(std::true_type) { + rkv_helper(empty()); + + } + + void runKokkosVariant(VariantID vid) override { + rkv_switch_on_empty(is_empty()); + } + #endif // RUN_KOKKOS }; From 6f16423ef24f628107d058d4cce91277de2043b9 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 6 Apr 2021 10:49:53 -0600 Subject: [PATCH 069/124] Small fix to CMakeLists --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fdf65de1d..344dff0a7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,7 +16,7 @@ add_subdirectory(basic-kokkos) #add_subdirectory(lcals) #add_subdirectory(polybench) #add_subdirectory(stream) -add_subdirectory(algorithm) +#add_subdirectory(algorithm) set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS common From 05d6ab76104ad6824d43f42a93553877dd22f757 Mon Sep 17 00:00:00 2001 From: Amy Jo Powell Date: Wed, 7 Apr 2021 11:56:40 -0600 Subject: [PATCH 070/124] add KOKKOS_FUNCTION to TRAP_INT-Kokkos.cpp --- blt | 2 +- src/basic-kokkos/TRAP_INT-Kokkos.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/blt b/blt index 2c192774b..f50d586dd 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 2c192774b587c245ec2d7022b2e862395ffa8a21 +Subproject commit f50d586dda77e6de70197d1470149621681cc580 diff --git a/src/basic-kokkos/TRAP_INT-Kokkos.cpp b/src/basic-kokkos/TRAP_INT-Kokkos.cpp index 984e5e82a..bf882da0d 100644 --- a/src/basic-kokkos/TRAP_INT-Kokkos.cpp +++ b/src/basic-kokkos/TRAP_INT-Kokkos.cpp @@ -21,6 +21,8 @@ namespace basic // Function used in TRAP_INT loop. 
// RAJA_INLINE +// +KOKKOS_FUNCTION Real_type trap_int_func(Real_type x, Real_type y, Real_type xp, From 8c3519967178cdc1ac62fadbfc9d68c18bcfb64d Mon Sep 17 00:00:00 2001 From: Amy Jo Powell Date: Wed, 7 Apr 2021 16:25:14 -0600 Subject: [PATCH 071/124] updating rocprim in RAJA --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 0502b9b69..297843f9d 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 0502b9b69c4cb60aa0afbdf699b555c76cb18f22 +Subproject commit 297843f9d358809f5767fb1fa6cd413ca424dfc6 From 18253c3bc4981aced0d04c2c3e06ea1f62d72ffa Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Wed, 7 Apr 2021 16:28:46 -0600 Subject: [PATCH 072/124] Added infrastructure for make_kernel_base, and support for it in the Executor class --- src/RAJAPerfSuiteDriver.cpp | 20 +- src/common/Executor.cpp | 7 +- src/common/Executor.hpp | 3 + src/common/KernelBase.cpp | 2 +- src/common/KernelBase.hpp | 2 +- src/common/RAJAPerfSuite.cpp | 385 ++++++++++++++++++++--------------- src/common/RAJAPerfSuite.hpp | 7 +- 7 files changed, 262 insertions(+), 164 deletions(-) diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index e73ec2260..98f122d4a 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -7,14 +7,32 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include "common/Executor.hpp" - +#include "common/QuickKernelBase.hpp" #include //------------------------------------------------------------------------------ int main( int argc, char** argv ) { // STEP 1: Create suite executor object + //rajaperf::Executor executor(argc, argv); rajaperf::Executor executor(argc, argv); + rajaperf::make_perfsuite_executor(&executor, argc, argv); + //rajaperf::RunParams params(argc, argv); + //executor.registerGroup("Sparse"); + + //executor.registerKernel("Sparse", rajaperf::make_kernel_base( + // "Sparse_SPMV", params, [&](const int repfact, 
const int size){ + // }, + // [&] (const int repfact, const int size) {} + // )); + // executor.registerKernel("Sparse", rajaperf::make_kernel_base( + // "Sparse_SPMM", params, [&](const int repfact, const int size){ + // return std::make_tuple(1); + // }, + // [&] (const int repfact, const int size, auto matrix) { + // // do the math using Kokkos Kernels operators + // } + // )); // STEP 2: Assemble kernels and variants to run executor.setupSuite(); diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index f9a0750f8..b25ef6553 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -1102,5 +1102,10 @@ void Executor::getFOMGroups(vector& fom_groups) } - +void free_register_group(Executor* exec, std::string groupName){ + exec->registerGroup(groupName); +} +void free_register_kernel(Executor* exec, std::string groupName, KernelBase* kernel) { + exec->registerKernel(groupName, kernel); +} } // closing brace for rajaperf namespace diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index cefdc3077..2ae2e5ffc 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -127,6 +127,9 @@ class Executor }; +void free_register_group(Executor*, std::string); +void free_register_kernel(Executor*, std::string, KernelBase*); + } // closing brace for rajaperf namespace #endif // closing endif for header file include guard diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index a86f72583..73755eb0d 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -32,7 +32,7 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) } } - KernelBase::KernelBase(std::string& name, const RunParams& params) + KernelBase::KernelBase(std::string name, const RunParams& params) : run_params(params), kernel_id(Basic_DAXPY), // TODO DZP: better name(name), diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 9e11f0afc..36cbb14e5 100644 --- a/src/common/KernelBase.hpp +++ 
b/src/common/KernelBase.hpp @@ -39,7 +39,7 @@ class KernelBase public: KernelBase(KernelID kid, const RunParams& params); - KernelBase(std::string& name, const RunParams& params); + KernelBase(std::string name, const RunParams& params); virtual ~KernelBase(); diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index a2a4127ed..fd510e7f4 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -87,8 +87,81 @@ #include -namespace rajaperf -{ +namespace rajaperf { + void make_perfsuite_executor(rajaperf::Executor *exec, int argc, char *argv[]) { + RunParams run_params(argc, argv); + free_register_group(exec, std::string("Basic")); + free_register_group(exec, std::string("Lcals")); + free_register_group(exec, std::string("Polybench")); + free_register_group(exec, std::string("Stream")); + free_register_group(exec, std::string("Apps")); + free_register_group(exec, std::string("Algorithm")); + + // Basic + + free_register_kernel(exec, "Basic", new basic::ATOMIC_PI(run_params)); + free_register_kernel(exec, "Basic", new basic::DAXPY(run_params)); + free_register_kernel(exec, "Basic", new basic::IF_QUAD(run_params)); + free_register_kernel(exec, "Basic", new basic::IF_QUAD(run_params)); + free_register_kernel(exec, "Basic", new basic::INIT3(run_params)); + free_register_kernel(exec, "Basic", new basic::INIT_VIEW1D(run_params)); + free_register_kernel(exec, "Basic", new basic::INIT_VIEW1D_OFFSET(run_params)); + free_register_kernel(exec, "Basic", new basic::MULADDSUB(run_params)); + free_register_kernel(exec, "Basic", new basic::NESTED_INIT(run_params)); + free_register_kernel(exec, "Basic", new basic::REDUCE3_INT(run_params)); + free_register_kernel(exec, "Basic", new basic::TRAP_INT(run_params)); + /** + // Lcals + free_register_kernel(exec, "Lcals", new lcals::DIFF_PREDICT(run_params)); + free_register_kernel(exec, "Lcals", new lcals::EOS(run_params)); + free_register_kernel(exec, "Lcals", new lcals::FIRST_DIFF(run_params)); + 
free_register_kernel(exec, "Lcals", new lcals::FIRST_MIN(run_params)); + free_register_kernel(exec, "Lcals", new lcals::FIRST_SUM(run_params)); + free_register_kernel(exec, "Lcals", new lcals::GEN_LIN_RECUR(run_params)); + free_register_kernel(exec, "Lcals", new lcals::HYDRO_1D(run_params)); + free_register_kernel(exec, "Lcals", new lcals::HYDRO_2D(run_params)); + free_register_kernel(exec, "Lcals", new lcals::INT_PREDICT(run_params)); + free_register_kernel(exec, "Lcals", new lcals::PLANCKIAN(run_params)); + free_register_kernel(exec, "Lcals", new lcals::TRIDIAG_ELIM(run_params)); + + // Polybench + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_2MM(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_3MM(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_ADI(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_ATAX(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_FDTD_2D(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_FLOYD_WARSHALL(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_GEMM(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_GEMVER(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_GESUMMV(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_HEAT_3D(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_JACOBI_1D(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_JACOBI_2D(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_MVT(run_params)); + + // Stream + free_register_kernel(exec, "Stream", new stream::ADD(run_params)); + free_register_kernel(exec, "Stream", new stream::COPY(run_params)); + free_register_kernel(exec, "Stream", new stream::DOT(run_params)); + 
free_register_kernel(exec, "Stream", new stream::MUL(run_params)); + free_register_kernel(exec, "Stream", new stream::TRIAD(run_params)); + + // Apps + free_register_kernel(exec, "Apps", new apps::COUPLE(run_params)); + free_register_kernel(exec, "Apps", new apps::DEL_DOT_VEC_2D(run_params)); + free_register_kernel(exec, "Apps", new apps::ENERGY(run_params)); + free_register_kernel(exec, "Apps", new apps::FIR(run_params)); + free_register_kernel(exec, "Apps", new apps::HALOEXCHANGE(run_params)); + free_register_kernel(exec, "Apps", new apps::LTIMES(run_params)); + free_register_kernel(exec, "Apps", new apps::LTIMES_NOVIEW(run_params)); + free_register_kernel(exec, "Apps", new apps::PRESSURE(run_params)); + free_register_kernel(exec, "Apps", new apps::VOL3D(run_params)); + + // Algorithm + free_register_kernel(exec, "Algorithm", new algorithm::SORT(run_params)); + free_register_kernel(exec, "Algorithm", new algorithm::SORTPAIRS(run_params)); + */ + } /*! ******************************************************************************* @@ -102,18 +175,18 @@ namespace rajaperf * ******************************************************************************* */ -static const std::string GroupNames [] = -{ - std::string("Basic"), - std::string("Lcals"), - std::string("Polybench"), - std::string("Stream"), - std::string("Apps"), - std::string("Algorithm"), + static const std::string GroupNames[] = + { + std::string("Basic"), + std::string("Lcals"), + std::string("Polybench"), + std::string("Stream"), + std::string("Apps"), + std::string("Algorithm"), - std::string("Unknown Group") // Keep this at the end and DO NOT remove.... + std::string("Unknown Group") // Keep this at the end and DO NOT remove.... -}; // END GroupNames + }; // END GroupNames /*! 
@@ -128,22 +201,22 @@ static const std::string GroupNames [] = * ******************************************************************************* */ -static const std::string KernelNames [] = -{ + static const std::string KernelNames[] = + { // // Basic kernels... // - std::string("Basic_ATOMIC_PI"), - std::string("Basic_DAXPY"), - std::string("Basic_IF_QUAD"), - std::string("Basic_INIT3"), - std::string("Basic_INIT_VIEW1D"), - std::string("Basic_INIT_VIEW1D_OFFSET"), - std::string("Basic_MULADDSUB"), - std::string("Basic_NESTED_INIT"), - std::string("Basic_REDUCE3_INT"), - std::string("Basic_TRAP_INT"), + std::string("Basic_ATOMIC_PI"), + std::string("Basic_DAXPY"), + std::string("Basic_IF_QUAD"), + std::string("Basic_INIT3"), + std::string("Basic_INIT_VIEW1D"), + std::string("Basic_INIT_VIEW1D_OFFSET"), + std::string("Basic_MULADDSUB"), + std::string("Basic_NESTED_INIT"), + std::string("Basic_REDUCE3_INT"), + std::string("Basic_TRAP_INT"), // // Lcals kernels... @@ -204,9 +277,9 @@ static const std::string KernelNames [] = // std::string("Algorithm_SORT"), // std::string("Algorithm_SORTPAIRS"), - std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... + std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... -}; // END KernelNames + }; // END KernelNames /*! 
@@ -221,36 +294,36 @@ static const std::string KernelNames [] = * ******************************************************************************* */ -static const std::string VariantNames [] = -{ + static const std::string VariantNames[] = + { - std::string("Base_Seq"), - std::string("Lambda_Seq"), - std::string("RAJA_Seq"), + std::string("Base_Seq"), + std::string("Lambda_Seq"), + std::string("RAJA_Seq"), - std::string("Base_OpenMP"), - std::string("Lambda_OpenMP"), - std::string("RAJA_OpenMP"), + std::string("Base_OpenMP"), + std::string("Lambda_OpenMP"), + std::string("RAJA_OpenMP"), - std::string("Base_OMPTarget"), - std::string("RAJA_OMPTarget"), + std::string("Base_OMPTarget"), + std::string("RAJA_OMPTarget"), - std::string("Base_CUDA"), - std::string("Lambda_CUDA"), - std::string("RAJA_CUDA"), - std::string("RAJA_WORKGROUP_CUDA"), + std::string("Base_CUDA"), + std::string("Lambda_CUDA"), + std::string("RAJA_CUDA"), + std::string("RAJA_WORKGROUP_CUDA"), - std::string("Base_HIP"), - std::string("Lambda_HIP"), - std::string("RAJA_HIP"), - std::string("RAJA_WORKGROUP_HIP"), + std::string("Base_HIP"), + std::string("Lambda_HIP"), + std::string("RAJA_HIP"), + std::string("RAJA_WORKGROUP_HIP"), - std::string("Kokkos_Lambda"), - std::string("Kokkos_Functor"), + std::string("Kokkos_Lambda"), + std::string("Kokkos_Functor"), - std::string("Unknown Variant") // Keep this at the end and DO NOT remove.... + std::string("Unknown Variant") // Keep this at the end and DO NOT remove.... 
-}; // END VariantNames + }; // END VariantNames /* @@ -260,10 +333,9 @@ static const std::string VariantNames [] = * ******************************************************************************* */ -const std::string& getGroupName(GroupID sid) -{ - return GroupNames[sid]; -} + const std::string &getGroupName(GroupID sid) { + return GroupNames[sid]; + } /* @@ -273,12 +345,11 @@ const std::string& getGroupName(GroupID sid) * ******************************************************************************* */ -std::string getKernelName(KernelID kid) -{ - std::string::size_type pos = KernelNames[kid].find("_"); - std::string kname(KernelNames[kid].substr(pos+1, std::string::npos)); - return kname; -} + std::string getKernelName(KernelID kid) { + std::string::size_type pos = KernelNames[kid].find("_"); + std::string kname(KernelNames[kid].substr(pos + 1, std::string::npos)); + return kname; + } /* @@ -288,10 +359,9 @@ std::string getKernelName(KernelID kid) * ******************************************************************************* */ -const std::string& getFullKernelName(KernelID kid) -{ - return KernelNames[kid]; -} + const std::string &getFullKernelName(KernelID kid) { + return KernelNames[kid]; + } /* @@ -301,10 +371,9 @@ const std::string& getFullKernelName(KernelID kid) * ******************************************************************************* */ -const std::string& getVariantName(VariantID vid) -{ - return VariantNames[vid]; -} + const std::string &getVariantName(VariantID vid) { + return VariantNames[vid]; + } /*! 
******************************************************************************* @@ -314,62 +383,61 @@ const std::string& getVariantName(VariantID vid) * ******************************************************************************* */ -bool isVariantAvailable(VariantID vid) -{ - bool ret_val = false; + bool isVariantAvailable(VariantID vid) { + bool ret_val = false; - if ( vid == Base_Seq ) { - ret_val = true; - } + if (vid == Base_Seq) { + ret_val = true; + } #if defined(RUN_RAJA_SEQ) - if ( vid == Lambda_Seq || - vid == RAJA_Seq ) { - ret_val = true; - } + if (vid == Lambda_Seq || + vid == RAJA_Seq) { + ret_val = true; + } #endif #if defined(RUN_KOKKOS) - if ( vid == Kokkos_Lambda || - vid == Kokkos_Functor ) { - ret_val = true; - } + if (vid == Kokkos_Lambda || + vid == Kokkos_Functor) { + ret_val = true; + } #endif // RUN_KOKKOS #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - if ( vid == Base_OpenMP || - vid == Lambda_OpenMP || - vid == RAJA_OpenMP ) { - ret_val = true; - } + if ( vid == Base_OpenMP || + vid == Lambda_OpenMP || + vid == RAJA_OpenMP ) { + ret_val = true; + } #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - if ( vid == Base_OpenMPTarget || - vid == RAJA_OpenMPTarget ) { - ret_val = true; - } + if ( vid == Base_OpenMPTarget || + vid == RAJA_OpenMPTarget ) { + ret_val = true; + } #endif #if defined(RAJA_ENABLE_CUDA) - if ( vid == Base_CUDA || - vid == Lambda_CUDA || - vid == RAJA_CUDA || - vid == RAJA_WORKGROUP_CUDA ) { - ret_val = true; - } + if ( vid == Base_CUDA || + vid == Lambda_CUDA || + vid == RAJA_CUDA || + vid == RAJA_WORKGROUP_CUDA ) { + ret_val = true; + } #endif #if defined(RAJA_ENABLE_HIP) - if ( vid == Base_HIP || - vid == Lambda_HIP || - vid == RAJA_HIP || - vid == RAJA_WORKGROUP_HIP ) { - ret_val = true; - } + if ( vid == Base_HIP || + vid == Lambda_HIP || + vid == RAJA_HIP || + vid == RAJA_WORKGROUP_HIP ) { + ret_val = true; + } #endif - return ret_val; -} + return ret_val; + } /* 
******************************************************************************* @@ -378,56 +446,55 @@ bool isVariantAvailable(VariantID vid) * ******************************************************************************* */ -KernelBase* getKernelObject(KernelID kid, - const RunParams& run_params) -{ - KernelBase* kernel = 0; - - switch ( kid ) { - - // - // Basic kernels... - // - case Basic_ATOMIC_PI : { - kernel = new basic::ATOMIC_PI(run_params); - break; - } - case Basic_DAXPY : { - kernel = new basic::DAXPY(run_params); - break; - } - case Basic_IF_QUAD : { - kernel = new basic::IF_QUAD(run_params); - break; - } - case Basic_INIT3 : { - kernel = new basic::INIT3(run_params); - break; - } - case Basic_INIT_VIEW1D : { - kernel = new basic::INIT_VIEW1D(run_params); - break; - } - case Basic_INIT_VIEW1D_OFFSET : { - kernel = new basic::INIT_VIEW1D_OFFSET(run_params); - break; - } - case Basic_MULADDSUB : { - kernel = new basic::MULADDSUB(run_params); - break; - } - case Basic_NESTED_INIT : { - kernel = new basic::NESTED_INIT(run_params); - break; - } - case Basic_REDUCE3_INT : { - kernel = new basic::REDUCE3_INT(run_params); - break; - } - case Basic_TRAP_INT : { - kernel = new basic::TRAP_INT(run_params); - break; - } + KernelBase *getKernelObject(KernelID kid, + const RunParams &run_params) { + KernelBase *kernel = 0; + + switch (kid) { + + // + // Basic kernels... 
+ // + case Basic_ATOMIC_PI : { + kernel = new basic::ATOMIC_PI(run_params); + break; + } + case Basic_DAXPY : { + kernel = new basic::DAXPY(run_params); + break; + } + case Basic_IF_QUAD : { + kernel = new basic::IF_QUAD(run_params); + break; + } + case Basic_INIT3 : { + kernel = new basic::INIT3(run_params); + break; + } + case Basic_INIT_VIEW1D : { + kernel = new basic::INIT_VIEW1D(run_params); + break; + } + case Basic_INIT_VIEW1D_OFFSET : { + kernel = new basic::INIT_VIEW1D_OFFSET(run_params); + break; + } + case Basic_MULADDSUB : { + kernel = new basic::MULADDSUB(run_params); + break; + } + case Basic_NESTED_INIT : { + kernel = new basic::NESTED_INIT(run_params); + break; + } + case Basic_REDUCE3_INT : { + kernel = new basic::REDUCE3_INT(run_params); + break; + } + case Basic_TRAP_INT : { + kernel = new basic::TRAP_INT(run_params); + break; + } /** DZP: big comment block for unimplemented // // Lcals kernels... @@ -609,13 +676,13 @@ KernelBase* getKernelObject(KernelID kid, break; } */ - default: { - std::cout << "\n Unknown Kernel ID = " << kid << std::endl; - } + default: { + std::cout << "\n Unknown Kernel ID = " << kid << std::endl; + } - } // end switch on kernel id + } // end switch on kernel id - return kernel; -} + return kernel; + } } // closing brace for rajaperf namespace diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 13b8e5e4a..9417954fb 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -20,10 +20,15 @@ #endif #include - namespace rajaperf { +class Executor; // forward declaration +class KernelBase; + +void free_register_group(Executor*, std::string); // forward declaration +void free_register_kernel(Executor*, std::string, KernelBase*); // forward declaration +void make_perfsuite_executor(Executor* exec, int argc, char* argv[]); #if defined(RUN_KOKKOS) // Kokkos Design Spirit: From 4f730cd3ac70850d22ed2dc90f9efc111d857976 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Wed, 7 Apr 
2021 16:56:09 -0600 Subject: [PATCH 073/124] Update RAJA with version from develop --- tpl/RAJA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpl/RAJA b/tpl/RAJA index 297843f9d..3047fa720 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 297843f9d358809f5767fb1fa6cd413ca424dfc6 +Subproject commit 3047fa720132d19ee143b1fcdacaa72971f5988c From 7f17ba47a61ac52e2b47a1ddfb01940ce85394bf Mon Sep 17 00:00:00 2001 From: Amy Jo Powell Date: Fri, 9 Apr 2021 13:47:21 -0600 Subject: [PATCH 074/124] cleaning up RPS vs Kokkos handling of HIP backend --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 39c7e707f..789bd7dfb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,7 +83,8 @@ if (ENABLE_CUDA) list(APPEND RAJA_PERFSUITE_DEPENDS cuda) endif() # HIP is used with AMD / VEGA GPU -if (ENABLE_HIP) +# Neatly separate RAJAPerf Suite and Kokkos handling of HIP +if ((ENABLE_HIP) AND (NOT ENABLE_KOKKOS)) list(APPEND RAJA_PERFSUITE_DEPENDS hip) endif() From af54587b393d175a6dc375b6fb225289b23276bc Mon Sep 17 00:00:00 2001 From: Amy Jo Powell Date: Fri, 23 Apr 2021 13:16:15 -0600 Subject: [PATCH 075/124] Changes for HIP config and build --- .gitmodules | 2 +- CMakeLists.txt | 3 ++- tpl/RAJA | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index 9aec11a63..600442a7e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,7 +6,7 @@ url = https://github.com/LLNL/blt.git [submodule "tpl/RAJA"] path = tpl/RAJA - url = https://github.com/LLNL/RAJA.git + url = https://github.com/ajpowelsnl/RAJA.git [submodule "tpl/kokkos"] path = tpl/kokkos url = https://github.com/kokkos/kokkos diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a6ff871b..fc9069999 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,7 @@ endif() # HIP is used with AMD / VEGA GPU # Neatly separate RAJAPerf Suite and Kokkos handling of HIP if ((ENABLE_HIP) AND (NOT 
ENABLE_KOKKOS)) +#if (ENABLE_HIP) list(APPEND RAJA_PERFSUITE_DEPENDS hip) endif() @@ -186,5 +187,5 @@ endif() # # Each directory in the perf suite has its own CMakeLists.txt file. -# +# DZP, AJP, DB, DA fixes add_subdirectory(src) diff --git a/tpl/RAJA b/tpl/RAJA index 3047fa720..a51c43c65 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 3047fa720132d19ee143b1fcdacaa72971f5988c +Subproject commit a51c43c652fac99089157106ce57f9b0bc578858 From 009388cce403ee1435995ab1d4a6d3008af87f6f Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Fri, 23 Apr 2021 14:57:03 -0600 Subject: [PATCH 076/124] making RPS use new infrastructure (Executor.cpp) to handle Kokkos --- src/RAJAPerfSuiteDriver.cpp | 1 + src/common/Executor.cpp | 326 +++++++++++++++++++++++++++-------- src/common/Executor.hpp | 5 + src/common/RAJAPerfSuite.cpp | 1 - 4 files changed, 257 insertions(+), 76 deletions(-) diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index 98f122d4a..42fe557d2 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -17,6 +17,7 @@ int main( int argc, char** argv ) //rajaperf::Executor executor(argc, argv); rajaperf::Executor executor(argc, argv); rajaperf::make_perfsuite_executor(&executor, argc, argv); + //executor.registerKernel //rajaperf::RunParams params(argc, argv); //executor.registerGroup("Sparse"); diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index b25ef6553..71331d04b 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -15,6 +15,7 @@ // Warmup kernel to run first to remove startup overheads in timings #include "basic/DAXPY.hpp" +// Standard library includes #include #include #include @@ -38,63 +39,150 @@ Executor::Executor(int argc, char** argv) { } +/* + * https://www.delftstack.com/howto/cpp/cpp-tilde-operator/ + * + * The destructor is a special member function that handles the deallocation of the class object’s resources. 
+ * As opposed to constructors, a class has only one destructor function. + * The class destructor is declared with the same name as the class, prefixed with the ~ (tilde) character. + *... + * Generally, the class members are destroyed after the destructor function code is run; + * thus, we can demonstrate how the StringArray class instance goes out of scope and hence + * printing to the console the corresponding text. + * + */ + +// Destructor for resource de-allocation Executor::~Executor() { for (size_t ik = 0; ik < kernels.size(); ++ik) { delete kernels[ik]; } + + // Pre-processor directives #if defined(RUN_KOKKOS) - Kokkos::finalize(); // TODO DZP: should this be here? + Kokkos::finalize(); // TODO DZP: should this be here? Good question. AJP #endif } // New functions for Kokkos to register new group and kernel IDs +// The return type is Executor::groupID Executor::groupID Executor::registerGroup(std::string groupName) { - + // std::map::find() returns an iterator to the entry whose key equals its argument, or end() if there is no such key. + // Recall, "kernelsPerGroup" is a mapping of kernel groups (e.g., basic) and their constituent kernels (e.g., DAXPY) auto checkIfGroupExists = kernelsPerGroup.find(groupName); - if (checkIfGroupExists == kernelsPerGroup.end()){ + + + /* Recall, these items are defined in Executor.hpp: + using groupID = int; + using kernelID = int; + using kernelSet = std::set; // data type: set of KernelBase* instances + using kernelMap = std::map; // data type: map of string kernel names to instances of KernelBase* + using groupMap = std::map; // data type: map of groupNames to sets of kernels + ... + // "allKernels" is an instance of kernelMap: it maps each kernel name to its KernelBase* instance + kernelMap allKernels; + + // "kernelsPerGroup" is an instance of groupMap: it maps each group name (e.g., basic, polybench, etc.) to the set of kernels in that group
+ groupMap kernelsPerGroup; + + */ + + /* end() + * Return iterator to end + * Returns an iterator referring to the past-the-end element in the container. + * The past-the-end element is the theoretical element that would follow the last element in the container. + * It does not point to any element, and thus shall not be de-referenced. + * Because the ranges used by functions of the standard library do not include + * the element pointed by their closing iterator, + * this function is often used in combination with begin() to specify a range including all the elements in the container. + * If the container is empty, this function returns the same as begin(). + * + */ + + + // HERE, WE ARE CHECKING THE CASE THAT THE groupName **IS NOT** IN THE MAP OBJECT + // Using the .end() idiom to check if I've fallen off the edge of the container without finding a match + if (checkIfGroupExists == kernelsPerGroup.end()){ + // If groupName not found, set that groupName in kernelsPerGroup to an empty kernelSet obj kernelsPerGroup[groupName] = kernelSet(); } else { - // TODO: ERROR CONDITION + // ERROR CONDITION: DUPLICATING GROUPS + // Report the existing group and exit the program. + + std::cout << "The Group Name " << groupName << " already exists. Program is exiting."
<< std::endl; + + // In kernelsPerGroup, the group name is the key (first) and the set of kernels in that group is the value (second) + auto fullKernelSet = checkIfGroupExists->second; + + // fullKernelSet is of type std::set + + for (auto kernel: fullKernelSet) { + + std::cout << kernel->getName() << std::endl; + + } + + exit(1); } + // getNewGroupID() returns a value of type Executor::groupID, an int return getNewGroupID(); + } -// New function +// New function with return type Executor::kernelID, returning getNewKernelID(); registerKernel is a new function in the Executor class +// + Executor::kernelID Executor::registerKernel(std::string groupName, KernelBase* kernel) { - + // kernelName is the name reported by the kernel (an instance of KernelBase*) via getName() auto kernelName = kernel->getName(); - + // Recall, "allKernels" maps kernel names to their KernelBase* instances auto checkIfKernelExists = allKernels.find(kernelName); + // Check if kernelName IS NOT in the map of all kernels if (checkIfKernelExists == allKernels.end()) { + // if the kernel name IS NOT in the allKernels map, store kernel, the KernelBase* instance, under kernelName allKernels[kernelName] = kernel; } else { - // TODO: ERROR CONDITION - return getNewKernelID(); -} - // Add the kernel to its group + // ERROR CONDITION: if the kernel is found / exists, make the program exit + + std::cout << "Kernel " << checkIfKernelExists->first << " already exists. Program is exiting."
<< std::endl; + + exit(1); + } + ////////////////////////////////////////////////////////////////////////////// + // Potential error condition: the group named for this kernel may not have been registered; look it up before adding the kernel to it + // checkIfGroupExists is an iterator to the groupName entry of the kernelsPerGroup map (or end() if there is none) auto checkIfGroupExists = kernelsPerGroup.find(groupName); + // LOGIC: Check if checkIfGroupExists equals the past-the-end iterator of the map, which + // does not refer to any entry + // i.e., check for the case that the groupName DOES NOT exist with the ".end()" idiom; if (checkIfGroupExists == kernelsPerGroup.end()){ - // If group does not exist, ERROR CONDITION + } else { - // + // If the groupName DOES EXIST, then insert the kernel (instance of KernelBase*) into the kernel set stored as the + // second member of that kernelsPerGroup entry, to associate the kernel and its groupName + checkIfGroupExists -> second.insert(kernel); } -return getNewKernelID(); + + // getNewKernelID() returns a value of type Executor::kernelID + return getNewKernelID(); } +// AJP & DZP new function +// AJP GOAL: return a vector of all KernelBase* objects matching the given kernel or group name std::vector Executor::lookUpKernelByName(std::string kernelOrGroupName){ @@ -106,17 +194,21 @@ std::vector Executor::lookUpKernelByName(std::string kernelOrGroup // 2) LOGIC: // i) check to see if the kernel / group requested on the // "./rajaperf.exe -k" line (you can pass either a specific kernel or a - // kernel group - + // kernel groupName, e.g., "Basic" + + // Declaring the vector kernelsByNameVect of type std::vector; + // This variable will contain the set of kernels to run std::vector kernelsByNameVect ; - // If kernelName is groupName , then add that set of kernels in the + // CONDITIONS TO INCLUDE: + // 1) If kernelName is groupName , then add that set of kernels in the // group to the vector - // else if kernelName is kernel, then add the kernel to the vector - // else if kernelName is horse stuff, then
say so - // - // - // Declare iterator against which you can test equivalence + + // 2) else if kernelName is kernel, then add the kernel to the vector + // 3) else if kernelName is horse stuff, then say so + + // HINT: Declare iterator against which you can test equivalence + auto checkLookUpGroupNameIterator = kernelsPerGroup.find(kernelOrGroupName); auto checkLookUpKernelNameIterator = allKernels.find(kernelOrGroupName); @@ -126,8 +218,9 @@ std::vector Executor::lookUpKernelByName(std::string kernelOrGroup //cout << " STEP 1" << endl; // when using the arrow, you get a key, value pair. - // YOu can access either member by "first" or "second" + // You can access either member by "first" or "second" + // we have std::set of KernelBase* auto groupSetForTests = checkLookUpGroupNameIterator -> second; for (auto item: groupSetForTests) { @@ -137,136 +230,217 @@ std::vector Executor::lookUpKernelByName(std::string kernelOrGroup else if (checkLookUpKernelNameIterator != allKernels.end()) { - auto kernelSetForTests = checkLookUpKernelNameIterator -> second; + auto kernel = checkLookUpKernelNameIterator -> second; - kernelsByNameVect.push_back(kernelSetForTests); + kernelsByNameVect.push_back(kernel); } - else { - - //TODO: ERROR CASE; - exit(1); - - - } - - -return kernelsByNameVect; + // kernelsByNameVect is an object of type std::vector that will be used by + return kernelsByNameVect; } - - - +////////////////////////////////////////////////////////////////////////////////////// +// * AJP TASK: change the setupSuite to use the allKernels (type: kernelMap) and kernelsPerGroup (type: groupMap) +// * maps; +// * The goal here is to make a vector of the different instances of KernelBase*, kernel, that are to be run; +// * The vector you'll need already exists! 
+// * Hint: see line 375-ish for kernels.push_back; +// */ +///////////////////////////////////////////////////////////////////////////////////// void Executor::setupSuite() { + // Initial handling of run parameters input RunParams::InputOpt in_state = run_params.getInputState(); + // QUESTION -- In this first step, are we doing nothing (initially) if we have bad input? + // Should there be an else condition for this conditional? if ( in_state == RunParams::InfoRequest || in_state == RunParams::BadInput ) { return; } cout << "\nSetting up suite based on input..." << endl; + + //////////////////////////////////////////////////////////////////////////////////// + // Declaring function type aliases + using Slist = list; using Svector = vector; + // Set of kernel IDs, e.g., DAXPY, IF_QUAD using KIDset = set; + // "variants" include CUDA, OpenMPTarget, OpenMP, HIP, Serial using VIDset = set; - - // + /////////////////////////////////////////////////////////////////////////////////// // Determine which kernels to execute from input. // run_kern will be non-duplicated ordered set of IDs of kernel to run. 
- // + // kernel_input is a const reference to an Svector; + // kernel_input will contain the input for the kernels to run const Svector& kernel_input = run_params.getKernelInput(); + // Declare run_kern of type KIDset; contains the set of kernel IDs to run KIDset run_kern; + /* LOGIC + 1) check whether each of the inputs matches a groupName; + 2) if a match, add every kernel in that group to the vector that will be run; + 3) if no match, check existing kernels + 4) if a match, add that kernel + 5) if no match, add that name to the list of invalid kernel names + */ + + Svector invalid; + + // The case when the executable is passed no args + if (kernel_input.empty()) { + // the range-for loop yields the map entry itself rather than an iterator, so its members are read with + // '.' instead of the '->' needed for dereferencing + + for (auto iter_input: allKernels) { + kernels.push_back(iter_input.second); + } + } + else { + + for (auto kernelName: kernel_input) { + std::vector matchingKernelsVec = lookUpKernelByName(kernelName); + // the returned vector holds everything that matched; if nothing matched it is an empty vector, + // i.e., the kernel name was invalid + + if (matchingKernelsVec.empty()) { + invalid.push_back(kernelName); + } else { + + for (auto iter_kern: matchingKernelsVec) { + kernels.push_back(iter_kern); + + } + } + } + } + +/* if ( kernel_input.empty() ) { // - // No kernels specified in input, run them all... + // if No kernels specified in input, run them all...
// for (size_t ik = 0; ik < NumKernels; ++ik) { + // here, inserting kernels to run; you must cast ik (of type size_t), the indexing variable, as a KernelID type run_kern.insert( static_cast(ik) ); } } else { - // - // Need to parse input to determine which kernels to run - // - // Make list copy of kernel input to manipulate + // Parse input to determine which kernels to run + // Make list of strings copy of kernel input for the parsing // (need to process potential group names and/or kernel names) + + // Slist is a type alias for list + // Populate list with the kernel_input, from the beginning index to the end Slist input(kernel_input.begin(), kernel_input.end()); - // + // AJP code addition -- print list of inputs + + for (auto idx: input ) + + std::cout << "Input parameters list: " << idx << std:: endl; + // Search input for matching group names. - // groups2run will contain names of groups to run. - // + // groups2run is a vector of strings (of type Svector, a type alias of vector) containing names + // of groups to run if passed in as input. + Svector groups2run; + // Outer loop: Iterate through the list of strings from the first to the last item for (Slist::iterator it = input.begin(); it != input.end(); ++it) { + // inner loop: iterate over NumGroups, a member of GroupID enum defined in RAJAPerfSuite.hpp for (size_t ig = 0; ig < NumGroups; ++ig) { + // declare a constant (immutable) string reference "group_name" + // Store the value at the the ig(th) index as a GroupID in group_name const string& group_name = getGroupName(static_cast(ig)); + // if group_name is equal to the value the it(th)* index points to, + // push_back / append that group_name to groups2run vector of strings if ( group_name == *it ) { groups2run.push_back(group_name); } } } - // - // If group name(s) found in input, assemble kernels in group(s) + // If group name(s) found in input, assemble kernel sets for those group(s); // to run and remove those group name(s) from input list. 
- // + // Here, iterate the groups2run, and store the value at ig(th) index in + // an immutable/constant reference called gname (of type string) for (size_t ig = 0; ig < groups2run.size(); ++ig) { const string& gname(groups2run[ig]); + // NumKernels is always the last member of KernelID, an enum, declared in RAJAPerfSuite.hpp + // Iterate over NumKernels, casting the index ik to a KernelID type, and setting it to kid + // for (size_t ik = 0; ik < NumKernels; ++ik) { KernelID kid = static_cast(ik); + // if the group name DOES occur within the string full kernel name (npos means until the end of the string), + // insert the kid (of KernelID type) into the run_kern (of type KIDset) if ( getFullKernelName(kid).find(gname) != string::npos ) { run_kern.insert(kid); } } - + // remember, gname is a const/immutable string reference containing group names as a string input.remove(gname); } - // + + // Look for matching names of individual kernels in remaining input. - // // Assemble invalid input for warning message. - // - Svector invalid; - + // Declare the vector "invalid" of type Svector (type alias for vector) to hold ... + // Iterate over the input from beginning to the end item; for (Slist::iterator it = input.begin(); it != input.end(); ++it) { + // initialize a boolean, "found_it" to false; + // why do we need this variable? AJP -- ANSWER HERE bool found_it = false; - + // Iterate ik over NumKernels & TRUE; + // Iterate until you hit the end of the list , or until you find what you're looking for. 
for (size_t ik = 0; ik < NumKernels && !found_it; ++ik) { + // cast the ik(th) value to a KernelID, and set equal to kid KernelID kid = static_cast(ik); + // if the kernel name (for a kid, of type KernelID) is equal to the value pointed at at the it(th) index + // OR if the full kernel name (for a kid) is equal to the value pointed at at the it(th) index + // insert that kid into the run_kern (of type KIDset) and set found_it boolean to true if ( getKernelName(kid) == *it || getFullKernelName(kid) == *it ) { run_kern.insert(kid); found_it = true; } } - + // ATTN: found_it depend on whether or not the kernel was found; + // if the kernel was NOT found, we want to push it back to the set of invalid; + // if found_it = false, push back the value pointed at at the it(th) index to the vector of strings, "&invalid," + // which is of type Svector (a type alias) if ( !found_it ) invalid.push_back(*it); } - + // Update the run_params obj with data in the invalid vector reference run_params.setInvalidKernelInput(invalid); } - // // Assemble set of available variants to run // (based on compile-time configuration). - // + // Recall, a variant will be: base_seq, base_CUDA, Raja_lambda, kokkos_lambda, etc. + + // Declare available_var as a VIDset +*/ + + run_params.setInvalidKernelInput(invalid); + VIDset available_var; + // iterate the NumVariants & static_cast value at iv(th) index to VariantID + // if the variant is available, insert vid into the VIDset for (size_t iv = 0; iv < NumVariants; ++iv) { VariantID vid = static_cast(iv); if ( isVariantAvailable( vid ) ) { @@ -299,13 +473,17 @@ void Executor::setupSuite() // // Set reference variant if not specified. - // + // Here, this is where base_seq is set as the default baseline; + // the baseline that is used can be changed! 
+ // e.g., kokkos_lambda + if ( run_params.getReferenceVariant().empty() && !run_var.empty() ) { reference_vid = *run_var.begin(); } } else { + // // Parse input to determine which variants to run: // - variants to run will be the intersection of available variants @@ -347,12 +525,7 @@ void Executor::setupSuite() } - // - // Create kernel objects and variants to execute. If invalid input is not - // empty for either case, then there were unmatched input items. - // - // A message will be emitted later so user can sort it out... - // + if ( !(run_params.getInvalidKernelInput().empty()) ) { @@ -360,15 +533,17 @@ void Executor::setupSuite() } else { // kernel input looks good - for (KIDset::iterator kid = run_kern.begin(); + // Get lists using David and Amy's new maps! + +/* for (KIDset::iterator kid = run_kern.begin(); kid != run_kern.end(); ++kid) { /// RDH DISABLE COUPLE KERNEL until we find a reasonable way to do /// complex numbers in GPU code - if ( /** *kid != Apps_COUPLE */ true ) { + if ( true ) { kernels.push_back( getKernelObject(*kid, run_params) ); } } - +*/ if ( !(run_params.getInvalidVariantInput().empty()) ) { run_params.setInputState(RunParams::BadInput); @@ -408,7 +583,7 @@ void Executor::setupSuite() } // if kernel input looks good } - +//////////////////////////////////////////////////////////////////////////////////// void Executor::reportRunSummary(ostream& str) const { @@ -1100,7 +1275,8 @@ void Executor::getFOMGroups(vector& fom_groups) } #endif } - +// TODO: AJP and DZP talk these functions through; +// is the arrow operator here acting as a pointer object to registerGroup, etc.? 
void free_register_group(Executor* exec, std::string groupName){ exec->registerGroup(groupName); diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 2ae2e5ffc..2ab4ef055 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -12,6 +12,7 @@ #include "common/RAJAPerfSuite.hpp" #include "common/RunParams.hpp" +#include #include #include #include @@ -121,7 +122,11 @@ class Executor VariantID reference_vid; + // "allKernels" is an instance of kernelMap, which is a "map" of all kernels (as strings, e.g., DAXPY, to their + // kernelBase* instances; the string name will be the key (first), and the kernelBase* instance will be the value (second) kernelMap allKernels; + // "kernelsPerGroup" is an instance of "groupMap;" "kernelsPerGroup" maps kernels to their + // categories / parent class (e.g., basic, polybench, etc.) groupMap kernelsPerGroup; diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index fd510e7f4..bc654296a 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -102,7 +102,6 @@ namespace rajaperf { free_register_kernel(exec, "Basic", new basic::ATOMIC_PI(run_params)); free_register_kernel(exec, "Basic", new basic::DAXPY(run_params)); free_register_kernel(exec, "Basic", new basic::IF_QUAD(run_params)); - free_register_kernel(exec, "Basic", new basic::IF_QUAD(run_params)); free_register_kernel(exec, "Basic", new basic::INIT3(run_params)); free_register_kernel(exec, "Basic", new basic::INIT_VIEW1D(run_params)); free_register_kernel(exec, "Basic", new basic::INIT_VIEW1D_OFFSET(run_params)); From 7be49db23101cff5f1f03340273ee31960e81c0a Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 27 Apr 2021 10:58:45 -0600 Subject: [PATCH 077/124] Fixes for OpenMPTarget build --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 344dff0a7..acf08ddc5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ 
-33,7 +33,7 @@ list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) if(ENABLE_TARGET_OPENMP) remove_definitions(-DRUN_RAJA_SEQ -DRUN_OPENMP ) - +include_directories(basic) blt_add_executable( NAME raja-perf-omptarget.exe SOURCES RAJAPerfSuiteDriver.cpp From f238c1c22c1554b441be01eb993a7efebf7d521b Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 25 May 2021 11:05:17 -0600 Subject: [PATCH 078/124] IF_QUAD-Kokkos.cpp: tidying up formatting --- src/basic-kokkos/IF_QUAD-Kokkos.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/basic-kokkos/IF_QUAD-Kokkos.cpp b/src/basic-kokkos/IF_QUAD-Kokkos.cpp index 29b2a2e25..b515d67e5 100644 --- a/src/basic-kokkos/IF_QUAD-Kokkos.cpp +++ b/src/basic-kokkos/IF_QUAD-Kokkos.cpp @@ -31,9 +31,9 @@ void IF_QUAD::runKokkosVariant(VariantID vid) // Instantiating views using getViewFromPointer for the IF_QUAD definition - auto a_view = getViewFromPointer(a, iend); - auto b_view = getViewFromPointer(b, iend); - auto c_view = getViewFromPointer(c, iend); + auto a_view = getViewFromPointer(a, iend); + auto b_view = getViewFromPointer(b, iend); + auto c_view = getViewFromPointer(c, iend); auto x1_view = getViewFromPointer(x1, iend); auto x2_view = getViewFromPointer(x2, iend); @@ -63,21 +63,22 @@ void IF_QUAD::runKokkosVariant(VariantID vid) RAJA::RangeSegment(ibegin, iend), ifquad_lam); */ // Translation - Kokkos::parallel_for("IF_QUAD_Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), - - KOKKOS_LAMBDA (Index_type i) { + Kokkos::parallel_for("IF_QUAD_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA (Index_type i) { Real_type s = b_view[i]*b_view[i] - 4.0*a_view[i]*c_view[i]; if ( s >= 0 ) { s = sqrt(s); x2_view[i] = (-b_view[i]+s)/(2.0*a_view[i]); x1_view[i] = (-b_view[i]-s)/(2.0*a_view[i]); - } else { + } + else { x2_view[i] = 0.0; x1_view[i] = 0.0; - - }}); + } +}); } From c9ab748272e6094c4efd6d26d3fd05a8643c3e13 Mon Sep 17 00:00:00 2001 
From: Amy Powell Date: Tue, 25 May 2021 14:38:51 -0600 Subject: [PATCH 079/124] stream-kokkos/ADD: changes to infrastructure and kernel --- src/CMakeLists.txt | 41 ++++---- src/basic-kokkos/INIT3-Kokkos.cpp | 2 - src/common/KernelBase.cpp | 15 ++- src/common/RAJAPerfSuite.cpp | 23 +++-- src/common/RAJAPerfSuite.hpp | 12 +-- src/stream-kokkos/ADD-Kokkos.cpp | 152 +++++++++++++++++++++++++++++ src/stream-kokkos/CMakeLists.txt | 19 ++++ src/stream-kokkos/COPY-Kokkos.cpp | 90 +++++++++++++++++ src/stream-kokkos/DOT-Kokkos.cpp | 104 ++++++++++++++++++++ src/stream-kokkos/MUL-Kokkos.cpp | 90 +++++++++++++++++ src/stream-kokkos/TRIAD-Kokkos.cpp | 90 +++++++++++++++++ src/stream/ADD.hpp | 1 + 12 files changed, 601 insertions(+), 38 deletions(-) create mode 100644 src/stream-kokkos/ADD-Kokkos.cpp create mode 100644 src/stream-kokkos/CMakeLists.txt create mode 100644 src/stream-kokkos/COPY-Kokkos.cpp create mode 100644 src/stream-kokkos/DOT-Kokkos.cpp create mode 100644 src/stream-kokkos/MUL-Kokkos.cpp create mode 100644 src/stream-kokkos/TRIAD-Kokkos.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index acf08ddc5..a5aebda4e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,7 +15,8 @@ add_subdirectory(basic-kokkos) #add_subdirectory(kokkos-mechanics) #add_subdirectory(lcals) #add_subdirectory(polybench) -#add_subdirectory(stream) +add_subdirectory(stream) +add_subdirectory(stream-kokkos) #add_subdirectory(algorithm) set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS @@ -26,7 +27,8 @@ set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS #kokkos-mechanics #lcals #polybench - #stream + stream + stream-kokkos #algorithm ) list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) @@ -172,21 +174,26 @@ blt_add_executable( #polybench/POLYBENCH_MVT.cpp #polybench/POLYBENCH_MVT-Seq.cpp #polybench/POLYBENCH_MVT-OMPTarget.cpp - #stream/ADD.cpp - #stream/ADD-Seq.cpp - #stream/ADD-OMPTarget.cpp - #stream/COPY.cpp - #stream/COPY-Seq.cpp - #stream/COPY-OMPTarget.cpp - 
#stream/DOT.cpp - #stream/DOT-Seq.cpp - #stream/DOT-OMPTarget.cpp - #stream/MUL.cpp - #stream/MUL-Seq.cpp - #stream/MUL-OMPTarget.cpp - #stream/TRIAD.cpp - #stream/TRIAD-Seq.cpp - #stream/TRIAD-OMPTarget.cpp + stream/ADD.cpp + stream/ADD-Seq.cpp + stream/ADD-OMPTarget.cpp + stream/COPY.cpp + stream/COPY-Seq.cpp + stream/COPY-OMPTarget.cpp + stream/DOT.cpp + stream/DOT-Seq.cpp + stream/DOT-OMPTarget.cpp + stream/MUL.cpp + stream/MUL-Seq.cpp + stream/MUL-OMPTarget.cpp + stream/TRIAD.cpp + stream/TRIAD-Seq.cpp + stream/TRIAD-OMPTarget.cpp + stream-kokkos/ADD-Kokkos.cpp + stream-kokkos/COPY-Kokkos.cpp + stream-kokkos/DOT-Kokkos.cpp + stream-kokkos/MUL-Kokkos.cpp + stream-kokkos/TRIAD-Kokkos.cpp common/DataUtils.cpp common/Executor.cpp common/KernelBase.cpp diff --git a/src/basic-kokkos/INIT3-Kokkos.cpp b/src/basic-kokkos/INIT3-Kokkos.cpp index 08b67eb57..9a13476da 100644 --- a/src/basic-kokkos/INIT3-Kokkos.cpp +++ b/src/basic-kokkos/INIT3-Kokkos.cpp @@ -120,8 +120,6 @@ void INIT3::runKokkosVariant(VariantID vid) moveDataToHostFromKokkosView(in2, in2_view, iend); - - } } // end namespace basic diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 73755eb0d..8cd5e9b30 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -61,8 +61,21 @@ Index_type KernelBase::getRunSize() const return static_cast(default_size*run_params.getSizeFactor()); } +//FIXME Index_type KernelBase::getRunReps() const -{ +{ + // DEBUGGING + std::cout << "AMOS" << std::endl; + + // std::cout << "Get Run Name " << run_params << std::endl; + //std::cout << "Get Variant Name " << run_params.getVariantName << std::endl; + // std::cout << "Get Full Kernel Name " << run_params.getFullKernelName << std::endl; + // std::cout << "Check Run Reps " << run_params.getCheckRunReps << std::endl; + //std::cout << "Get Run Reps " << run_params.getRunReps << std::endl; + //std::cout << "Get Run Rep Factor " << run_params.getRepFactor << std::endl; + // std::cout << "Get Run Size 
" << run_params.getRunSize -> run_params.getRunSize << std::endl; + // std::cout << "Get Run Size Factor " << run_params.getSizeFactor << std::endl; + if (run_params.getInputState() == RunParams::CheckRun) { return static_cast(run_params.getCheckRunReps()); } else { diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index bc654296a..29ec1f45e 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -137,14 +137,14 @@ namespace rajaperf { free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_JACOBI_1D(run_params)); free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_JACOBI_2D(run_params)); free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_MVT(run_params)); - +**/ // Stream free_register_kernel(exec, "Stream", new stream::ADD(run_params)); - free_register_kernel(exec, "Stream", new stream::COPY(run_params)); - free_register_kernel(exec, "Stream", new stream::DOT(run_params)); - free_register_kernel(exec, "Stream", new stream::MUL(run_params)); - free_register_kernel(exec, "Stream", new stream::TRIAD(run_params)); - + //free_register_kernel(exec, "Stream", new stream::COPY(run_params)); + //free_register_kernel(exec, "Stream", new stream::DOT(run_params)); + //free_register_kernel(exec, "Stream", new stream::MUL(run_params)); + //free_register_kernel(exec, "Stream", new stream::TRIAD(run_params)); +/** // Apps free_register_kernel(exec, "Apps", new apps::COUPLE(run_params)); free_register_kernel(exec, "Apps", new apps::DEL_DOT_VEC_2D(run_params)); @@ -252,11 +252,11 @@ namespace rajaperf { //// //// Stream kernels... //// -// std::string("Stream_ADD"), -// std::string("Stream_COPY"), -// std::string("Stream_DOT"), -// std::string("Stream_MUL"), -// std::string("Stream_TRIAD"), + std::string("Stream_ADD"), + std::string("Stream_COPY"), + std::string("Stream_DOT"), + std::string("Stream_MUL"), + std::string("Stream_TRIAD"), // // Apps kernels... 
// @@ -622,7 +622,6 @@ namespace rajaperf { kernel = new stream::TRIAD(run_params); break; } - // // Apps kernels... // diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 9417954fb..d950974f3 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -362,14 +362,14 @@ enum KernelID { // Polybench_JACOBI_2D, // Polybench_MVT, -// + // Stream kernels... // - //Stream_ADD, - //Stream_COPY, - //Stream_DOT, - //Stream_MUL, - //Stream_TRIAD, + Stream_ADD, + Stream_COPY, + Stream_DOT, + Stream_MUL, + Stream_TRIAD, // // Apps kernels... diff --git a/src/stream-kokkos/ADD-Kokkos.cpp b/src/stream-kokkos/ADD-Kokkos.cpp new file mode 100644 index 000000000..d833bacb3 --- /dev/null +++ b/src/stream-kokkos/ADD-Kokkos.cpp @@ -0,0 +1,152 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +// Start Kokkos-ifying here: +// Nota bene: the original RAJAPerf Suite code left for reference + /* +void ADD::runSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); +*/ + + void ADD::runKokkosVariant(VariantID vid) + { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + + ADD_DATA_SETUP; + + // Instiating views using getViewFromPointer + + auto a_view = getViewFromPointer(a, iend); + auto b_view = getViewFromPointer(b, iend); + auto c_view = getViewFromPointer(c, iend); + + + + auto add_lam = [=](Index_type i) { + ADD_BODY; + }; + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + ADD_BODY; + } + + } + stopTimer(); + + break; + } + + + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + add_lam(i); + } + + } + stopTimer(); + + break; + } + +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), add_lam); + + } + stopTimer(); + + break; + } +*/ + +////////////////////////////////////////////////////////////////////////////// +// Kokkos -fying here: +// + + case Kokkos_Lambda : { + + // open Kokkos fence + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { +/* + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), add_lam); +*/ + Kokkos::parallel_for("ADD_Kokkos Kokkos_Lambda", + 
Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i){ + // ADD BODY definition in header: + // c[i] = a[i] + b[i]; + c_view[i] = a_view[i] + b_view[i]; + }); + + } + // close Kokkos fence + Kokkos::fence(); + stopTimer(); + + break; + } + + + default : { + std::cout << "\n ADD : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + + moveDataToHostFromKokkosView(a, a_view, iend); + moveDataToHostFromKokkosView(b, b_view, iend); + moveDataToHostFromKokkosView(c, c_view, iend); + + + +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream-kokkos/CMakeLists.txt b/src/stream-kokkos/CMakeLists.txt new file mode 100644 index 000000000..ffe93cce2 --- /dev/null +++ b/src/stream-kokkos/CMakeLists.txt @@ -0,0 +1,19 @@ +############################################################################### +# Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../stream) + +blt_add_library( + NAME stream-kokkos + SOURCES ADD-Kokkos.cpp + COPY-Kokkos.cpp + DOT-Kokkos.cpp + MUL-Kokkos.cpp + TRIAD-Kokkos.cpp + DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} + ) diff --git a/src/stream-kokkos/COPY-Kokkos.cpp b/src/stream-kokkos/COPY-Kokkos.cpp new file mode 100644 index 000000000..68bc51e4d --- /dev/null +++ b/src/stream-kokkos/COPY-Kokkos.cpp @@ -0,0 +1,90 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "COPY.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void COPY::runSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + COPY_DATA_SETUP; + + auto copy_lam = [=](Index_type i) { + COPY_BODY; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + COPY_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + copy_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), copy_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n COPY : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream-kokkos/DOT-Kokkos.cpp b/src/stream-kokkos/DOT-Kokkos.cpp new file mode 100644 index 000000000..8f6158a87 --- /dev/null +++ b/src/stream-kokkos/DOT-Kokkos.cpp @@ -0,0 +1,104 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DOT.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void DOT::runSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + DOT_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type dot = m_dot_init; + + for (Index_type i = ibegin; i < iend; ++i ) { + DOT_BODY; + } + + m_dot += dot; + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto dot_base_lam = [=](Index_type i) -> Real_type { + return a[i] * b[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type dot = m_dot_init; + + for (Index_type i = ibegin; i < iend; ++i ) { + dot += dot_base_lam(i); + } + + m_dot += dot; + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum dot(m_dot_init); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + DOT_BODY; + }); + + m_dot += static_cast(dot.get()); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n DOT : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream-kokkos/MUL-Kokkos.cpp b/src/stream-kokkos/MUL-Kokkos.cpp new file mode 100644 index 000000000..7b36935e1 --- /dev/null +++ b/src/stream-kokkos/MUL-Kokkos.cpp @@ -0,0 +1,90 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MUL.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void MUL::runSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + MUL_DATA_SETUP; + + auto mul_lam = [=](Index_type i) { + MUL_BODY; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + MUL_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + mul_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), mul_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n MUL : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream-kokkos/TRIAD-Kokkos.cpp b/src/stream-kokkos/TRIAD-Kokkos.cpp new file mode 100644 index 000000000..06885d50c --- /dev/null +++ b/src/stream-kokkos/TRIAD-Kokkos.cpp @@ -0,0 +1,90 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + +void TRIAD::runSeqVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + TRIAD_DATA_SETUP; + + auto triad_lam = [=](Index_type i) { + TRIAD_BODY; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + TRIAD_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + triad_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), triad_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n TRIAD : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace stream +} // end namespace rajaperf diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 542c253b0..e979e45da 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -47,6 +47,7 @@ class ADD : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); From a0873deea9d9cb35bb648c88aadf526ee035ebf8 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 26 May 2021 10:22:57 -0600 Subject: [PATCH 080/124] Fixing run_params handling in RPS --- src/common/Executor.cpp | 1749 +++++++++++++++++----------------- 
src/common/Executor.hpp | 7 + src/common/KernelBase.cpp | 12 +- src/common/RAJAPerfSuite.cpp | 3 +- src/common/RAJAPerfSuite.hpp | 3 +- src/common/RunParams.cpp | 2 + src/stream/ADD.cpp | 2 + 7 files changed, 889 insertions(+), 889 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 71331d04b..558f05512 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -31,13 +31,12 @@ namespace rajaperf { -using namespace std; + using namespace std; -Executor::Executor(int argc, char** argv) - : run_params(argc, argv), - reference_vid(NumVariants) -{ -} + Executor::Executor(int argc, char **argv) + : run_params(argc, argv), + reference_vid(NumVariants) { + } /* * https://www.delftstack.com/howto/cpp/cpp-tilde-operator/ @@ -54,151 +53,145 @@ Executor::Executor(int argc, char** argv) // Destructor for resource de-allocation -Executor::~Executor() -{ - for (size_t ik = 0; ik < kernels.size(); ++ik) { - delete kernels[ik]; - } + Executor::~Executor() { + for (size_t ik = 0; ik < kernels.size(); ++ik) { + delete kernels[ik]; + } - // Pre-processor directives + // Pre-processor directives #if defined(RUN_KOKKOS) - Kokkos::finalize(); // TODO DZP: should this be here? Good question. AJP + Kokkos::finalize(); // TODO DZP: should this be here? Good question. AJP #endif -} + } // New functions for Kokkos to register new group and kernel IDs // The return type is Executor::groupID -Executor::groupID Executor::registerGroup(std::string groupName) -{ - // find() method searches the string for the first occurrence of the sequence specified by its arguments. 
- // Recall, "kernelsPerGroup" is a mapping of kernel groups (e.g., basic) and their constituent kernels (e.g., DAXPY) - auto checkIfGroupExists = kernelsPerGroup.find(groupName); - - - /* Recall, these items are defined in Executor.hpp: - using groupID = int; - using kernelID = int; - using kernelSet = std::set; // data type: set of KernelBase* instances - using kernelMap = std::map; // data type: map of string kernel names to instances of KernelBase* - using groupMap = std::map; // data type: map of groupNames to sets of kernels - ... - // "allKernels" is an instance of kernelMap, which is a "map" of all kernels and their ID's - kernelMap allKernels; - - // "kernelsPerGroup" is an instance of "groupMap;" "kernelsPerGroup" maps kernels to their categories (e.g., basic, polybench, etc.) - groupMap kernelsPerGroup; - - */ - - /* end() - * Return iterator to end - * Returns an iterator referring to the past-the-end element in the vector container. - * The past-the-end element is the theoretical element that would follow the last element in the vector. - * It does not point to any element, and thus shall not be de-referenced. - * Because the ranges used by functions of the standard library do not include - * the element pointed by their closing iterator, - * this function is often used in combination with vector::begin to specify a range including all the elements in the container. - * If the container is empty, this function returns the same as vector::begin. - * - */ + Executor::groupID Executor::registerGroup(std::string groupName) { + // find() method searches the string for the first occurrence of the sequence specified by its arguments. 
+ // Recall, "kernelsPerGroup" is a mapping of kernel groups (e.g., basic) and their constituent kernels (e.g., DAXPY) + auto checkIfGroupExists = kernelsPerGroup.find(groupName); + + + /* Recall, these items are defined in Executor.hpp: + using groupID = int; + using kernelID = int; + using kernelSet = std::set; // data type: set of KernelBase* instances + using kernelMap = std::map; // data type: map of string kernel names to instances of KernelBase* + using groupMap = std::map; // data type: map of groupNames to sets of kernels + ... + // "allKernels" is an instance of kernelMap, which is a "map" of all kernels and their ID's + kernelMap allKernels; + + // "kernelsPerGroup" is an instance of "groupMap;" "kernelsPerGroup" maps kernels to their categories (e.g., basic, polybench, etc.) + groupMap kernelsPerGroup; + + */ + + /* end() + * Return iterator to end + * Returns an iterator referring to the past-the-end element in the vector container. + * The past-the-end element is the theoretical element that would follow the last element in the vector. + * It does not point to any element, and thus shall not be de-referenced. + * Because the ranges used by functions of the standard library do not include + * the element pointed by their closing iterator, + * this function is often used in combination with vector::begin to specify a range including all the elements in the container. + * If the container is empty, this function returns the same as vector::begin. + * + */ - // HERE, WE ARE CHECKING THE CASE THAT THE groupNAME **IS NOT** IN THE MAP OBJECT - // Using the .end() idiom to check if I've fallen off the edge of the container without finding a match - if (checkIfGroupExists == kernelsPerGroup.end()){ - // If groupName not found, set that groupName in kernelsPerGroup to an empty kernelSet obj - kernelsPerGroup[groupName] = kernelSet(); -} - else { - // ERROR CONDITION: DUPLICATING GROUPS - // Error lists exsiting group, and kills program. 
+ // HERE, WE ARE CHECKING THE CASE THAT THE groupNAME **IS NOT** IN THE MAP OBJECT + // Using the .end() idiom to check if I've fallen off the edge of the container without finding a match + if (checkIfGroupExists == kernelsPerGroup.end()) { + // If groupName not found, set that groupName in kernelsPerGroup to an empty kernelSet obj + kernelsPerGroup[groupName] = kernelSet(); + } else { + // ERROR CONDITION: DUPLICATING GROUPS + // Error lists exsiting group, and kills program. - std::cout << "The Group Name " << groupName << " already exists. Program is exiting." << std::endl; + std::cout << "The Group Name " << groupName << " already exists. Program is exiting." << std::endl; - // In kernelsPerGroup, the Group Name is the first position / key value, and the second position / value type in the set - auto fullKernelSet = checkIfGroupExists->second; + // In kernelsPerGroup, the Group Name is the first position / key value, and the second position / value type in the set + auto fullKernelSet = checkIfGroupExists->second; - // fullKernelSet is of type std::set + // fullKernelSet is of type std::set - for (auto kernel: fullKernelSet) { + for (auto kernel: fullKernelSet) { - std::cout << kernel->getName() << std::endl; + std::cout << kernel->getName() << std::endl; - } + } - exit(1); - -} - // getNewGroupID() is an object of type Executor::groupID, an int - return getNewGroupID(); + exit(1); + } + // getNewGroupID() is an object of type Executor::groupID, an int + return getNewGroupID(); -} + + } // New function with return type Executor::kernelID, returning getNewKernelID(); registerKernel is a new function in the Executor class // -Executor::kernelID Executor::registerKernel(std::string groupName, KernelBase* kernel) -{ - // declaring and setting kernelName to de-referenced kernel pointer obj, an instance of KernelBase* - auto kernelName = kernel->getName(); - // Recall, "allKernels" maps named kernels to their IDs - auto checkIfKernelExists = 
allKernels.find(kernelName); - // Check if checkKernelExists value IS NOT in the map of all kernels - if (checkIfKernelExists == allKernels.end()) { - // if the kernel name IS NOT in the allKernels map, set kernelName to kernel, the KernelBase* instance - allKernels[kernelName] = kernel; -} - else { - // ERROR CONDITION: if the kernel is found / exists, make the program exit - - std::cout << "Kernel " << checkIfKernelExists->first << " already exists. Program is exiting." << std::endl; - - exit(1); - } - ////////////////////////////////////////////////////////////////////////////// - // This error condition : adding a groupName before checking if the group associated with the kernel exists - // Declare and set checkIfGroupExists to the value of the string-type groupName in the kernelsPerGroup map - auto checkIfGroupExists = kernelsPerGroup.find(groupName); - // LOGIC: Check if checkIfGroupExists value is the same as the past-the-end element in the vector container, which - // does not have a value - // i.e., check for the case that the groupName DOES NOT exist with the ".end()" idiom; - if (checkIfGroupExists == kernelsPerGroup.end()){ + Executor::kernelID Executor::registerKernel(std::string groupName, KernelBase *kernel) { + // declaring and setting kernelName to de-referenced kernel pointer obj, an instance of KernelBase* + auto kernelName = kernel->getName(); + // Recall, "allKernels" maps named kernels to their IDs + auto checkIfKernelExists = allKernels.find(kernelName); + // Check if checkKernelExists value IS NOT in the map of all kernels + if (checkIfKernelExists == allKernels.end()) { + // if the kernel name IS NOT in the allKernels map, set kernelName to kernel, the KernelBase* instance + allKernels[kernelName] = kernel; + } else { + // ERROR CONDITION: if the kernel is found / exists, make the program exit -} + std::cout << "Kernel " << checkIfKernelExists->first << " already exists. Program is exiting." 
+ << std::endl; -else { - // If the groupName DOES EXIST, then insert the kernel (instance of KernelBase*) at the second position of the - // allKernels map to associate the kernel and its groupNAme + exit(1); + } + ////////////////////////////////////////////////////////////////////////////// + // This error condition : adding a groupName before checking if the group associated with the kernel exists + // Declare and set checkIfGroupExists to the value of the string-type groupName in the kernelsPerGroup map + auto checkIfGroupExists = kernelsPerGroup.find(groupName); + // LOGIC: Check if checkIfGroupExists value is the same as the past-the-end element in the vector container, which + // does not have a value + // i.e., check for the case that the groupName DOES NOT exist with the ".end()" idiom; + if (checkIfGroupExists == kernelsPerGroup.end()) { - checkIfGroupExists -> second.insert(kernel); + } else { + // If the groupName DOES EXIST, then insert the kernel (instance of KernelBase*) at the second position of the + // allKernels map to associate the kernel and its groupNAme + + checkIfGroupExists->second.insert(kernel); -} + } - // getNewKernelID is an obj of type Executor::kernelID - return getNewKernelID(); -} + // getNewKernelID is an obj of type Executor::kernelID + return getNewKernelID(); + } // AJP & DZP new function // AJP GOAL: return a vector of all kernelBase* objects to be run by -std::vector Executor::lookUpKernelByName(std::string kernelOrGroupName){ + std::vector Executor::lookUpKernelByName(std::string kernelOrGroupName) { - // The vector / list return type, std::vector will contain - // either all of the kernels with a given kernel name or group name - // We have two maps (defined in Executor.hpp): kernelMap allKernels, groupMap kernelsPerGroup, - // STEPS: - // 1) declare new vector that will contain the string data: - // 2) LOGIC: - // i) check to see if the kernel / group requested on the - // "./rajaperf.exe -k" line (you can pass either a 
specific kernel or a - // kernel groupName, e.g., "Basic" + // The vector / list return type, std::vector will contain + // either all of the kernels with a given kernel name or group name + // We have two maps (defined in Executor.hpp): kernelMap allKernels, groupMap kernelsPerGroup, + // STEPS: + // 1) declare new vector that will contain the string data: + // 2) LOGIC: + // i) check to see if the kernel / group requested on the + // "./rajaperf.exe -k" line (you can pass either a specific kernel or a + // kernel groupName, e.g., "Basic" - // Declaring the vector kernelsByNameVect of type std::vector; - // This variable will contain the set of kernels to run - std::vector kernelsByNameVect ; + // Declaring the vector kernelsByNameVect of type std::vector; + // This variable will contain the set of kernels to run + std::vector kernelsByNameVect; // CONDITIONS TO INCLUDE: // 1) If kernelName is groupName , then add that set of kernels in the @@ -209,42 +202,45 @@ std::vector Executor::lookUpKernelByName(std::string kernelOrGroup // HINT: Declare iterator against which you can test equivalence - auto checkLookUpGroupNameIterator = kernelsPerGroup.find(kernelOrGroupName); + auto checkLookUpGroupNameIterator = kernelsPerGroup.find(kernelOrGroupName); auto checkLookUpKernelNameIterator = allKernels.find(kernelOrGroupName); // Check to see if groupName NOT in kernelsPerGroup; // end() iterates to the end if (checkLookUpGroupNameIterator != kernelsPerGroup.end()) { - //cout << " STEP 1" << endl; - - // when using the arrow, you get a key, value pair. - // You can access either member by "first" or "second" + //cout << " STEP 1" << endl; - // we have std::set of KernelBase* - auto groupSetForTests = checkLookUpGroupNameIterator -> second; + // when using the arrow, you get a key, value pair. 
+ // You can access either member by "first" or "second" - for (auto item: groupSetForTests) { - kernelsByNameVect.push_back(item); - } - } + // we have std::set of KernelBase* + auto groupSetForTests = checkLookUpGroupNameIterator->second; - else if (checkLookUpKernelNameIterator != allKernels.end()) { + for (auto item: groupSetForTests) { + kernelsByNameVect.push_back(item); + } + } else if (checkLookUpKernelNameIterator != allKernels.end()) { - auto kernel = checkLookUpKernelNameIterator -> second; + auto kernel = checkLookUpKernelNameIterator->second; - kernelsByNameVect.push_back(kernel); + kernelsByNameVect.push_back(kernel); } - // kernelsByNameVect is an object of type std::vector that will be used by - return kernelsByNameVect; - - -} + // kernelsByNameVect is an object of type std::vector that will be used by + return kernelsByNameVect; + + + } + + const RunParams &Executor::getRunParams() { + return run_params; + } + ////////////////////////////////////////////////////////////////////////////////////// // * AJP TASK: change the setupSuite to use the allKernels (type: kernelMap) and kernelsPerGroup (type: groupMap) @@ -254,75 +250,73 @@ std::vector Executor::lookUpKernelByName(std::string kernelOrGroup // * Hint: see line 375-ish for kernels.push_back; // */ ///////////////////////////////////////////////////////////////////////////////////// -void Executor::setupSuite() -{ - // Initial handling of run parameters input - RunParams::InputOpt in_state = run_params.getInputState(); - // QUESTION -- In this first step, are we doing nothing (initially) if we have bad input? - // Should there be an else condition for this conditional? - if ( in_state == RunParams::InfoRequest || in_state == RunParams::BadInput ) { - return; - } - - cout << "\nSetting up suite based on input..." 
<< endl; - - - //////////////////////////////////////////////////////////////////////////////////// - // Declaring function type aliases - - using Slist = list; - using Svector = vector; - // Set of kernel IDs, e.g., DAXPY, IF_QUAD - using KIDset = set; - // "variants" include CUDA, OpenMPTarget, OpenMP, HIP, Serial - using VIDset = set; - /////////////////////////////////////////////////////////////////////////////////// - // Determine which kernels to execute from input. - // run_kern will be non-duplicated ordered set of IDs of kernel to run. - // kernel_input is an object of type reference to Svector; - // kernel_input will contain the input for the kernels to run - const Svector& kernel_input = run_params.getKernelInput(); - - // Declare run_kern of type KIDset; contains the set of kernels (KernelBase* instances to run) - KIDset run_kern; - - /* LOGIC - 1) check if each of the inputs in matches a groupName; - 2) if a match, add every kernel in that group to the vector that will be run; - 3) if no match, check existing kernels - 4) if a match, add that kernel - 5) if no match, add that kernel to set the set of invalid kernels - */ - - Svector invalid; - - // The case when the executable is passed no args - if (kernel_input.empty()) { - // your iterator does the deferencing for you, thus you don't need the input arrow, which is - // necessary for dereferencing - - for (auto iter_input: allKernels) { - kernels.push_back(iter_input.second); + void Executor::setupSuite() { + // Initial handling of run parameters input + RunParams::InputOpt in_state = run_params.getInputState(); + // QUESTION -- In this first step, are we doing nothing (initially) if we have bad input? + // Should there be an else condition for this conditional? 
+ if (in_state == RunParams::InfoRequest || in_state == RunParams::BadInput) { + return; } - } - else { - for (auto kernelName: kernel_input) { - std::vector matchingKernelsVec = lookUpKernelByName(kernelName); - // if everything that matched is in the vector, and nothing matched, i.e., an empty vector, - // i.e., the kernel name was invalid + cout << "\nSetting up suite based on input..." << endl; + + + //////////////////////////////////////////////////////////////////////////////////// + // Declaring function type aliases + + using Slist = list; + using Svector = vector; + // Set of kernel IDs, e.g., DAXPY, IF_QUAD + using KIDset = set; + // "variants" include CUDA, OpenMPTarget, OpenMP, HIP, Serial + using VIDset = set; + /////////////////////////////////////////////////////////////////////////////////// + // Determine which kernels to execute from input. + // run_kern will be non-duplicated ordered set of IDs of kernel to run. + // kernel_input is an object of type reference to Svector; + // kernel_input will contain the input for the kernels to run + const Svector &kernel_input = run_params.getKernelInput(); + + // Declare run_kern of type KIDset; contains the set of kernels (KernelBase* instances to run) + KIDset run_kern; + + /* LOGIC + 1) check if each of the inputs in matches a groupName; + 2) if a match, add every kernel in that group to the vector that will be run; + 3) if no match, check existing kernels + 4) if a match, add that kernel + 5) if no match, add that kernel to set the set of invalid kernels + */ + + Svector invalid; + + // The case when the executable is passed no args + if (kernel_input.empty()) { + // your iterator does the deferencing for you, thus you don't need the input arrow, which is + // necessary for dereferencing + + for (auto iter_input: allKernels) { + kernels.push_back(iter_input.second); + } + } else { - if (matchingKernelsVec.empty()) { - invalid.push_back(kernelName); - } else { + for (auto kernelName: kernel_input) { + 
std::vector matchingKernelsVec = lookUpKernelByName(kernelName); + // if everything that matched is in the vector, and nothing matched, i.e., an empty vector, + // i.e., the kernel name was invalid - for (auto iter_kern: matchingKernelsVec) { - kernels.push_back(iter_kern); + if (matchingKernelsVec.empty()) { + invalid.push_back(kernelName); + } else { + for (auto iter_kern: matchingKernelsVec) { + kernels.push_back(iter_kern); + + } } } } - } /* if ( kernel_input.empty() ) { @@ -436,104 +430,103 @@ void Executor::setupSuite() // Declare available_var as a VIDset */ - run_params.setInvalidKernelInput(invalid); + run_params.setInvalidKernelInput(invalid); - VIDset available_var; - // iterate the NumVariants & static_cast value at iv(th) index to VariantID - // if the variant is available, insert vid into the VIDset - for (size_t iv = 0; iv < NumVariants; ++iv) { - VariantID vid = static_cast(iv); - if ( isVariantAvailable( vid ) ) { - available_var.insert( vid ); - } - } + VIDset available_var; + // iterate the NumVariants & static_cast value at iv(th) index to VariantID + // if the variant is available, insert vid into the VIDset + for (size_t iv = 0; iv < NumVariants; ++iv) { + VariantID vid = static_cast(iv); + if (isVariantAvailable(vid)) { + available_var.insert(vid); + } + } - // - // Determine variants to execute from input. - // run_var will be non-duplicated ordered set of IDs of variants to run. - // - const Svector& variant_input = run_params.getVariantInput(); + // + // Determine variants to execute from input. + // run_var will be non-duplicated ordered set of IDs of variants to run. + // + const Svector &variant_input = run_params.getVariantInput(); - VIDset run_var; + VIDset run_var; - if ( variant_input.empty() ) { + if (variant_input.empty()) { - // - // No variants specified in input options, run all available. - // Also, set reference variant if specified. 
- // - for (VIDset::iterator vid_it = available_var.begin(); - vid_it != available_var.end(); ++vid_it) { - VariantID vid = *vid_it; - run_var.insert( vid ); - if ( getVariantName(vid) == run_params.getReferenceVariant() ) { - reference_vid = vid; - } - } + // + // No variants specified in input options, run all available. + // Also, set reference variant if specified. + // + for (VIDset::iterator vid_it = available_var.begin(); + vid_it != available_var.end(); ++vid_it) { + VariantID vid = *vid_it; + run_var.insert(vid); + if (getVariantName(vid) == run_params.getReferenceVariant()) { + reference_vid = vid; + } + } - // - // Set reference variant if not specified. - // Here, this is where base_seq is set as the default baseline; - // the baseline that is used can be changed! - // e.g., kokkos_lambda + // + // Set reference variant if not specified. + // Here, this is where base_seq is set as the default baseline; + // the baseline that is used can be changed! + // e.g., kokkos_lambda - if ( run_params.getReferenceVariant().empty() && !run_var.empty() ) { - reference_vid = *run_var.begin(); - } + if (run_params.getReferenceVariant().empty() && !run_var.empty()) { + reference_vid = *run_var.begin(); + } - } else { + } else { - // - // Parse input to determine which variants to run: - // - variants to run will be the intersection of available variants - // and those specified in input - // - reference variant will be set to specified input if available - // and variant will be run; else first variant that will be run. - // - // Assemble invalid input for warning message. - // - - Svector invalid; + // + // Parse input to determine which variants to run: + // - variants to run will be the intersection of available variants + // and those specified in input + // - reference variant will be set to specified input if available + // and variant will be run; else first variant that will be run. + // + // Assemble invalid input for warning message. 
+ // - for (size_t it = 0; it < variant_input.size(); ++it) { - bool found_it = false; + Svector invalid; - for (VIDset::iterator vid_it = available_var.begin(); - vid_it != available_var.end(); ++vid_it) { - VariantID vid = *vid_it; - if ( getVariantName(vid) == variant_input[it] ) { - run_var.insert(vid); - if ( getVariantName(vid) == run_params.getReferenceVariant() ) { - reference_vid = vid; - } - found_it = true; - } - } + for (size_t it = 0; it < variant_input.size(); ++it) { + bool found_it = false; - if ( !found_it ) invalid.push_back(variant_input[it]); - } + for (VIDset::iterator vid_it = available_var.begin(); + vid_it != available_var.end(); ++vid_it) { + VariantID vid = *vid_it; + if (getVariantName(vid) == variant_input[it]) { + run_var.insert(vid); + if (getVariantName(vid) == run_params.getReferenceVariant()) { + reference_vid = vid; + } + found_it = true; + } + } - // - // Set reference variant if not specified. - // - if ( run_params.getReferenceVariant().empty() && !run_var.empty() ) { - reference_vid = *run_var.begin(); - } + if (!found_it) invalid.push_back(variant_input[it]); + } - run_params.setInvalidVariantInput(invalid); + // + // Set reference variant if not specified. + // + if (run_params.getReferenceVariant().empty() && !run_var.empty()) { + reference_vid = *run_var.begin(); + } - } + run_params.setInvalidVariantInput(invalid); + } - if ( !(run_params.getInvalidKernelInput().empty()) ) { + if (!(run_params.getInvalidKernelInput().empty())) { - run_params.setInputState(RunParams::BadInput); + run_params.setInputState(RunParams::BadInput); - } else { // kernel input looks good + } else { // kernel input looks good - // Get lists using David and Amy's new maps! + // Get lists using David and Amy's new maps! 
/* for (KIDset::iterator kid = run_kern.begin(); kid != run_kern.end(); ++kid) { @@ -544,744 +537,748 @@ void Executor::setupSuite() } } */ - if ( !(run_params.getInvalidVariantInput().empty()) ) { + if (!(run_params.getInvalidVariantInput().empty())) { - run_params.setInputState(RunParams::BadInput); + run_params.setInputState(RunParams::BadInput); - } else { // variant input lools good + } else { // variant input lools good - for (VIDset::iterator vid = run_var.begin(); - vid != run_var.end(); ++vid) { - variant_ids.push_back( *vid ); - } + for (VIDset::iterator vid = run_var.begin(); + vid != run_var.end(); ++vid) { + variant_ids.push_back(*vid); + } - // - // If we've gotten to this point, we have good input to run. - // - if ( run_params.getInputState() != RunParams::DryRun && - run_params.getInputState() != RunParams::CheckRun ) { - run_params.setInputState(RunParams::PerfRun); - } + // + // If we've gotten to this point, we have good input to run. + // + if (run_params.getInputState() != RunParams::DryRun && + run_params.getInputState() != RunParams::CheckRun) { + run_params.setInputState(RunParams::PerfRun); + } - } // kernel and variant input both look good + } // kernel and variant input both look good #if defined(RUN_KOKKOS) - Kokkos::initialize(); - /** - * DZP: This is a terrible hack to just get the push/pop region - * callbacks without the begin_parallel_x/end_parallel_x ones, - * so we don't overfence and perturb performance - */ - auto events = Kokkos::Tools::Experimental::get_callbacks(); - auto push = events.push_region; - auto pop = events.pop_region; - auto metadata = events.declare_metadata; - Kokkos::Tools::Experimental::pause_tools(); - Kokkos::Tools::Experimental::set_push_region_callback(push); - Kokkos::Tools::Experimental::set_pop_region_callback(pop); - Kokkos::Tools::Experimental::set_declare_metadata_callback(metadata); + Kokkos::initialize(); + /** + * DZP: This is a terrible hack to just get the push/pop region + * callbacks without 
the begin_parallel_x/end_parallel_x ones, + * so we don't overfence and perturb performance + */ + auto events = Kokkos::Tools::Experimental::get_callbacks(); + auto push = events.push_region; + auto pop = events.pop_region; + auto metadata = events.declare_metadata; + Kokkos::Tools::Experimental::pause_tools(); + Kokkos::Tools::Experimental::set_push_region_callback(push); + Kokkos::Tools::Experimental::set_pop_region_callback(pop); + Kokkos::Tools::Experimental::set_declare_metadata_callback(metadata); #endif - } // if kernel input looks good + } // if kernel input looks good -} + } //////////////////////////////////////////////////////////////////////////////////// -void Executor::reportRunSummary(ostream& str) const -{ - RunParams::InputOpt in_state = run_params.getInputState(); + void Executor::reportRunSummary(ostream &str) const { + RunParams::InputOpt in_state = run_params.getInputState(); - if ( in_state == RunParams::BadInput ) { + if (in_state == RunParams::BadInput) { - str << "\nRunParams state:\n"; - str << "----------------"; - run_params.print(str); + str << "\nRunParams state:\n"; + str << "----------------"; + run_params.print(str); - str << "\n\nSuite will not be run now due to bad input." - << "\n See run parameters or option messages above.\n" - << endl; + str << "\n\nSuite will not be run now due to bad input." + << "\n See run parameters or option messages above.\n" + << endl; - } else if ( in_state == RunParams::PerfRun || - in_state == RunParams::DryRun || - in_state == RunParams::CheckRun ) { + } else if (in_state == RunParams::PerfRun || + in_state == RunParams::DryRun || + in_state == RunParams::CheckRun) { - if ( in_state == RunParams::DryRun ) { + if (in_state == RunParams::DryRun) { - str << "\n\nRAJA performance suite dry run summary...." - << "\n--------------------------------------" << endl; - - str << "\nInput state:"; - str << "\n------------"; - run_params.print(str); + str << "\n\nRAJA performance suite dry run summary...." 
+ << "\n--------------------------------------" << endl; - } + str << "\nInput state:"; + str << "\n------------"; + run_params.print(str); - if ( in_state == RunParams::PerfRun || - in_state == RunParams::CheckRun ) { + } - str << "\n\nRAJA performance suite run summary...." - << "\n--------------------------------------" << endl; + if (in_state == RunParams::PerfRun || + in_state == RunParams::CheckRun) { - } + str << "\n\nRAJA performance suite run summary...." + << "\n--------------------------------------" << endl; - string ofiles; - if ( !run_params.getOutputDirName().empty() ) { - ofiles = run_params.getOutputDirName(); - } else { - ofiles = string("."); - } - ofiles += string("/") + run_params.getOutputFilePrefix() + - string("*"); + } - str << "\nHow suite will be run:" << endl; - str << "\t # passes = " << run_params.getNumPasses() << endl; - str << "\t Kernel size factor = " << run_params.getSizeFactor() << endl; - str << "\t Kernel rep factor = " << run_params.getRepFactor() << endl; - str << "\t Output files will be named " << ofiles << endl; + string ofiles; + if (!run_params.getOutputDirName().empty()) { + ofiles = run_params.getOutputDirName(); + } else { + ofiles = string("."); + } + ofiles += string("/") + run_params.getOutputFilePrefix() + + string("*"); + + str << "\nHow suite will be run:" << endl; + str << "\t # passes = " << run_params.getNumPasses() << endl; + str << "\t Kernel size factor = " << run_params.getSizeFactor() << endl; + str << "\t Kernel rep factor = " << run_params.getRepFactor() << endl; + str << "\t Output files will be named " << ofiles << endl; #if defined(RUN_KOKKOS) - Kokkos::Tools::declareMetadata("replication_factor",std::to_string(run_params.getRepFactor())); - Kokkos::Tools::declareMetadata("size_factor",std::to_string(run_params.getSizeFactor())); + Kokkos::Tools::declareMetadata("replication_factor", std::to_string(run_params.getRepFactor())); + Kokkos::Tools::declareMetadata("size_factor", 
std::to_string(run_params.getSizeFactor())); #endif - str << "\nThe following kernels and variants (when available) will be run:\n"; + str << "\nThe following kernels and variants (when available) will be run:\n"; - str << "\nVariants" - << "\n--------\n"; - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - str << getVariantName(variant_ids[iv]) << endl; - } + str << "\nVariants" + << "\n--------\n"; + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + str << getVariantName(variant_ids[iv]) << endl; + } + + str << "\nKernels(iterations/rep , reps)" + << "\n-----------------------------\n"; + for (size_t ik = 0; ik < kernels.size(); ++ik) { + KernelBase *kern = kernels[ik]; + str << kern->getName() + << " (" << kern->getItsPerRep() << " , " + << kern->getRunReps() << ")" << endl; + } - str << "\nKernels(iterations/rep , reps)" - << "\n-----------------------------\n"; - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase* kern = kernels[ik]; - str << kern->getName() - << " (" << kern->getItsPerRep() << " , " - << kern->getRunReps() << ")" << endl; + } + + str.flush(); } - } + void Executor::runSuite() { + RunParams::InputOpt in_state = run_params.getInputState(); + if (in_state != RunParams::PerfRun && + in_state != RunParams::CheckRun) { + return; + } - str.flush(); -} + cout << "\n\nRun warmup kernel...\n"; -void Executor::runSuite() -{ - RunParams::InputOpt in_state = run_params.getInputState(); - if ( in_state != RunParams::PerfRun && - in_state != RunParams::CheckRun ) { - return; - } + KernelBase *warmup_kernel = new basic::DAXPY(run_params); - cout << "\n\nRun warmup kernel...\n"; + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + VariantID vid = variant_ids[iv]; + if (run_params.showProgress()) { + if (warmup_kernel->hasVariantToRun(vid)) { + cout << " Running "; + } else { + cout << " No "; + } + cout << getVariantName(vid) << " variant" << endl; + } + if (warmup_kernel->hasVariantToRun(vid)) { + warmup_kernel->execute(vid); + } + } 
- KernelBase* warmup_kernel = new basic::DAXPY(run_params); + delete warmup_kernel; - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - if ( run_params.showProgress() ) { - if ( warmup_kernel->hasVariantToRun(vid) ) { - cout << " Running "; - } else { - cout << " No "; - } - cout << getVariantName(vid) << " variant" << endl; - } - if ( warmup_kernel->hasVariantToRun(vid) ) { - warmup_kernel->execute(vid); - } - } - delete warmup_kernel; + cout << "\n\nRunning specified kernels and variants...\n"; + const int npasses = run_params.getNumPasses(); + for (int ip = 0; ip < npasses; ++ip) { + if (run_params.showProgress()) { + std::cout << "\nPass through suite # " << ip << "\n"; + } + + for (size_t ik = 0; ik < kernels.size(); ++ik) { + KernelBase *kernel = kernels[ik]; + if (run_params.showProgress()) { + std::cout << "\nRun kernel -- " << kernel->getName() << "\n"; + } + + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + VariantID vid = variant_ids[iv]; + KernelBase *kern = kernels[ik]; + if (run_params.showProgress()) { + if (kern->hasVariantToRun(vid)) { + cout << " Running "; + } else { + cout << " No "; + } + cout << getVariantName(vid) << " variant" << endl; + } + if (kern->hasVariantToRun(vid)) { + kernels[ik]->execute(vid); + } + } // loop over variants - cout << "\n\nRunning specified kernels and variants...\n"; + } // loop over kernels + + } // loop over passes through suite - const int npasses = run_params.getNumPasses(); - for (int ip = 0; ip < npasses; ++ip) { - if ( run_params.showProgress() ) { - std::cout << "\nPass through suite # " << ip << "\n"; } - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase* kernel = kernels[ik]; - if ( run_params.showProgress() ) { - std::cout << "\nRun kernel -- " << kernel->getName() << "\n"; - } + void Executor::outputRunData() { + RunParams::InputOpt in_state = run_params.getInputState(); + if (in_state != RunParams::PerfRun && + in_state != RunParams::CheckRun) { 
+ return; + } - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - KernelBase* kern = kernels[ik]; - if ( run_params.showProgress() ) { - if ( kern->hasVariantToRun(vid) ) { - cout << " Running "; - } else { - cout << " No "; - } - cout << getVariantName(vid) << " variant" << endl; - } - if ( kern->hasVariantToRun(vid) ) { - kernels[ik]->execute(vid); - } - } // loop over variants - - } // loop over kernels - - } // loop over passes through suite - -} - -void Executor::outputRunData() -{ - RunParams::InputOpt in_state = run_params.getInputState(); - if ( in_state != RunParams::PerfRun && - in_state != RunParams::CheckRun ) { - return; - } + cout << "\n\nGenerate run report files...\n"; - cout << "\n\nGenerate run report files...\n"; + // + // Generate output file prefix (including directory path). + // + string out_fprefix; + string outdir = recursiveMkdir(run_params.getOutputDirName()); + if (!outdir.empty()) { + chdir(outdir.c_str()); + } + out_fprefix = "./" + run_params.getOutputFilePrefix(); - // - // Generate output file prefix (including directory path). 
- // - string out_fprefix; - string outdir = recursiveMkdir(run_params.getOutputDirName()); - if ( !outdir.empty() ) { - chdir(outdir.c_str()); - } - out_fprefix = "./" + run_params.getOutputFilePrefix(); + string filename = out_fprefix + "-timing.csv"; + writeCSVReport(filename, CSVRepMode::Timing, 6 /* prec */); - string filename = out_fprefix + "-timing.csv"; - writeCSVReport(filename, CSVRepMode::Timing, 6 /* prec */); + if (haveReferenceVariant()) { + filename = out_fprefix + "-speedup.csv"; + writeCSVReport(filename, CSVRepMode::Speedup, 3 /* prec */); + } - if ( haveReferenceVariant() ) { - filename = out_fprefix + "-speedup.csv"; - writeCSVReport(filename, CSVRepMode::Speedup, 3 /* prec */); - } + filename = out_fprefix + "-checksum.txt"; + writeChecksumReport(filename); - filename = out_fprefix + "-checksum.txt"; - writeChecksumReport(filename); + filename = out_fprefix + "-fom.csv"; + writeFOMReport(filename); + } - filename = out_fprefix + "-fom.csv"; - writeFOMReport(filename); -} + void Executor::writeCSVReport(const string &filename, CSVRepMode mode, + size_t prec) { + ofstream file(filename.c_str(), ios::out | ios::trunc); + if (!file) { + cout << " ERROR: Can't open output file " << filename << endl; + } -void Executor::writeCSVReport(const string& filename, CSVRepMode mode, - size_t prec) -{ - ofstream file(filename.c_str(), ios::out | ios::trunc); - if ( !file ) { - cout << " ERROR: Can't open output file " << filename << endl; - } + if (file) { - if ( file ) { + // + // Set basic table formatting parameters. + // + const string kernel_col_name("Kernel "); + const string sepchr(" , "); - // - // Set basic table formatting parameters. 
- // - const string kernel_col_name("Kernel "); - const string sepchr(" , "); + size_t kercol_width = kernel_col_name.size(); + for (size_t ik = 0; ik < kernels.size(); ++ik) { + kercol_width = max(kercol_width, kernels[ik]->getName().size()); + } + kercol_width++; - size_t kercol_width = kernel_col_name.size(); - for (size_t ik = 0; ik < kernels.size(); ++ik) { - kercol_width = max(kercol_width, kernels[ik]->getName().size()); - } - kercol_width++; + vector varcol_width(variant_ids.size()); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + varcol_width[iv] = max(prec + 2, getVariantName(variant_ids[iv]).size()); + } - vector varcol_width(variant_ids.size()); - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - varcol_width[iv] = max(prec+2, getVariantName(variant_ids[iv]).size()); - } + // + // Print title line. + // + file << getReportTitle(mode); - // - // Print title line. - // - file << getReportTitle(mode); + // + // Wrtie CSV file contents for report. + // - // - // Wrtie CSV file contents for report. - // + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + file << sepchr; + } + file << endl; - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - file << sepchr; - } - file << endl; + // + // Print column title line. + // + file << left << setw(kercol_width) << kernel_col_name; + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + file << sepchr << left << setw(varcol_width[iv]) + << getVariantName(variant_ids[iv]); + } + file << endl; - // - // Print column title line. 
- // - file <getName(); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + VariantID vid = variant_ids[iv]; + file << sepchr << right << setw(varcol_width[iv]); + if ((mode == CSVRepMode::Speedup) && + (!kern->hasVariantToRun(reference_vid) || + !kern->hasVariantToRun(vid))) { + file << "Not run"; + } else if ((mode == CSVRepMode::Timing) && + !kern->hasVariantToRun(vid)) { + file << "Not run"; + } else { + file << setprecision(prec) << std::fixed + << getReportDataEntry(mode, kern, vid); + } + } + file << endl; + } - // - // Print row of data for variants of each kernel. - // - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase* kern = kernels[ik]; - file <getName(); - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - file << sepchr <hasVariantToRun(reference_vid) || - !kern->hasVariantToRun(vid)) ) { - file << "Not run"; - } else if ( (mode == CSVRepMode::Timing) && - !kern->hasVariantToRun(vid) ) { - file << "Not run"; - } else { - file << setprecision(prec) << std::fixed - << getReportDataEntry(mode, kern, vid); - } - } - file << endl; + file.flush(); + + } // note file will be closed when file stream goes out of scope } - file.flush(); - } // note file will be closed when file stream goes out of scope -} + void Executor::writeFOMReport(const string &filename) { + vector fom_groups; + getFOMGroups(fom_groups); + if (fom_groups.empty()) { + return; + } + ofstream file(filename.c_str(), ios::out | ios::trunc); + if (!file) { + cout << " ERROR: Can't open output file " << filename << endl; + } -void Executor::writeFOMReport(const string& filename) -{ - vector fom_groups; - getFOMGroups(fom_groups); - if (fom_groups.empty() ) { - return; - } + if (file) { - ofstream file(filename.c_str(), ios::out | ios::trunc); - if ( !file ) { - cout << " ERROR: Can't open output file " << filename << endl; - } + // + // Set basic table formatting parameters. 
+ // + const string kernel_col_name("Kernel "); + const string sepchr(" , "); + size_t prec = 2; - if ( file ) { + size_t kercol_width = kernel_col_name.size(); + for (size_t ik = 0; ik < kernels.size(); ++ik) { + kercol_width = max(kercol_width, kernels[ik]->getName().size()); + } + kercol_width++; - // - // Set basic table formatting parameters. - // - const string kernel_col_name("Kernel "); - const string sepchr(" , "); - size_t prec = 2; + size_t fom_col_width = prec + 14; - size_t kercol_width = kernel_col_name.size(); - for (size_t ik = 0; ik < kernels.size(); ++ik) { - kercol_width = max(kercol_width, kernels[ik]->getName().size()); - } - kercol_width++; + size_t ncols = 0; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + const FOMGroup &group = fom_groups[ifg]; + ncols += group.variants.size(); // num variants to compare + // to each PM baseline + } - size_t fom_col_width = prec+14; + vector col_exec_count(ncols, 0); + vector col_min(ncols, numeric_limits::max()); + vector col_max(ncols, -numeric_limits::max()); + vector col_avg(ncols, 0.0); + vector col_stddev(ncols, 0.0); + vector > pct_diff(kernels.size()); + for (size_t ik = 0; ik < kernels.size(); ++ik) { + pct_diff[ik] = vector(ncols, 0.0); + } - size_t ncols = 0; - for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { - const FOMGroup& group = fom_groups[ifg]; - ncols += group.variants.size(); // num variants to compare - // to each PM baseline - } + // + // Print title line. + // + file + << "FOM Report : signed speedup(-)/slowdown(+) for each PM (base vs. 
RAJA) -> (T_RAJA - T_base) / T_base )"; + for (size_t iv = 0; iv < ncols * 2; ++iv) { + file << sepchr; + } + file << endl; - vector col_exec_count(ncols, 0); - vector col_min(ncols, numeric_limits::max()); - vector col_max(ncols, -numeric_limits::max()); - vector col_avg(ncols, 0.0); - vector col_stddev(ncols, 0.0); - vector< vector > pct_diff(kernels.size()); - for (size_t ik = 0; ik < kernels.size(); ++ik) { - pct_diff[ik] = vector(ncols, 0.0); - } + file << "'OVER_TOL' in column to right if RAJA speedup is over tolerance"; + for (size_t iv = 0; iv < ncols * 2; ++iv) { + file << sepchr; + } + file << endl; - // - // Print title line. - // - file << "FOM Report : signed speedup(-)/slowdown(+) for each PM (base vs. RAJA) -> (T_RAJA - T_base) / T_base )"; - for (size_t iv = 0; iv < ncols*2; ++iv) { - file << sepchr; - } - file << endl; + string pass(", "); + string fail(",OVER_TOL"); - file << "'OVER_TOL' in column to right if RAJA speedup is over tolerance"; - for (size_t iv = 0; iv < ncols*2; ++iv) { - file << sepchr; - } - file << endl; + // + // Print column title line. + // + file << left << setw(kercol_width) << kernel_col_name; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + const FOMGroup &group = fom_groups[ifg]; + for (size_t gv = 0; gv < group.variants.size(); ++gv) { + string name = getVariantName(group.variants[gv]); + file << sepchr << left << setw(fom_col_width) << name << pass; + } + } + file << endl; - string pass(", "); - string fail(",OVER_TOL"); - // - // Print column title line. - // - file <getName(); - // - // Print row of FOM data for each kernel. 
- // - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase* kern = kernels[ik]; - - file <getName(); - - int col = 0; - for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { - const FOMGroup& group = fom_groups[ifg]; - - VariantID base_vid = group.base; - - for (size_t gv = 0; gv < group.variants.size(); ++gv) { - VariantID comp_vid = group.variants[gv]; - - // - // If kernel variant was run, generate data for it and - // print (signed) percentage difference from baseline. - // - if ( kern->wasVariantRun(comp_vid) ) { - col_exec_count[col]++; - - pct_diff[ik][col] = - (kern->getTotTime(comp_vid) - kern->getTotTime(base_vid)) / - kern->getTotTime(base_vid); - - string pfstring(pass); - if (pct_diff[ik][col] > run_params.getPFTolerance()) { - pfstring = fail; - } + int col = 0; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + const FOMGroup &group = fom_groups[ifg]; - file << sepchr << setw(fom_col_width) << setprecision(prec) - <wasVariantRun(comp_vid)) { + col_exec_count[col]++; - file << sepchr <getTotTime(comp_vid) - kern->getTotTime(base_vid)) / + kern->getTotTime(base_vid); - } + string pfstring(pass); + if (pct_diff[ik][col] > run_params.getPFTolerance()) { + pfstring = fail; + } - col++; + file << sepchr << setw(fom_col_width) << setprecision(prec) + << left << pct_diff[ik][col] << right << pfstring; - } // loop over group variants + // + // Gather data for column summaries (unsigned). + // + col_min[col] = min(col_min[col], pct_diff[ik][col]); + col_max[col] = max(col_max[col], pct_diff[ik][col]); + col_avg[col] += pct_diff[ik][col]; - } // loop over fom_groups (i.e., columns) + } else { // variant was not run, print a big fat goose egg... - file << endl; + file << sepchr << left << setw(fom_col_width) << setprecision(prec) + << 0.0 << pass; - } // loop over kernels + } + col++; - // - // Compute column summary data. - // + } // loop over group variants - // Column average... 
- for (size_t col = 0; col < ncols; ++col) { - if ( col_exec_count[col] > 0 ) { - col_avg[col] /= col_exec_count[col]; - } else { - col_avg[col] = 0.0; - } - } + } // loop over fom_groups (i.e., columns) - // Column standard deviaation... - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase* kern = kernels[ik]; + file << endl; - int col = 0; - for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { - const FOMGroup& group = fom_groups[ifg]; + } // loop over kernels - for (size_t gv = 0; gv < group.variants.size(); ++gv) { - VariantID comp_vid = group.variants[gv]; - if ( kern->wasVariantRun(comp_vid) ) { - col_stddev[col] += ( pct_diff[ik][col] - col_avg[col] ) * - ( pct_diff[ik][col] - col_avg[col] ); - } + // + // Compute column summary data. + // - col++; + // Column average... + for (size_t col = 0; col < ncols; ++col) { + if (col_exec_count[col] > 0) { + col_avg[col] /= col_exec_count[col]; + } else { + col_avg[col] = 0.0; + } + } - } // loop over group variants + // Column standard deviaation... + for (size_t ik = 0; ik < kernels.size(); ++ik) { + KernelBase *kern = kernels[ik]; - } // loop over groups + int col = 0; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + const FOMGroup &group = fom_groups[ifg]; - } // loop over kernels - - for (size_t col = 0; col < ncols; ++col) { - if ( col_exec_count[col] > 0 ) { - col_stddev[col] /= col_exec_count[col]; - } else { - col_stddev[col] = 0.0; - } - } + for (size_t gv = 0; gv < group.variants.size(); ++gv) { + VariantID comp_vid = group.variants[gv]; - // - // Print column summaries. - // - file <wasVariantRun(comp_vid)) { + col_stddev[col] += (pct_diff[ik][col] - col_avg[col]) * + (pct_diff[ik][col] - col_avg[col]); + } - file < 0) { + col_stddev[col] /= col_exec_count[col]; + } else { + col_stddev[col] = 0.0; + } + } - } // note file will be closed when file stream goes out of scope -} + // + // Print column summaries. 
+ // + file << left << setw(kercol_width) << " "; + for (size_t iv = 0; iv < ncols; ++iv) { + file << sepchr << setw(fom_col_width) << left << " " << right << pass; + } + file << endl; + file << left << setw(kercol_width) << "Col Min"; + for (size_t col = 0; col < ncols; ++col) { + file << sepchr << left << setw(fom_col_width) << setprecision(prec) + << col_min[col] << pass; + } + file << endl; -void Executor::writeChecksumReport(const string& filename) -{ - ofstream file(filename.c_str(), ios::out | ios::trunc); - if ( !file ) { - cout << " ERROR: Can't open output file " << filename << endl; - } + file << left << setw(kercol_width) << "Col Max"; + for (size_t col = 0; col < ncols; ++col) { + file << sepchr << left << setw(fom_col_width) << setprecision(prec) + << col_max[col] << pass; + } + file << endl; - if ( file ) { + file << left << setw(kercol_width) << "Col Avg"; + for (size_t col = 0; col < ncols; ++col) { + file << sepchr << left << setw(fom_col_width) << setprecision(prec) + << col_avg[col] << pass; + } + file << endl; - // - // Set basic table formatting parameters. 
- // - const string equal_line("==================================================================================================="); - const string dash_line("----------------------------------------------------------------------------------------"); - const string dash_line_short("-------------------------------------------------------"); - string dot_line("........................................................"); + file << left << setw(kercol_width) << "Col Std Dev"; + for (size_t col = 0; col < ncols; ++col) { + file << sepchr << left << setw(fom_col_width) << setprecision(prec) + << col_stddev[col] << pass; + } + file << endl; - size_t prec = 20; - size_t checksum_width = prec + 8; + file.flush(); - size_t namecol_width = 0; - for (size_t ik = 0; ik < kernels.size(); ++ik) { - namecol_width = max(namecol_width, kernels[ik]->getName().size()); - } - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - namecol_width = max(namecol_width, - getVariantName(variant_ids[iv]).size()); + } // note file will be closed when file stream goes out of scope } - namecol_width++; - // - // Print title. - // - file << equal_line << endl; - file << "Checksum Report " << endl; - file << equal_line << endl; + void Executor::writeChecksumReport(const string &filename) { + ofstream file(filename.c_str(), ios::out | ios::trunc); + if (!file) { + cout << " ERROR: Can't open output file " << filename << endl; + } - // - // Print column title line. - // - file <getName() << endl; - file << dot_line << endl; - - Checksum_type cksum_ref = 0.0; - size_t ivck = 0; - bool found_ref = false; - while ( ivck < variant_ids.size() && !found_ref ) { - VariantID vid = variant_ids[ivck]; - if ( kern->wasVariantRun(vid) ) { - cksum_ref = kern->getChecksum(vid); - found_ref = true; - } - ++ivck; - } + // + // Set basic table formatting parameters. 
+ // + const string equal_line( + "==================================================================================================="); + const string dash_line( + "----------------------------------------------------------------------------------------"); + const string dash_line_short("-------------------------------------------------------"); + string dot_line("........................................................"); + + size_t prec = 20; + size_t checksum_width = prec + 8; + + size_t namecol_width = 0; + for (size_t ik = 0; ik < kernels.size(); ++ik) { + namecol_width = max(namecol_width, kernels[ik]->getName().size()); + } + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + namecol_width = max(namecol_width, + getVariantName(variant_ids[iv]).size()); + } + namecol_width++; - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - - if ( kern->wasVariantRun(vid) ) { - Checksum_type vcheck_sum = kern->getChecksum(vid); - Checksum_type diff = cksum_ref - kern->getChecksum(vid); - - file <getName() << endl; + file << dot_line << endl; + + Checksum_type cksum_ref = 0.0; + size_t ivck = 0; + bool found_ref = false; + while (ivck < variant_ids.size() && !found_ref) { + VariantID vid = variant_ids[ivck]; + if (kern->wasVariantRun(vid)) { + cksum_ref = kern->getChecksum(vid); + found_ref = true; + } + ++ivck; + } - } // note file will be closed when file stream goes out of scope -} + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + VariantID vid = variant_ids[iv]; + if (kern->wasVariantRun(vid)) { + Checksum_type vcheck_sum = kern->getChecksum(vid); + Checksum_type diff = cksum_ref - kern->getChecksum(vid); -string Executor::getReportTitle(CSVRepMode mode) -{ - string title; - switch ( mode ) { - case CSVRepMode::Timing : { - title = string("Mean Runtime Report (sec.) 
"); - break; - } - case CSVRepMode::Speedup : { - if ( haveReferenceVariant() ) { - title = string("Speedup Report (T_ref/T_var)") + - string(": ref var = ") + getVariantName(reference_vid) + - string(" "); - } - break; + file << left << setw(namecol_width) << getVariantName(vid) + << showpoint << setprecision(prec) + << left << setw(checksum_width) << vcheck_sum + << left << setw(checksum_width) << diff << endl; + } else { + file << left << setw(namecol_width) << getVariantName(vid) + << left << setw(checksum_width) << "Not Run" + << left << setw(checksum_width) << "Not Run" << endl; + } + + } + + file << endl; + file << dash_line_short << endl; + } + + file.flush(); + + } // note file will be closed when file stream goes out of scope } - default : { cout << "\n Unknown CSV report mode = " << mode << endl; } - }; - return title; -} - -long double Executor::getReportDataEntry(CSVRepMode mode, - KernelBase* kern, - VariantID vid) -{ - long double retval = 0.0; - switch ( mode ) { - case CSVRepMode::Timing : { - retval = kern->getTotTime(vid) / run_params.getNumPasses(); - break; + + + string Executor::getReportTitle(CSVRepMode mode) { + string title; + switch (mode) { + case CSVRepMode::Timing : { + title = string("Mean Runtime Report (sec.) 
"); + break; + } + case CSVRepMode::Speedup : { + if (haveReferenceVariant()) { + title = string("Speedup Report (T_ref/T_var)") + + string(": ref var = ") + getVariantName(reference_vid) + + string(" "); + } + break; + } + default : { + cout << "\n Unknown CSV report mode = " << mode << endl; + } + }; + return title; } - case CSVRepMode::Speedup : { - if ( haveReferenceVariant() ) { - if ( kern->hasVariantToRun(reference_vid) && - kern->hasVariantToRun(vid) ) { - retval = kern->getTotTime(reference_vid) / kern->getTotTime(vid); - } else { - retval = 0.0; - } + + long double Executor::getReportDataEntry(CSVRepMode mode, + KernelBase *kern, + VariantID vid) { + long double retval = 0.0; + switch (mode) { + case CSVRepMode::Timing : { + retval = kern->getTotTime(vid) / run_params.getNumPasses(); + break; + } + case CSVRepMode::Speedup : { + if (haveReferenceVariant()) { + if (kern->hasVariantToRun(reference_vid) && + kern->hasVariantToRun(vid)) { + retval = kern->getTotTime(reference_vid) / kern->getTotTime(vid); + } else { + retval = 0.0; + } #if 0 // RDH DEBUG (leave this here, it's useful for debugging!) 
- cout << "Kernel(iv): " << kern->getName() << "(" << vid << ")" << endl; - cout << "\tref_time, tot_time, retval = " - << kern->getTotTime(reference_vid) << " , " - << kern->getTotTime(vid) << " , " - << retval << endl; + cout << "Kernel(iv): " << kern->getName() << "(" << vid << ")" << endl; + cout << "\tref_time, tot_time, retval = " + << kern->getTotTime(reference_vid) << " , " + << kern->getTotTime(vid) << " , " + << retval << endl; #endif - } - break; + } + break; + } + default : { + cout << "\n Unknown CSV report mode = " << mode << endl; + } + }; + return retval; } - default : { cout << "\n Unknown CSV report mode = " << mode << endl; } - }; - return retval; -} - -void Executor::getFOMGroups(vector& fom_groups) -{ - fom_groups.clear(); - - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - string vname = getVariantName(vid); - - if ( vname.find("Base") != string::npos ) { - - FOMGroup group; - group.base = vid; - - string::size_type pos = vname.find("_"); - string pm(vname.substr(pos+1, string::npos)); - - for (size_t ivs = iv+1; ivs < variant_ids.size(); ++ivs) { - VariantID vids = variant_ids[ivs]; - if ( getVariantName(vids).find(pm) != string::npos ) { - group.variants.push_back(vids); - } - } - if ( !group.variants.empty() ) { - fom_groups.push_back( group ); - } + void Executor::getFOMGroups(vector &fom_groups) { + fom_groups.clear(); + + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + VariantID vid = variant_ids[iv]; + string vname = getVariantName(vid); - } // if variant name contains 'Base' + if (vname.find("Base") != string::npos) { - } // iterate over variant ids to run + FOMGroup group; + group.base = vid; + + string::size_type pos = vname.find("_"); + string pm(vname.substr(pos + 1, string::npos)); + + for (size_t ivs = iv + 1; ivs < variant_ids.size(); ++ivs) { + VariantID vids = variant_ids[ivs]; + if (getVariantName(vids).find(pm) != string::npos) { + group.variants.push_back(vids); + } + } + + 
if (!group.variants.empty()) { + fom_groups.push_back(group); + } + + } // if variant name contains 'Base' + + } // iterate over variant ids to run #if 0 // RDH DEBUG (leave this here, it's useful for debugging!) - cout << "\nFOMGroups..." << endl; - for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { - const FOMGroup& group = fom_groups[ifg]; - cout << "\tBase : " << getVariantName(group.base) << endl; - for (size_t iv = 0; iv < group.variants.size(); ++iv) { - cout << "\t\t " << getVariantName(group.variants[iv]) << endl; - } - } + cout << "\nFOMGroups..." << endl; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + const FOMGroup& group = fom_groups[ifg]; + cout << "\tBase : " << getVariantName(group.base) << endl; + for (size_t iv = 0; iv < group.variants.size(); ++iv) { + cout << "\t\t " << getVariantName(group.variants[iv]) << endl; + } + } #endif -} + } // TODO: AJP and DZP talk these functions through; // is the arrow operator here acting as a pointer object to registerGroup, etc.? 
-void free_register_group(Executor* exec, std::string groupName){ - exec->registerGroup(groupName); -} -void free_register_kernel(Executor* exec, std::string groupName, KernelBase* kernel) { - exec->registerKernel(groupName, kernel); -} + void free_register_group(Executor *exec, std::string groupName) { + exec->registerGroup(groupName); + } + + void free_register_kernel(Executor *exec, std::string groupName, KernelBase *kernel) { + exec->registerKernel(groupName, kernel); + } + + const RunParams& getRunParams(Executor* exec){ + return exec->getRunParams(); + + } } // closing brace for rajaperf namespace diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 2ab4ef055..c147f3d9c 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -66,6 +66,12 @@ class Executor std::vector lookUpKernelByName(std::string kernelOrGroupName); + const RunParams& getRunParams(); + + + + + private: Executor() = delete; @@ -134,6 +140,7 @@ class Executor void free_register_group(Executor*, std::string); void free_register_kernel(Executor*, std::string, KernelBase*); +const RunParams& getRunParams(Executor* exec); } // closing brace for rajaperf namespace diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 8cd5e9b30..3df20ff0e 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -64,21 +64,11 @@ Index_type KernelBase::getRunSize() const //FIXME Index_type KernelBase::getRunReps() const { - // DEBUGGING - std::cout << "AMOS" << std::endl; - - // std::cout << "Get Run Name " << run_params << std::endl; - //std::cout << "Get Variant Name " << run_params.getVariantName << std::endl; - // std::cout << "Get Full Kernel Name " << run_params.getFullKernelName << std::endl; - // std::cout << "Check Run Reps " << run_params.getCheckRunReps << std::endl; - //std::cout << "Get Run Reps " << run_params.getRunReps << std::endl; - //std::cout << "Get Run Rep Factor " << run_params.getRepFactor << std::endl; - // std::cout << "Get Run 
Size " << run_params.getRunSize -> run_params.getRunSize << std::endl; - // std::cout << "Get Run Size Factor " << run_params.getSizeFactor << std::endl; if (run_params.getInputState() == RunParams::CheckRun) { return static_cast(run_params.getCheckRunReps()); } else { + return static_cast(default_reps*run_params.getRepFactor()); } } diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 29ec1f45e..7bdddeb35 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -89,7 +89,8 @@ namespace rajaperf { void make_perfsuite_executor(rajaperf::Executor *exec, int argc, char *argv[]) { - RunParams run_params(argc, argv); + //RunParams run_params(argc, argv); + const RunParams& run_params = getRunParams(exec); free_register_group(exec, std::string("Basic")); free_register_group(exec, std::string("Lcals")); free_register_group(exec, std::string("Polybench")); diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index d950974f3..6261db80e 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -22,10 +22,11 @@ #include namespace rajaperf { - +class RunParams; class Executor; // forward declaration class KernelBase; +const RunParams& getRunParams(Executor*); void free_register_group(Executor*, std::string); // forward declaration void free_register_kernel(Executor*, std::string, KernelBase*); // forward declaration void make_perfsuite_executor(Executor* exec, int argc, char* argv[]); diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 89c3c0965..08e4d4388 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -42,6 +42,8 @@ RunParams::RunParams(int argc, char** argv) outfile_prefix("RAJAPerf") { parseCommandLineOptions(argc, argv); + auto foo =0; + } diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 7613ed649..4afbaedd5 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -42,6 +42,8 @@ ADD::ADD(const RunParams& params) 
setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined(Kokkos_Lambda); } ADD::~ADD() From 1514d535727c815a0f3250672d356fa5c88b9c5f Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 2 Jun 2021 16:56:30 -0600 Subject: [PATCH 081/124] Kokkos translations of stream kernels --- src/common/RAJAPerfSuite.cpp | 8 +-- src/stream-kokkos/COPY-Kokkos.cpp | 53 +++++++++++++++++-- src/stream-kokkos/DOT-Kokkos.cpp | 81 ++++++++++++++++++++++++++++-- src/stream-kokkos/MUL-Kokkos.cpp | 70 ++++++++++++++++++++++++-- src/stream-kokkos/TRIAD-Kokkos.cpp | 60 ++++++++++++++++++++-- src/stream/ADD.cpp | 2 +- src/stream/COPY.cpp | 2 + src/stream/COPY.hpp | 2 + src/stream/DOT.cpp | 2 + src/stream/DOT.hpp | 4 ++ src/stream/MUL.cpp | 2 + src/stream/MUL.hpp | 1 + src/stream/TRIAD.cpp | 3 ++ src/stream/TRIAD.hpp | 2 + 14 files changed, 272 insertions(+), 20 deletions(-) diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 7bdddeb35..e53a7b6b5 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -141,10 +141,10 @@ namespace rajaperf { **/ // Stream free_register_kernel(exec, "Stream", new stream::ADD(run_params)); - //free_register_kernel(exec, "Stream", new stream::COPY(run_params)); - //free_register_kernel(exec, "Stream", new stream::DOT(run_params)); - //free_register_kernel(exec, "Stream", new stream::MUL(run_params)); - //free_register_kernel(exec, "Stream", new stream::TRIAD(run_params)); + free_register_kernel(exec, "Stream", new stream::COPY(run_params)); + free_register_kernel(exec, "Stream", new stream::DOT(run_params)); + free_register_kernel(exec, "Stream", new stream::MUL(run_params)); + free_register_kernel(exec, "Stream", new stream::TRIAD(run_params)); /** // Apps free_register_kernel(exec, "Apps", new apps::COUPLE(run_params)); diff --git a/src/stream-kokkos/COPY-Kokkos.cpp b/src/stream-kokkos/COPY-Kokkos.cpp index 68bc51e4d..01b1f19b5 100644 --- 
a/src/stream-kokkos/COPY-Kokkos.cpp +++ b/src/stream-kokkos/COPY-Kokkos.cpp @@ -17,19 +17,34 @@ namespace rajaperf namespace stream { - +/* void COPY::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getRunSize(); +*/ + + void COPY::runKokkosVariant(VariantID vid) + + { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + COPY_DATA_SETUP; + + auto a_view = getViewFromPointer(a, iend); + auto c_view = getViewFromPointer(c, iend); + auto copy_lam = [=](Index_type i) { COPY_BODY; }; +#if defined(RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -47,7 +62,6 @@ void COPY::runSeqVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); @@ -62,7 +76,7 @@ void COPY::runSeqVariant(VariantID vid) break; } - +/* case RAJA_Seq : { startTimer(); @@ -76,7 +90,32 @@ void COPY::runSeqVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ + + */ + + case Kokkos_Lambda : { + + + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("COPY_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin,iend), + KOKKOS_LAMBDA(Index_type i) { + // COPY BODY DEFINITION IN HEADER: + // c[i] = a[i] ; + c_view[i] = a_view[i]; + }); + + } + Kokkos::fence(); + stopTimer(); + + break; + } + + default : { std::cout << "\n COPY : Unknown variant id = " << vid << std::endl; @@ -84,6 +123,12 @@ void COPY::runSeqVariant(VariantID vid) } + +#endif //RUN_KOKKOS + + moveDataToHostFromKokkosView(a, a_view, iend); + moveDataToHostFromKokkosView(c, c_view, iend); + } } // end namespace stream diff --git a/src/stream-kokkos/DOT-Kokkos.cpp b/src/stream-kokkos/DOT-Kokkos.cpp index 8f6158a87..06a2040b8 100644 --- a/src/stream-kokkos/DOT-Kokkos.cpp +++ b/src/stream-kokkos/DOT-Kokkos.cpp @@ -17,15 +17,38 @@ namespace rajaperf namespace stream { - +/* void DOT::runSeqVariant(VariantID 
vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getRunSize(); +*/ + + +void DOT::runKokkosVariant(VariantID vid) { + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + DOT_DATA_SETUP; + // Instantiation of pointer - wrapped views: + auto a_view = getViewFromPointer(a, iend); + auto b_view = getViewFromPointer(b, iend); + // + // From basic-kokkos - REDUCE3 + // Instantiation of a view from a pointer to a vector + // auto vec_view = getViewFromPointer(vec, iend); + + + + // Pre-processor directive +#if defined(RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -47,7 +70,7 @@ void DOT::runSeqVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) +// #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { auto dot_base_lam = [=](Index_type i) -> Real_type { @@ -70,7 +93,7 @@ void DOT::runSeqVariant(VariantID vid) break; } - +/* case RAJA_Seq : { startTimer(); @@ -90,7 +113,47 @@ void DOT::runSeqVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ + */ + + case Kokkos_Lambda : { + + // open Kokkosfence + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + // Declare and initialize dot + // dot will contain the reduction value, + // i.e., the dot product + // + // Reductions combine contributions from + // loop iterations + Real_type dot = m_dot_init; + + parallel_reduce("DOT-Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(const int64_t i, Real_type& dot_res){ + + // DOT BODY definition from header: + // dot += a[i] * b[i] ; + //dot_res += a_view[i]*b_view[i]; + /////////////////////////////// + //Int_type vec_i = vec_view[i]; + dot_res += a_view[i]*b_view[i]; + //dot_res = vec_i; + }, dot); + m_dot += static_cast(dot); + } + + Kokkos::fence(); + stopTimer(); + + break; + } + + + +// #endif // RUN_RAJA_SEQ default : { std::cout << "\n DOT : Unknown variant id = " << vid << 
std::endl; @@ -98,6 +161,16 @@ void DOT::runSeqVariant(VariantID vid) } + +#endif // RUN_KOKKOS + + std::cout << " FIX ME STREAM DOT -- GET DATA FROM VIEWS " << std::endl; + //moveDataToHostFromKokkosView(a, a_view, iend); + //moveDataToHostFromKokkosView(b, b_view, iend); + + // From REDUCE3-INT + // moveDataToHostFromKokkosView(vec, vec_view, iend); + } } // end namespace stream diff --git a/src/stream-kokkos/MUL-Kokkos.cpp b/src/stream-kokkos/MUL-Kokkos.cpp index 7b36935e1..24b74b4ea 100644 --- a/src/stream-kokkos/MUL-Kokkos.cpp +++ b/src/stream-kokkos/MUL-Kokkos.cpp @@ -17,19 +17,45 @@ namespace rajaperf namespace stream { - +/* void MUL::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getRunSize(); +*/ + + void MUL::runKokkosVariant(VariantID vid) + { + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + MUL_DATA_SETUP; + /* from MUL.hpp + #define MUL_DATA_SETUP \ + Real_ptr b = m_b; \ + Real_ptr c = m_c; \ + Real_type alpha = m_alpha + +*/ + auto b_view = getViewFromPointer(b, iend); + auto c_view = getViewFromPointer(c, iend); + + // Is this needed here? 
+ // The declaration and initialization is from stream/MUL.hpp + //Real_type alpha = m_alpha; + + auto mul_lam = [=](Index_type i) { MUL_BODY; }; + +#if defined(RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -47,7 +73,7 @@ void MUL::runSeqVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) +// #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); @@ -63,6 +89,7 @@ void MUL::runSeqVariant(VariantID vid) break; } +/* case RAJA_Seq : { startTimer(); @@ -76,7 +103,36 @@ void MUL::runSeqVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ + */ + +//#endif // RUN_RAJA_SEQ + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep =0; irep < run_reps; ++irep) { + + Kokkos::parallel_for("MUL_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + // MUL BODY DEFINITION: + // b[i] = alpha * c[i] ; + b_view[i] = alpha * c_view[i]; + }); + + } + Kokkos::fence(); + stopTimer(); + + break; + + } + + + //} + default : { std::cout << "\n MUL : Unknown variant id = " << vid << std::endl; @@ -84,6 +140,14 @@ void MUL::runSeqVariant(VariantID vid) } +#endif // RUN_KOKKOS + + // move data to host from view + + moveDataToHostFromKokkosView(b, b_view, iend); + moveDataToHostFromKokkosView(c, c_view, iend); + + } } // end namespace stream diff --git a/src/stream-kokkos/TRIAD-Kokkos.cpp b/src/stream-kokkos/TRIAD-Kokkos.cpp index 06885d50c..ffac1e2c5 100644 --- a/src/stream-kokkos/TRIAD-Kokkos.cpp +++ b/src/stream-kokkos/TRIAD-Kokkos.cpp @@ -17,19 +17,41 @@ namespace rajaperf namespace stream { - +/* void TRIAD::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getRunSize(); +*/ + +void TRIAD::runKokkosVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + TRIAD_DATA_SETUP; +/* + #define TRIAD_DATA_SETUP \ + Real_ptr a = m_a; \ + 
Real_ptr b = m_b; \ + Real_ptr c = m_c; \ + Real_type alpha = m_alpha; +*/ + + auto a_view = getViewFromPointer(a, iend); + auto b_view = getViewFromPointer(b, iend); + auto c_view = getViewFromPointer(c, iend); + auto triad_lam = [=](Index_type i) { TRIAD_BODY; }; +#if defined (RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -47,7 +69,7 @@ void TRIAD::runSeqVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) +// #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); @@ -62,7 +84,7 @@ void TRIAD::runSeqVariant(VariantID vid) break; } - +/* case RAJA_Seq : { startTimer(); @@ -76,7 +98,31 @@ void TRIAD::runSeqVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ + */ + + case Kokkos_Lambda : { + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep =0; irep < run_reps; ++irep) { + + Kokkos::parallel_for("TRIAD_Kokkos, Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + // TRIAD_BODY definition in TRIAD.hpp + // a[i] = b[i] + alpha * c[i] ; + a_view[i] = b_view[i] + alpha * c_view[i]; + }); + } + + Kokkos::fence(); + stopTimer(); + + break; + + } + +//#endif // RUN_RAJA_SEQ default : { std::cout << "\n TRIAD : Unknown variant id = " << vid << std::endl; @@ -84,6 +130,12 @@ void TRIAD::runSeqVariant(VariantID vid) } +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(a, a_view, iend); + moveDataToHostFromKokkosView(b, b_view, iend); + moveDataToHostFromKokkosView(c, c_view, iend); + } } // end namespace stream diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 4afbaedd5..6ea725851 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -43,7 +43,7 @@ ADD::ADD(const RunParams& params) setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); - setVariantDefined(Kokkos_Lambda); + setVariantDefined(Kokkos_Lambda); } ADD::~ADD() diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index db865c688..7e5598476 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -42,6 +42,8 @@ 
COPY::COPY(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } COPY::~COPY() diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index c0e2c455a..0d6122cee 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -46,6 +46,8 @@ class COPY : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); + void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 036127eb9..bf4a8f928 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -40,6 +40,8 @@ DOT::DOT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + // Kokkos info + setVariantDefined(Kokkos_Lambda); } DOT::~DOT() diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index da20ab082..8d698e727 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -46,11 +46,15 @@ class DOT : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + + // Kokkos additions + private: Real_ptr m_a; diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index c07359629..22a3341ae 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -42,6 +42,8 @@ MUL::MUL(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined(Kokkos_Lambda); } MUL::~MUL() diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index 25943fcbe..eeab89f66 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -47,6 +47,7 @@ class MUL : public KernelBase void updateChecksum(VariantID vid); 
void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index 949b4f355..58c5996b3 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -42,6 +42,9 @@ TRIAD::TRIAD(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined(Kokkos_Lambda); + } TRIAD::~TRIAD() diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index 23ffd168a..33deb44cf 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -48,6 +48,8 @@ class TRIAD : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + + void runKokkosVariant(VariantID vid); void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); From 2ce3e28ebf58a11443f58735566fcc0781f0c397 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 2 Jun 2021 16:58:07 -0600 Subject: [PATCH 082/124] EDUCE3_INT-Kokkos.cpp: formatting for clarity --- src/basic-kokkos/REDUCE3_INT-Kokkos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp index ef15833ee..a37f36036 100644 --- a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp +++ b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp @@ -27,7 +27,7 @@ void REDUCE3_INT::runKokkosVariant(VariantID vid) REDUCE3_INT_DATA_SETUP; - //Declare KokkosView that will wrap the pointer + //Declare KokkosView that will wrap the pointer to a vector auto vec_view = getViewFromPointer(vec, iend); From ba8678eab830e90a23695ae75ac6f02bec641b6d Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 14 Jun 2021 11:21:26 -0600 Subject: [PATCH 083/124] watchr_KokkosConfig.json: config for new watchr --- scripts/config/watchr_KokkosConfig.json | 122 ++++++++++++++++++++++++ 1 file changed, 
122 insertions(+) create mode 100755 scripts/config/watchr_KokkosConfig.json diff --git a/scripts/config/watchr_KokkosConfig.json b/scripts/config/watchr_KokkosConfig.json new file mode 100755 index 000000000..5a6fbdfd4 --- /dev/null +++ b/scripts/config/watchr_KokkosConfig.json @@ -0,0 +1,122 @@ +{ + "plots" : { + "files" : { + "fileName": "RAJAPerfSuite_*", + "type" : "xml", + "ignoreOldFiles" : true, + "recurseDirectories" : false + }, + "categories": [ + "Kokkos_Lambda_CUDA", + "Kokkos_Lambda_Seq", + "Base_CUDA", + "Base_Seq", + "Lambda_Seq", + "RAJA_CUDA", + "RAJA_Seq" + ], + "plot" : [ + { + "autoname" : { + "useProperty" : "y/path" + }, + "category" : "Kokkos_Lambda_CUDA", + "template" : "kokkos_template", + "dataLines" : [ + { + "name" : "Data Line", + "x" : { + "getPath": "*", + "getElement" : "performance-report", + "getKey" : "date", + "unit" : "timestamp" + }, + "y" : { + "getElement" : "performance-report|timing", + "getPath": "*/kokkos_perf_suite/*", + "getPathAttribute": "name", + "getKey" : "Kokkos_Lambda_CUDA", + "unit" : "seconds", + "strategy" : { + "getFirstMatchOnly" : "false", + "recurseChildGraphs" : "true" + } + }, + "color" : "202,77,77" + } + ] + }, { + "inherit" : "kokkos_template", + "category" : "Kokkos_Lambda_Seq", + "dataLines" : [ + { + "y" : { + "getKey" : "Kokkos_Lambda_Seq" + } + } + ] + }, { + "inherit" : "kokkos_template", + "category" : "Base_CUDA", + "dataLines" : [ + { + "y" : { + "getKey" : "Base_CUDA" + } + } + ] + }, { + "inherit" : "kokkos_template", + "category" : "Base_Seq", + "dataLines" : [ + { + "y" : { + "getKey" : "Base_Seq" + } + } + ] + }, { + "inherit" : "kokkos_template", + "category" : "Lambda_Seq", + "dataLines" : [ + { + "y" : { + "getKey" : "Lambda_Seq" + } + } + ] + }, { + "inherit" : "kokkos_template", + "category" : "RAJA_CUDA", + "dataLines" : [ + { + "y" : { + "getKey" : "RAJA_CUDA" + } + } + ] + }, { + "inherit" : "kokkos_template", + "category" : "RAJA_Seq", + "dataLines" : [ + { + "y" : { + "getKey" 
: "RAJA_Seq" + } + } + ] + } + ] + }, + "graphDisplay": { + "dbLocation" : "root", + "page" : 1, + "displayCategory" : "Kokkos_Lambda_CUDA", + "displayRange" : 30, + "graphWidth" : 450, + "graphHeight" : 450, + "graphsPerRow" : 3, + "graphsPerPage" : 15, + "displayedDecimalPlaces" : 3 + } +} \ No newline at end of file From 80dea71cae94167cbb8f8e0e779fcbb1bc412f3d Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 14 Jun 2021 14:06:28 -0600 Subject: [PATCH 084/124] watchr_KokkosConfig.json: recurse dir set to true --- scripts/config/watchr_KokkosConfig.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/config/watchr_KokkosConfig.json b/scripts/config/watchr_KokkosConfig.json index 5a6fbdfd4..c34a3e262 100755 --- a/scripts/config/watchr_KokkosConfig.json +++ b/scripts/config/watchr_KokkosConfig.json @@ -4,7 +4,7 @@ "fileName": "RAJAPerfSuite_*", "type" : "xml", "ignoreOldFiles" : true, - "recurseDirectories" : false + "recurseDirectories" : true }, "categories": [ "Kokkos_Lambda_CUDA", @@ -119,4 +119,4 @@ "graphsPerPage" : 15, "displayedDecimalPlaces" : 3 } -} \ No newline at end of file +} From 409355726cfb52364def8a3376dd0244ab5c1ea6 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 28 Jun 2021 12:39:56 -0600 Subject: [PATCH 085/124] basic/IF_QUAD.cpp: fix typos and formatting --- src/basic/IF_QUAD.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 336ea9e9a..5282f0053 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -28,8 +28,6 @@ IF_QUAD::IF_QUAD(const RunParams& params) setVariantDefined( Kokkos_Lambda ); - - setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); From 12af5ca0295973c279c61b1f4d133ba7e93dd296 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 28 Jun 2021 13:01:33 -0600 Subject: [PATCH 086/124] RPS infrastructure changes: --- src/CMakeLists.txt | 84 ++++++++++++++++++++-------------- 
src/RAJAPerfSuiteDriver.cpp | 12 +++++ src/common/Executor.cpp | 21 +-------- src/common/QuickKernelBase.hpp | 2 +- src/common/RAJAPerfSuite.cpp | 79 ++++++++++++++++---------------- src/common/RAJAPerfSuite.hpp | 24 +++++----- 6 files changed, 114 insertions(+), 108 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a5aebda4e..34f7b8be5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,7 +13,8 @@ add_subdirectory(common) add_subdirectory(basic) add_subdirectory(basic-kokkos) #add_subdirectory(kokkos-mechanics) -#add_subdirectory(lcals) +add_subdirectory(lcals) +add_subdirectory(lcals-kokkos) #add_subdirectory(polybench) add_subdirectory(stream) add_subdirectory(stream-kokkos) @@ -25,7 +26,8 @@ set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS basic basic-kokkos #kokkos-mechanics - #lcals + lcals + lcals-kokkos #polybench stream stream-kokkos @@ -102,39 +104,51 @@ blt_add_executable( basic-kokkos/NESTED_INIT-Kokkos.cpp basic-kokkos/REDUCE3_INT-Kokkos.cpp basic-kokkos/TRAP_INT-Kokkos.cpp - #lcals/DIFF_PREDICT.cpp - #lcals/DIFF_PREDICT-Seq.cpp - #lcals/DIFF_PREDICT-OMPTarget.cpp - #lcals/EOS.cpp - #lcals/EOS-Seq.cpp - #lcals/EOS-OMPTarget.cpp - #lcals/FIRST_DIFF.cpp - #lcals/FIRST_DIFF-Seq.cpp - #lcals/FIRST_DIFF-OMPTarget.cpp - #lcals/FIRST_MIN.cpp - #lcals/FIRST_MIN-Seq.cpp - #lcals/FIRST_MIN-OMPTarget.cpp - #lcals/FIRST_SUM.cpp - #lcals/FIRST_SUM-Seq.cpp - #lcals/FIRST_SUM-OMPTarget.cpp - #lcals/GEN_LIN_RECUR.cpp - #lcals/GEN_LIN_RECUR-Seq.cpp - #lcals/GEN_LIN_RECUR-OMPTarget.cpp - #lcals/HYDRO_1D.cpp - #lcals/HYDRO_1D-Seq.cpp - #lcals/HYDRO_1D-OMPTarget.cpp - #lcals/HYDRO_2D.cpp - #lcals/HYDRO_2D-Seq.cpp - #lcals/HYDRO_2D-OMPTarget.cpp - #lcals/INT_PREDICT.cpp - #lcals/INT_PREDICT-Seq.cpp - #lcals/INT_PREDICT-OMPTarget.cpp - #lcals/PLANCKIAN.cpp - #lcals/PLANCKIAN-Seq.cpp - #lcals/PLANCKIAN-OMPTarget.cpp - #lcals/TRIDIAG_ELIM.cpp - #lcals/TRIDIAG_ELIM-Seq.cpp - #lcals/TRIDIAG_ELIM-OMPTarget.cpp + lcals/DIFF_PREDICT.cpp + 
lcals/DIFF_PREDICT-Seq.cpp + lcals/DIFF_PREDICT-OMPTarget.cpp + lcals/EOS.cpp + lcals/EOS-Seq.cpp + lcals/EOS-OMPTarget.cpp + lcals/FIRST_DIFF.cpp + lcals/FIRST_DIFF-Seq.cpp + lcals/FIRST_DIFF-OMPTarget.cpp + lcals/FIRST_MIN.cpp + lcals/FIRST_MIN-Seq.cpp + lcals/FIRST_MIN-OMPTarget.cpp + lcals/FIRST_SUM.cpp + lcals/FIRST_SUM-Seq.cpp + lcals/FIRST_SUM-OMPTarget.cpp + lcals/GEN_LIN_RECUR.cpp + lcals/GEN_LIN_RECUR-Seq.cpp + lcals/GEN_LIN_RECUR-OMPTarget.cpp + lcals/HYDRO_1D.cpp + lcals/HYDRO_1D-Seq.cpp + lcals/HYDRO_1D-OMPTarget.cpp + lcals/HYDRO_2D.cpp + lcals/HYDRO_2D-Seq.cpp + lcals/HYDRO_2D-OMPTarget.cpp + lcals/INT_PREDICT.cpp + lcals/INT_PREDICT-Seq.cpp + lcals/INT_PREDICT-OMPTarget.cpp + lcals/PLANCKIAN.cpp + lcals/PLANCKIAN-Seq.cpp + lcals/PLANCKIAN-OMPTarget.cpp + lcals/TRIDIAG_ELIM.cpp + lcals/TRIDIAG_ELIM-Seq.cpp + lcals/TRIDIAG_ELIM-OMPTarget.cpp + #Kokkos Bloc + lcals-kokkos/DIFF_PREDICT-Kokkos.cpp + lcals-kokkos/EOS-Kokkos.cpp + lcals-kokkos/FIRST_DIFF-Kokkos.cpp + lcals-kokkos/FIRST_MIN-Kokkos.cpp + lcals-kokkos/FIRST_SUM-Kokkos.cpp + lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp + lcals-kokkos/HYDRO_1D-Kokkos.cpp + lcals-kokkos/HYDRO_2D-Kokkos.cpp + lcals-kokkos/INT_PREDICT-Kokkos.cpp + lcals-kokkos/PLANCKIAN-Kokkos.cpp + lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp #polybench/POLYBENCH_2MM.cpp #polybench/POLYBENCH_2MM-Seq.cpp #polybench/POLYBENCH_2MM-OMPTarget.cpp diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index 42fe557d2..512698463 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -15,6 +15,11 @@ int main( int argc, char** argv ) { // STEP 1: Create suite executor object //rajaperf::Executor executor(argc, argv); + +#if defined(RUN_KOKKOS) + Kokkos::initialize(argc, argv); +#endif // RUN_KOKKOS + rajaperf::Executor executor(argc, argv); rajaperf::make_perfsuite_executor(&executor, argc, argv); //executor.registerKernel @@ -48,6 +53,13 @@ int main( int argc, char** argv ) // STEP 5: Generate suite execution 
reports executor.outputRunData(); + // Pre-processor directives + +#if defined(RUN_KOKKOS) + Kokkos::finalize(); // TODO DZP: should this be here? Good question. AJP +#endif + + std::cout << "\n\nDONE!!!...." << std::endl; return 0; diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 558f05512..19c7c6005 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -58,10 +58,6 @@ namespace rajaperf { delete kernels[ik]; } - // Pre-processor directives -#if defined(RUN_KOKKOS) - Kokkos::finalize(); // TODO DZP: should this be here? Good question. AJP -#endif } // New functions for Kokkos to register new group and kernel IDs @@ -557,22 +553,7 @@ namespace rajaperf { } } // kernel and variant input both look good -#if defined(RUN_KOKKOS) - Kokkos::initialize(); - /** - * DZP: This is a terrible hack to just get the push/pop region - * callbacks without the begin_parallel_x/end_parallel_x ones, - * so we don't overfence and perturb performance - */ - auto events = Kokkos::Tools::Experimental::get_callbacks(); - auto push = events.push_region; - auto pop = events.pop_region; - auto metadata = events.declare_metadata; - Kokkos::Tools::Experimental::pause_tools(); - Kokkos::Tools::Experimental::set_push_region_callback(push); - Kokkos::Tools::Experimental::set_pop_region_callback(pop); - Kokkos::Tools::Experimental::set_declare_metadata_callback(metadata); -#endif + } // if kernel input looks good } diff --git a/src/common/QuickKernelBase.hpp b/src/common/QuickKernelBase.hpp index dc0cc7cef..a9cea9dac 100644 --- a/src/common/QuickKernelBase.hpp +++ b/src/common/QuickKernelBase.hpp @@ -64,7 +64,7 @@ namespace rajaperf { void runOpenMPVariant(VariantID vid) override { auto size = getRunSize(); for(int x =0; x< getRunReps(); ++x){ - m_execute(x, size) + m_execute(x, size); } } #endif diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index e53a7b6b5..7046c6b47 100644 --- a/src/common/RAJAPerfSuite.cpp +++ 
b/src/common/RAJAPerfSuite.cpp @@ -110,7 +110,6 @@ namespace rajaperf { free_register_kernel(exec, "Basic", new basic::NESTED_INIT(run_params)); free_register_kernel(exec, "Basic", new basic::REDUCE3_INT(run_params)); free_register_kernel(exec, "Basic", new basic::TRAP_INT(run_params)); - /** // Lcals free_register_kernel(exec, "Lcals", new lcals::DIFF_PREDICT(run_params)); free_register_kernel(exec, "Lcals", new lcals::EOS(run_params)); @@ -124,6 +123,7 @@ namespace rajaperf { free_register_kernel(exec, "Lcals", new lcals::PLANCKIAN(run_params)); free_register_kernel(exec, "Lcals", new lcals::TRIDIAG_ELIM(run_params)); + /** // Polybench free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_2MM(run_params)); free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_3MM(run_params)); @@ -221,17 +221,17 @@ namespace rajaperf { // // Lcals kernels... //// -// std::string("Lcals_DIFF_PREDICT"), -// std::string("Lcals_EOS"), -// std::string("Lcals_FIRST_DIFF"), -// std::string("Lcals_FIRST_MIN"), -// std::string("Lcals_FIRST_SUM"), -// std::string("Lcals_GEN_LIN_RECUR"), -// std::string("Lcals_HYDRO_1D"), -// std::string("Lcals_HYDRO_2D"), -// std::string("Lcals_INT_PREDICT"), -// std::string("Lcals_PLANCKIAN"), -// std::string("Lcals_TRIDIAG_ELIM"), + std::string("Lcals_DIFF_PREDICT"), + std::string("Lcals_EOS"), + std::string("Lcals_FIRST_DIFF"), + std::string("Lcals_FIRST_MIN"), + std::string("Lcals_FIRST_SUM"), + std::string("Lcals_GEN_LIN_RECUR"), + std::string("Lcals_HYDRO_1D"), + std::string("Lcals_HYDRO_2D"), + std::string("Lcals_INT_PREDICT"), + std::string("Lcals_PLANCKIAN"), + std::string("Lcals_TRIDIAG_ELIM"), // //// //// Polybench kernels... 
@@ -423,7 +423,7 @@ namespace rajaperf { vid == Lambda_CUDA || vid == RAJA_CUDA || vid == RAJA_WORKGROUP_CUDA ) { - ret_val = true; + ret_val = true; } #endif @@ -495,10 +495,9 @@ namespace rajaperf { kernel = new basic::TRAP_INT(run_params); break; } -/** DZP: big comment block for unimplemented // // Lcals kernels... -// + case Lcals_DIFF_PREDICT : { kernel = new lcals::DIFF_PREDICT(run_params); break; @@ -511,6 +510,7 @@ namespace rajaperf { kernel = new lcals::FIRST_DIFF(run_params); break; } + case Lcals_FIRST_MIN : { kernel = new lcals::FIRST_MIN(run_params); break; @@ -544,7 +544,32 @@ namespace rajaperf { break; } + +// Stream kernels... // + case Stream_ADD : { + kernel = new stream::ADD(run_params); + break; + } + case Stream_COPY : { + kernel = new stream::COPY(run_params); + break; + } + case Stream_DOT : { + kernel = new stream::DOT(run_params); + break; + } + case Stream_MUL : { + kernel = new stream::MUL(run_params); + break; + } + case Stream_TRIAD : { + kernel = new stream::TRIAD(run_params); + break; + } +// +// +/** DZP: big comment block for unimplemented // Polybench kernels... // case Polybench_2MM : { @@ -600,30 +625,6 @@ namespace rajaperf { break; } -// -// Stream kernels... -// - case Stream_ADD : { - kernel = new stream::ADD(run_params); - break; - } - case Stream_COPY : { - kernel = new stream::COPY(run_params); - break; - } - case Stream_DOT : { - kernel = new stream::DOT(run_params); - break; - } - case Stream_MUL : { - kernel = new stream::MUL(run_params); - break; - } - case Stream_TRIAD : { - kernel = new stream::TRIAD(run_params); - break; - } -// // Apps kernels... 
// case Apps_COUPLE : { diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 6261db80e..9c3e45aba 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -39,8 +39,6 @@ void make_perfsuite_executor(Executor* exec, int argc, char* argv[]); // 2) Use default execution space // // -// -// // NEW FUNCTION WILL: // 1) Take in a raw pointer (e.g., float*, int*, etc.) // 2) From this pointer, return a Kokkos::View @@ -334,17 +332,17 @@ enum KernelID { // // Lcals kernels... // -// Lcals_DIFF_PREDICT, -// Lcals_EOS, -// Lcals_FIRST_DIFF, -// Lcals_FIRST_MIN, -// Lcals_FIRST_SUM, -// Lcals_GEN_LIN_RECUR, -// Lcals_HYDRO_1D, -// Lcals_HYDRO_2D, -// Lcals_INT_PREDICT, -// Lcals_PLANCKIAN, -// Lcals_TRIDIAG_ELIM, + Lcals_DIFF_PREDICT, + Lcals_EOS, + Lcals_FIRST_DIFF, + Lcals_FIRST_MIN, + Lcals_FIRST_SUM, + Lcals_GEN_LIN_RECUR, + Lcals_HYDRO_1D, + Lcals_HYDRO_2D, + Lcals_INT_PREDICT, + Lcals_PLANCKIAN, + Lcals_TRIDIAG_ELIM, // // Polybench kernels... 
From b9a1381fedee29dca4d0fa107b61212208980c47 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 28 Jun 2021 13:03:10 -0600 Subject: [PATCH 087/124] lcals: Making kernel group Kokkos runable --- src/lcals/DIFF_PREDICT.cpp | 3 +++ src/lcals/DIFF_PREDICT.hpp | 2 +- src/lcals/EOS.cpp | 2 ++ src/lcals/EOS.hpp | 1 + src/lcals/FIRST_DIFF.cpp | 2 ++ src/lcals/FIRST_DIFF.hpp | 2 ++ src/lcals/FIRST_MIN.cpp | 3 +++ src/lcals/FIRST_MIN.hpp | 2 ++ src/lcals/FIRST_SUM.cpp | 2 ++ src/lcals/FIRST_SUM.hpp | 2 ++ src/lcals/GEN_LIN_RECUR.cpp | 4 ++++ src/lcals/GEN_LIN_RECUR.hpp | 2 ++ src/lcals/HYDRO_1D.cpp | 2 ++ src/lcals/HYDRO_1D.hpp | 2 ++ src/lcals/HYDRO_2D.cpp | 2 ++ src/lcals/HYDRO_2D.hpp | 2 ++ src/lcals/INT_PREDICT.cpp | 3 +++ src/lcals/INT_PREDICT.hpp | 2 ++ src/lcals/PLANCKIAN.cpp | 2 ++ src/lcals/PLANCKIAN.hpp | 2 ++ src/lcals/TRIDIAG_ELIM.cpp | 2 ++ src/lcals/TRIDIAG_ELIM.hpp | 2 ++ 22 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 13403d483..2c3133093 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -40,6 +40,9 @@ DIFF_PREDICT::DIFF_PREDICT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined(Kokkos_Lambda); + } DIFF_PREDICT::~DIFF_PREDICT() diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 23db5cacb..a98d6c042 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -59,7 +59,6 @@ px[i + offset * 8] = br; \ ar = cr - px[i + offset * 9]; \ px[i + offset * 9] = cr; \ - br = ar - px[i + offset * 10]; \ px[i + offset * 10] = ar; \ cr = br - px[i + offset * 11]; \ px[i + offset * 11] = br; \ @@ -88,6 +87,7 @@ class DIFF_PREDICT : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git 
a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 8d301d5a9..c0527c48e 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -40,6 +40,8 @@ EOS::EOS(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined(Kokkos_Lambda); } EOS::~EOS() diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 22773206e..75645f083 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -57,6 +57,7 @@ class EOS : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 27730add4..b7940c9c0 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -40,6 +40,8 @@ FIRST_DIFF::FIRST_DIFF(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } FIRST_DIFF::~FIRST_DIFF() diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index 6e08dcbcf..37353384a 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -47,6 +47,8 @@ class FIRST_DIFF : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); + void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index e0e5760fb..3881eae24 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -43,6 +43,9 @@ FIRST_MIN::FIRST_MIN(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } FIRST_MIN::~FIRST_MIN() diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index 0de442445..be0b33be5 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -76,6 +76,8 @@ 
class FIRST_MIN : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); + void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index c849efdf7..b2a6c9c98 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -40,6 +40,8 @@ FIRST_SUM::FIRST_SUM(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } FIRST_SUM::~FIRST_SUM() diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index 76b2c3552..a796f2192 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -50,6 +50,8 @@ class FIRST_SUM : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); + void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index 20ccfb3f8..5bb51fa18 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -40,6 +40,10 @@ GEN_LIN_RECUR::GEN_LIN_RECUR(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + + } GEN_LIN_RECUR::~GEN_LIN_RECUR() diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 97feb6c0e..1d6b202ae 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -71,6 +71,8 @@ class GEN_LIN_RECUR : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); + void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index 641b7d130..bfd38abc8 100644 --- a/src/lcals/HYDRO_1D.cpp +++ 
b/src/lcals/HYDRO_1D.cpp @@ -40,6 +40,8 @@ HYDRO_1D::HYDRO_1D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } HYDRO_1D::~HYDRO_1D() diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index 81dca1c82..43995caf9 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -52,6 +52,8 @@ class HYDRO_1D : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); + void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index 6bf082b70..4bca535db 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -46,6 +46,8 @@ HYDRO_2D::HYDRO_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } HYDRO_2D::~HYDRO_2D() diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index df8d10cfd..c7a97d434 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -148,6 +148,8 @@ class HYDRO_2D : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); + void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index b2b7450da..31f32c38c 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -40,6 +40,9 @@ INT_PREDICT::INT_PREDICT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } INT_PREDICT::~INT_PREDICT() diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index 8d93c12ae..7d7319cb5 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -67,6 +67,8 @@ class INT_PREDICT : public KernelBase void 
updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); + void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index 32ac1bd94..51cf9bbf2 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -40,6 +40,8 @@ PLANCKIAN::PLANCKIAN(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } PLANCKIAN::~PLANCKIAN() diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index 1252260d1..8777c3cf2 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -52,6 +52,8 @@ class PLANCKIAN : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); + void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index 342454303..f18dbe4d9 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -40,6 +40,8 @@ TRIDIAG_ELIM::TRIDIAG_ELIM(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } TRIDIAG_ELIM::~TRIDIAG_ELIM() diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index e8220df52..992e799ac 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -52,6 +52,8 @@ class TRIDIAG_ELIM : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); + void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); From 8d193ed58c9ad3a98164494f1c4e303753fc80d1 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 28 Jun 2021 13:04:33 -0600 Subject: [PATCH 088/124] stream-kokkos/ADD-Kokkos.cpp: fix typos --- 
src/stream-kokkos/ADD-Kokkos.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/stream-kokkos/ADD-Kokkos.cpp b/src/stream-kokkos/ADD-Kokkos.cpp index d833bacb3..48d5bd20a 100644 --- a/src/stream-kokkos/ADD-Kokkos.cpp +++ b/src/stream-kokkos/ADD-Kokkos.cpp @@ -110,10 +110,6 @@ void ADD::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -/* - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), add_lam); -*/ Kokkos::parallel_for("ADD_Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(Index_type i){ From bee13410c940de6b85e660713e65047ef30d4eb8 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 28 Jun 2021 13:06:19 -0600 Subject: [PATCH 089/124] lcals-kokkos: lcals kernel group kokkos implement --- src/lcals-kokkos/CMakeLists.txt | 26 ++ src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp | 186 +++++++++++++ src/lcals-kokkos/EOS-Kokkos.cpp | 141 ++++++++++ src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp | 140 ++++++++++ src/lcals-kokkos/FIRST_MIN-Kokkos.cpp | 169 ++++++++++++ src/lcals-kokkos/FIRST_SUM-Kokkos.cpp | 129 +++++++++ src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp | 179 ++++++++++++ src/lcals-kokkos/HYDRO_1D-Kokkos.cpp | 142 ++++++++++ src/lcals-kokkos/HYDRO_2D-Kokkos.cpp | 317 ++++++++++++++++++++++ src/lcals-kokkos/INT_PREDICT-Kokkos.cpp | 158 +++++++++++ src/lcals-kokkos/PLANCKIAN-Kokkos.cpp | 142 ++++++++++ src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp | 133 +++++++++ 12 files changed, 1862 insertions(+) create mode 100644 src/lcals-kokkos/CMakeLists.txt create mode 100644 src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp create mode 100644 src/lcals-kokkos/EOS-Kokkos.cpp create mode 100644 src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp create mode 100644 src/lcals-kokkos/FIRST_MIN-Kokkos.cpp create mode 100644 src/lcals-kokkos/FIRST_SUM-Kokkos.cpp create mode 100644 src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp create mode 100644 src/lcals-kokkos/HYDRO_1D-Kokkos.cpp create mode 100644 src/lcals-kokkos/HYDRO_2D-Kokkos.cpp 
create mode 100644 src/lcals-kokkos/INT_PREDICT-Kokkos.cpp create mode 100644 src/lcals-kokkos/PLANCKIAN-Kokkos.cpp create mode 100644 src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp diff --git a/src/lcals-kokkos/CMakeLists.txt b/src/lcals-kokkos/CMakeLists.txt new file mode 100644 index 000000000..b032d67e9 --- /dev/null +++ b/src/lcals-kokkos/CMakeLists.txt @@ -0,0 +1,26 @@ +############################################################################### +# Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + + +include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../lcals) + +blt_add_library( + NAME lcals-kokkos + SOURCES DIFF_PREDICT-Kokkos.cpp + EOS-Kokkos.cpp + FIRST_DIFF-Kokkos.cpp + FIRST_MIN-Kokkos.cpp + FIRST_SUM-Kokkos.cpp + GEN_LIN_RECUR-Kokkos.cpp + HYDRO_1D-Kokkos.cpp + HYDRO_2D-Kokkos.cpp + INT_PREDICT-Kokkos.cpp + PLANCKIAN-Kokkos.cpp + TRIDIAG_ELIM-Kokkos.cpp + DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} + ) diff --git a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp new file mode 100644 index 000000000..eefda95a4 --- /dev/null +++ b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp @@ -0,0 +1,186 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFF_PREDICT.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + +void DIFF_PREDICT::runKokkosVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + + DIFF_PREDICT_DATA_SETUP; + + // Instiating KokkosViews using getViewFromPointer; + // Wrapping pointers in KokkosViews + + // You need to know the actual array size here to catch errors; + // + auto px_view = getViewFromPointer(px, iend*14); + auto cx_view = getViewFromPointer(cx, iend*14); + + // NOTA BENE: in DIFF_PREDICT.hpp, this constant: + // const Index_type offset = m_offset; + + auto diffpredict_lam = [=](Index_type i) { + DIFF_PREDICT_BODY; + }; + + #if defined(RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + DIFF_PREDICT_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + diffpredict_lam(i); + } + + } + stopTimer(); + + break; + } +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), diffpredict_lam); + + } + stopTimer(); + + break; + } +*/ + +// Kokkos-ifying here: +// + case Kokkos_Lambda : { + + // Define ar, br cr because you are not using the DIFF_PREDICT_BODY + + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Kokkos::parallel_for("DIFF_PREDICT_Kokkos Kokkos_Lambda", +/* +(gdb) p offset +$1 = 100000 +(gdb) +$2 = 100000 +(gdb) p iend +$3 = 100000 +*/ + + Kokkos::RangePolicy(ibegin, iend), + 
KOKKOS_LAMBDA(Index_type i) { + // DIFF_PREDICT_BODY definition in + // DIFF_PREDICT.hpp: + /* + ar = cx[i + offset * 4]; \ + br = ar - px[i + offset * 4]; \ + px[i + offset * 4] = ar; \ + cr = br - px[i + offset * 5]; \ + px[i + offset * 5] = br; \ + ar = cr - px[i + offset * 6]; \ + px[i + offset * 6] = cr; \ + br = ar - px[i + offset * 7]; \ + px[i + offset * 7] = ar; \ + cr = br - px[i + offset * 8]; \ + px[i + offset * 8] = br; \ + ar = cr - px[i + offset * 9]; \ + px[i + offset * 9] = cr; \ + br = ar - px[i + offset * 10]; \ + px[i + offset * 10] = ar; \ + cr = br - px[i + offset * 11]; \ + px[i + offset * 11] = br; \ + px[i + offset * 13] = cr - px[i + offset * 12]; \ + px[i + offset * 12] = cr; + + */ + + Real_type ar, br, cr; + ar = cx_view[i + offset * 4]; \ + br = ar - px_view[i + offset * 4]; \ + px_view[i + offset * 4] = ar; \ + cr = br - px_view[i + offset * 5]; \ + px_view[i + offset * 5] = br; \ + ar = cr - px_view[i + offset * 6]; \ + px_view[i + offset * 6] = cr; \ + br = ar - px_view[i + offset * 7]; \ + px_view[i + offset * 7] = ar; \ + cr = br - px_view[i + offset * 8]; \ + px_view[i + offset * 8] = br; \ + ar = cr - px_view[i + offset * 9]; \ + px_view[i + offset * 9] = cr; \ + br = ar - px_view[i + offset * 10]; \ + px_view[i + offset * 10] = ar; \ + cr = br - px_view[i + offset * 11]; \ + px_view[i + offset * 11] = br; \ + px_view[i + offset * 13] = cr - px_view[i + offset * 12]; \ + px_view[i + offset * 12] = cr; + }); + + } + Kokkos::fence(); + stopTimer(); + + break; + } + + + default : { + std::cout << "\n DIFF_PREDICT : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(px, px_view, iend*14); + moveDataToHostFromKokkosView(cx, cx_view, iend*14); + +} + +} // end namespace lcals +} // end namespace rajaperf diff --git a/src/lcals-kokkos/EOS-Kokkos.cpp b/src/lcals-kokkos/EOS-Kokkos.cpp new file mode 100644 index 000000000..9fc824c83 --- /dev/null +++ 
b/src/lcals-kokkos/EOS-Kokkos.cpp @@ -0,0 +1,141 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "EOS.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + +void EOS::runKokkosVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + EOS_DATA_SETUP; + + auto x_view = getViewFromPointer(x, iend + 7); + auto y_view = getViewFromPointer(y, iend + 7); + auto z_view = getViewFromPointer(z, iend + 7); + auto u_view = getViewFromPointer(u, iend + 7); + + + auto eos_lam = [=](Index_type i) { + EOS_BODY; + }; + + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + EOS_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + eos_lam(i); + } + + } + stopTimer(); + + break; + } +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), eos_lam); + + } + stopTimer(); + + break; + } + +*/ + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("EOS_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + /* + #define EOS_BODY \ + x[i] = u[i] + r*( z[i] + r*y[i] ) + \ + t*( u[i+3] + r*( u[i+2] + r*u[i+1] ) + \ + t*( 
u[i+6] + q*( u[i+5] + q*u[i+4] ) ) ); + */ + // Declare variables need in the function + // body + //const Real_type q; + //const Real_type r; + //const Real_type t; + + x_view[i] = u_view[i] + r*( z_view[i] + r*y_view[i] ) + \ + t*( u_view[i+3] + r*( u_view[i+2] + r*u_view[i+1] ) + \ + t*( u_view[i+6] + q*( u_view[i+5] + q*u_view[i+4] ) ) ); + }); + + } + Kokkos::fence(); + stopTimer(); + + break; + } + + + + default : { + std::cout << "\n EOS : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(x, x_view, iend + 7); + moveDataToHostFromKokkosView(y, y_view, iend + 7); + moveDataToHostFromKokkosView(z, z_view, iend + 7); + moveDataToHostFromKokkosView(u, u_view, iend + 7); + + +} + +} // end namespace lcals +} // end namespace rajaperf diff --git a/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp new file mode 100644 index 000000000..68e312495 --- /dev/null +++ b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp @@ -0,0 +1,140 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIRST_DIFF.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + +// Kokkos-ification starts here: + +void FIRST_DIFF::runKokkosVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + FIRST_DIFF_DATA_SETUP; + +// From FIRST_DIFF.hpp +/* +#define FIRST_DIFF_DATA_SETUP \ + Real_ptr x = m_x; \ + Real_ptr y = m_y; + +*/ +// lcals = livermore compiler analysis loops suite + // Instiating KokkosViews using getViewFromPointer; + // Wrapping pointers in KokkosViews + +// attn: look at the definition in setup in FIRST_DIFF.cpp: + auto x_view = getViewFromPointer(x, iend + 1); + auto y_view = getViewFromPointer(y, iend + 1); + + auto firstdiff_lam = [=](Index_type i) { + FIRST_DIFF_BODY; + }; + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + FIRST_DIFF_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + firstdiff_lam(i); + } + + } + stopTimer(); + + break; + } + +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), firstdiff_lam); + + } + + stopTimer(); + + break; + } +*/ + + // Kokkos-ifying here: + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("FIRST_DIFF_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + /* #define FIRST_DIFF_BODY \ + x[i] = y[i+1] - y[i]; + */ + x_view[i] = y_view[i 
+ 1] - y_view[i]; + }); + + } + + Kokkos::fence(); + stopTimer(); + + break; + } + + default : { + std::cout << "\n FIRST_DIFF : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + // ATTN: View dimensions must match array dimensions! + moveDataToHostFromKokkosView(x, x_view, iend + 1); + moveDataToHostFromKokkosView(y, y_view, iend + 1); + + +} + +} // end namespace lcals +} // end namespace rajaperf diff --git a/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp new file mode 100644 index 000000000..73ea40504 --- /dev/null +++ b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp @@ -0,0 +1,169 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIRST_MIN.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + +void FIRST_MIN::runKokkosVariant(VariantID vid) + +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + FIRST_MIN_DATA_SETUP; + +// #define FIRST_MIN_DATA_SETUP \ +// Real_ptr x = m_x; + + + auto x_view = getViewFromPointer(x, iend); + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + FIRST_MIN_MINLOC_INIT; + + for (Index_type i = ibegin; i < iend; ++i ) { + FIRST_MIN_BODY; + } + + m_minloc = RAJA_MAX(m_minloc, mymin.loc); + + } + stopTimer(); + + break; + } + + case Lambda_Seq : { + + auto firstmin_base_lam = [=](Index_type i) -> Real_type { + return x[i]; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + FIRST_MIN_MINLOC_INIT; + + for 
(Index_type i = ibegin; i < iend; ++i ) { + if ( firstmin_base_lam(i) < mymin.val ) { \ + mymin.val = x[i]; \ + mymin.loc = i; \ + } + } + + m_minloc = RAJA_MAX(m_minloc, mymin.loc); + + } + stopTimer(); + + break; + } +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceMinLoc loc( + m_xmin_init, m_initloc); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + FIRST_MIN_BODY_RAJA; + }); + + m_minloc = RAJA_MAX(m_minloc, loc.getLoc()); + + } + stopTimer(); + + break; + } +*/ + + case Kokkos_Lambda : { + +// https://github.com/kokkos/kokkos/wiki/Kokkos::MinLoc +// A templated class: +// MinLoc::value_type result; +// parallel_reduce(N,Functor,MinLoc(result)); + + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // The 3rd template argument is the memory space in which the + // result will be stored; the result will be in the place the + // kernel is called from , i.e., the Host + using reducer_type = Kokkos::MinLoc; + // must hold the value and the location; + // Create a variable to hold the result from parallel_reduce + reducer_type::value_type min_result_obj; + + Kokkos::parallel_reduce("FIRST_MIN_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i, reducer_type::value_type& mymin) { + + // #define FIRST_MIN_BODY + // if ( x[i] < mymin.val ) { + // mymin.val = x[i]; + // mymin.loc = i; + // } + + if (x_view[i] < mymin.val) { + mymin.val = x_view[i]; + mymin.loc = i; + } + + // Kokkos knows how to handle a MinLoc type + }, reducer_type(min_result_obj)); + + + // Kokkos translation of line below is needed + // m_minloc = RAJA_MAX(m_minloc, loc.getLoc()); + m_minloc = min_result_obj.loc; + + } + Kokkos::fence(); + stopTimer(); + + break; + } + + default : { + std::cout << "\n FIRST_MIN : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + 
moveDataToHostFromKokkosView(x, x_view, iend); +} + +} // end namespace lcals +} // end namespace rajaperf diff --git a/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp new file mode 100644 index 000000000..653cac0d1 --- /dev/null +++ b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp @@ -0,0 +1,129 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIRST_SUM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + +void FIRST_SUM::runKokkosVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getRunSize(); + + FIRST_SUM_DATA_SETUP; + + // wrap pointers in Kokkos Views + auto x_view = getViewFromPointer(x, iend); + auto y_view = getViewFromPointer(y, iend); + + auto firstsum_lam = [=](Index_type i) { + FIRST_SUM_BODY; + }; + + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + FIRST_SUM_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + firstsum_lam(i); + } + + } + stopTimer(); + + break; + } + + +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), firstsum_lam); + + } + stopTimer(); + + break; + } + + */ + + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + for (RepIndex_type irep = 0; irep < 
run_reps; ++irep) { + + Kokkos::parallel_for("FIRST_SUM_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + //#define FIRST_SUM_BODY + //x[i] = y[i-1] + y[i]; + x_view[i] = y_view[i - 1] + y_view[i]; + }); + + } + + Kokkos::fence(); + stopTimer(); + + break; + } + + + default : { + std::cout << "\n FIRST_SUM : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(x, x_view, iend); + moveDataToHostFromKokkosView(y, y_view, iend); + + +} + +} // end namespace lcals +} // end namespace rajaperf diff --git a/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp new file mode 100644 index 000000000..9bf21a15c --- /dev/null +++ b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp @@ -0,0 +1,179 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "GEN_LIN_RECUR.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + +void GEN_LIN_RECUR::runKokkosVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getRunSize(); + + GEN_LIN_RECUR_DATA_SETUP; + +// wrap pointers in Kokkos Views + + auto b5_view = getViewFromPointer(b5, iend); + auto sa_view = getViewFromPointer(sa, iend); + auto sb_view = getViewFromPointer(sb, iend); + auto stb5_view = getViewFromPointer(stb5, iend); + +// RPS Lambdas + + auto genlinrecur_lam1 = [=](Index_type k) { + GEN_LIN_RECUR_BODY1; + }; + auto genlinrecur_lam2 = [=](Index_type i) { + GEN_LIN_RECUR_BODY2; + }; + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type k = 0; k < N; ++k ) { + GEN_LIN_RECUR_BODY1; + } + + for (Index_type i = 1; i < N+1; ++i ) { + GEN_LIN_RECUR_BODY2; + } + + } + stopTimer(); + + break; + } + + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type k = 0; k < N; ++k ) { + genlinrecur_lam1(k); + } + + for (Index_type i = 1; i < N+1; ++i ) { + genlinrecur_lam2(i); + } + + } + stopTimer(); + + break; + } +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(0, N), genlinrecur_lam1); + + + + + RAJA::forall( + RAJA::RangeSegment(1, N+1), genlinrecur_lam2); + + } + stopTimer(); + + break; + } +*/ + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // NOTA BENE: + // Index_type kb5i = m_kb5i; + // Index_type N = m_N; + + + Kokkos::parallel_for("GEN_LIN_RECUR_Kokkos Kokkos Lambda -- BODY1", + 
// RPS indices are (0, N) for BODY1 + Kokkos::RangePolicy(0, N), + KOKKOS_LAMBDA(Index_type k) { + /* + * #define GEN_LIN_RECUR_BODY1 + * b5[k+kb5i] = sa[k] + stb5[k]*sb[k]; + * stb5[k] = b5[k+kb5i] - stb5[k]; + * */ + b5_view[k+kb5i] = sa_view[k] + stb5_view[k]*sb_view[k]; + stb5_view[k] = b5_view[k+kb5i] - stb5_view[k]; + }); + + + + Kokkos::parallel_for("GEN_LIN_RECUR_Kokkos Kokkos Lambda -- BODY2", + // ATTN: you must adjust indices to align with + // RPS design intent here + // RPS indices are (1, N+1) for BODY2 + Kokkos::RangePolicy(1, N+1), + KOKKOS_LAMBDA(Index_type i) { + /* + #define GEN_LIN_RECUR_BODY2 \ + Index_type k = N - i ; \ + b5[k+kb5i] = sa[k] + stb5[k]*sb[k]; \ + stb5[k] = b5[k+kb5i] - stb5[k]; + */ + Index_type k = N - i ; + + b5_view[k+kb5i] = sa_view[k] + stb5_view[k]*sb_view[k]; + stb5_view[k] = b5_view[k+kb5i] - stb5_view[k]; + + }); + + } + + Kokkos::fence(); + stopTimer(); + + break; + } + + default : { + std::cout << "\n GEN_LIN_RECUR : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(b5, b5_view, iend); + moveDataToHostFromKokkosView(sa, sa_view, iend); + moveDataToHostFromKokkosView(sb, sb_view, iend); + moveDataToHostFromKokkosView(stb5, stb5_view, iend); + + +} + +} // end namespace lcals +} // end namespace rajaperf diff --git a/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp new file mode 100644 index 000000000..5b3c3a544 --- /dev/null +++ b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp @@ -0,0 +1,142 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HYDRO_1D.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + +void HYDRO_1D::runKokkosVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + HYDRO_1D_DATA_SETUP; + + // Wrap pointers in Kokkos Views + /* + * #define HYDRO_1D_DATA_SETUP \ + Real_ptr x = m_x; \ + Real_ptr y = m_y; \ + Real_ptr z = m_z; + */ + + auto x_view = getViewFromPointer(x, iend + 12); + auto y_view = getViewFromPointer(y, iend + 12); + auto z_view = getViewFromPointer(z, iend + 12); + + + auto hydro1d_lam = [=](Index_type i) { + HYDRO_1D_BODY; + }; + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + HYDRO_1D_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + hydro1d_lam(i); + } + + } + stopTimer(); + + break; + } +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), hydro1d_lam); + + } + stopTimer(); + + break; + } + + */ + + + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Kokkos::parallel_for("HYDRO_1D_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + // #define HYDRO_1D_BODY + // x[i] = q + y[i]*( r*z[i+10] + t*z[i+11] ); + KOKKOS_LAMBDA(Index_type i) { + x_view[i] = q + y_view[i]*( r*z_view[i+10] + t*z_view[i+11] ); + }); + + } + + Kokkos::fence(); + stopTimer(); + + break; + } + + + default : { + std::cout << "\n HYDRO_1D : Unknown variant id = 
" << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + // ATTN: Adjust arr dimensions to be congruent with the setup + // in the .cpp file: + // m_array_length = getRunSize() + 12; + + + moveDataToHostFromKokkosView(x, x_view, iend + 12); + moveDataToHostFromKokkosView(y, y_view, iend + 12); + moveDataToHostFromKokkosView(z, z_view, iend + 12); + +} + +} // end namespace lcals +} // end namespace rajaperf diff --git a/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp b/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp new file mode 100644 index 000000000..82cdf60ef --- /dev/null +++ b/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp @@ -0,0 +1,317 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HYDRO_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + +void HYDRO_2D::runKokkosVariant(VariantID vid) +{ + + const Index_type run_reps = getRunReps(); + const Index_type kbeg = 1; + const Index_type kend = m_kn - 1; + const Index_type jbeg = 1; + const Index_type jend = m_jn - 1; + + HYDRO_2D_DATA_SETUP; + + // Wrap input pointers in Kokkos::Views +/* +#define HYDRO_2D_DATA_SETUP \ + Real_ptr zadat = m_za; \ + Real_ptr zbdat = m_zb; \ + Real_ptr zmdat = m_zm; \ + Real_ptr zpdat = m_zp; \ + Real_ptr zqdat = m_zq; \ + Real_ptr zrdat = m_zr; \ + Real_ptr zudat = m_zu; \ + Real_ptr zvdat = m_zv; \ + Real_ptr zzdat = m_zz; \ +\ + Real_ptr zroutdat = m_zrout; \ + Real_ptr zzoutdat = m_zzout; \ +\ + +*/ +// ATTN: THESE ARE 2D Views: +// + auto zadat_view = getViewFromPointer(zadat, kn, jn ); + auto zbdat_view = getViewFromPointer(zbdat, kn, jn ); + auto zmdat_view = getViewFromPointer(zmdat, kn, jn ); + auto zpdat_view = 
getViewFromPointer(zpdat, kn, jn ); + auto zqdat_view = getViewFromPointer(zqdat, kn, jn ); + auto zrdat_view = getViewFromPointer(zrdat, kn, jn ); + auto zudat_view = getViewFromPointer(zudat, kn, jn ); + auto zvdat_view = getViewFromPointer(zvdat, kn, jn ); + auto zzdat_view = getViewFromPointer(zzdat, kn, jn ); + + // Wrap output pointers into Kokkos::Views + + auto zroutdat_view = getViewFromPointer(zroutdat, kn, jn ); + auto zzoutdat_view = getViewFromPointer(zzoutdat, kn, jn ); + +// Pre-processor directives +// +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type k = kbeg; k < kend; ++k ) { + for (Index_type j = jbeg; j < jend; ++j ) { + HYDRO_2D_BODY1; + } + } + + for (Index_type k = kbeg; k < kend; ++k ) { + for (Index_type j = jbeg; j < jend; ++j ) { + HYDRO_2D_BODY2; + } + } + + for (Index_type k = kbeg; k < kend; ++k ) { + for (Index_type j = jbeg; j < jend; ++j ) { + HYDRO_2D_BODY3; + } + } + + } + stopTimer(); + + break; + } + + case Lambda_Seq : { + + auto hydro2d_base_lam1 = [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY1; + }; + auto hydro2d_base_lam2 = [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY2; + }; + auto hydro2d_base_lam3 = [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY3; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type k = kbeg; k < kend; ++k ) { + for (Index_type j = jbeg; j < jend; ++j ) { + hydro2d_base_lam1(k, j); + } + } + + for (Index_type k = kbeg; k < kend; ++k ) { + for (Index_type j = jbeg; j < jend; ++j ) { + hydro2d_base_lam2(k, j); + } + } + + for (Index_type k = kbeg; k < kend; ++k ) { + for (Index_type j = jbeg; j < jend; ++j ) { + hydro2d_base_lam3(k, j); + } + } + + } + stopTimer(); + + break; + } +/* + case RAJA_Seq : { + + HYDRO_2D_VIEWS_RAJA; + + auto hydro2d_lam1 = [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY1_RAJA; + }; + auto hydro2d_lam2 
= [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY2_RAJA; + }; + auto hydro2d_lam3 = [=] (Index_type k, Index_type j) { + HYDRO_2D_BODY3_RAJA; + }; + + using EXECPOL = + RAJA::KernelPolicy< + RAJA::statement::For<0, RAJA::loop_exec, // k + RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::Lambda<0> + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( + RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), + RAJA::RangeSegment(jbeg, jend)), + hydro2d_lam1); + + RAJA::kernel( + RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), + RAJA::RangeSegment(jbeg, jend)), + hydro2d_lam2); + + RAJA::kernel( + RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), + RAJA::RangeSegment(jbeg, jend)), + hydro2d_lam3); + + } + stopTimer(); + + break; + } +*/ + + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + + // Use MDRangePolicy for multidimensional arrays + // https://github.com/kokkos/kokkos/wiki/Kokkos::MDRangePolicy + + Kokkos::parallel_for("HYDRO_2D_Kokkos Kokkos_Lambda--BODY1", + Kokkos::MDRangePolicy>({kbeg,jbeg}, {kend,jend}), + KOKKOS_LAMBDA(int64_t k, int64_t j) { + /* + #define HYDRO_2D_BODY1_RAJA \ + za(k,j) = ( zp(k+1,j-1) + zq(k+1,j-1) - zp(k,j-1) - zq(k,j-1) ) * \ + ( zr(k,j) + zr(k,j-1) ) / ( zm(k,j-1) + zm(k+1,j-1) ); \ + zb(k,j) = ( zp(k,j-1) + zq(k,j-1) - zp(k,j) - zq(k,j) ) * \ + ( zr(k,j) + zr(k-1,j) ) / ( zm(k,j) + zm(k,j-1)); + */ + zadat_view(k,j) = ( zpdat_view(k+1,j-1) + zqdat_view(k+1,j-1) - zpdat_view(k,j-1) - zqdat_view(k,j-1) ) * \ + ( zrdat_view(k,j) + zrdat_view(k,j-1) ) / ( zmdat_view(k,j-1) + zmdat_view(k+1,j-1) ); \ + + zbdat_view(k,j) = ( zpdat_view(k,j-1) + zqdat_view(k,j-1) - zpdat_view(k,j) - zqdat_view(k,j) ) * \ + ( zrdat_view(k,j) + zrdat_view(k-1,j) ) / ( zmdat_view(k,j) + zmdat_view(k,j-1)); + }); + + + Kokkos::parallel_for("HYDRO_2D_Kokkos Kokkos_Lambda--BODY2", + Kokkos::MDRangePolicy>({kbeg,jbeg}, 
{kend,jend}), + KOKKOS_LAMBDA(int64_t k, int64_t j) { + + /* + #define HYDRO_2D_BODY2_RAJA \ + zu(k,j) += s*( za(k,j) * ( zz(k,j) - zz(k,j+1) ) - \ + za(k,j-1) * ( zz(k,j) - zz(k,j-1) ) - \ + zb(k,j) * ( zz(k,j) - zz(k-1,j) ) + \ + zb(k+1,j) * ( zz(k,j) - zz(k+1,j) ) ); \ + zv(k,j) += s*( za(k,j) * ( zr(k,j) - zr(k,j+1) ) - \ + za(k,j-1) * ( zr(k,j) - zr(k,j-1) ) - \ + zb(k,j) * ( zr(k,j) - zr(k-1,j) ) + \ + zb(k+1,j) * ( zr(k,j) - zr(k+1,j) ) ); + */ + + zudat_view(k,j) += s*( zadat_view(k,j) * ( zzdat_view(k,j) - zzdat_view(k,j+1) ) - \ + zadat_view(k,j-1) * (zzdat_view(k,j) - zzdat_view(k,j-1) ) - \ + zbdat_view(k,j) * ( zzdat_view(k,j) - zzdat_view(k-1,j) ) + \ + zbdat_view(k+1,j) * ( zzdat_view(k,j) - zzdat_view(k+1,j) ) ); \ + zvdat_view(k,j) += s*( zadat_view(k,j) * ( zrdat_view(k,j) - zrdat_view(k,j+1) ) - \ + zadat_view(k,j-1) * ( zrdat_view(k,j) - zrdat_view(k,j-1) ) - \ + zbdat_view(k,j) * ( zrdat_view(k,j) - zrdat_view(k-1,j) ) + \ + zbdat_view(k+1,j) * ( zrdat_view(k,j) - zrdat_view(k+1,j) ) ); + + }); + + + Kokkos::parallel_for("HYDRO_2D_Kokkos Kokkos_Lambda--BODY3", + Kokkos::MDRangePolicy>({kbeg,jbeg}, {kend,jend}), + KOKKOS_LAMBDA(int64_t k, int64_t j) { + /* + #define HYDRO_2D_BODY3_RAJA \ + zrout(k,j) = zr(k,j) + t*zu(k,j); \ + zzout(k,j) = zz(k,j) + t*zv(k,j); + */ + + zroutdat_view(k,j) = zrdat_view(k,j) + t*zudat_view(k,j); \ + zzoutdat_view(k,j) = zzdat_view(k,j) + t*zvdat_view(k,j); + }); + + } + + Kokkos::fence(); + stopTimer(); + + break; + } + + + default : { + std::cout << "\n HYDRO_2D : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + + + // Wrap input pointers in Kokkos::Views +/* +#define HYDRO_2D_DATA_SETUP \ + Real_ptr zadat = m_za; \ + Real_ptr zbdat = m_zb; \ + Real_ptr zmdat = m_zm; \ + Real_ptr zpdat = m_zp; \ + Real_ptr zqdat = m_zq; \ + Real_ptr zrdat = m_zr; \ + Real_ptr zudat = m_zu; \ + Real_ptr zvdat = m_zv; \ + Real_ptr zzdat = m_zz; \ +\ + Real_ptr zroutdat = m_zrout; \ + Real_ptr 
zzoutdat = m_zzout; \ +\ + + +*/ + + + // There are 9 inputs: + moveDataToHostFromKokkosView(zadat, zadat_view, kn, jn); + moveDataToHostFromKokkosView(zbdat, zbdat_view, kn, jn); + moveDataToHostFromKokkosView(zmdat, zmdat_view, kn, jn); + moveDataToHostFromKokkosView(zpdat, zpdat_view, kn, jn); + moveDataToHostFromKokkosView(zqdat, zqdat_view, kn, jn); + moveDataToHostFromKokkosView(zrdat, zrdat_view, kn, jn); + moveDataToHostFromKokkosView(zudat, zudat_view, kn, jn); + moveDataToHostFromKokkosView(zvdat, zvdat_view, kn, jn); + moveDataToHostFromKokkosView(zzdat, zzdat_view, kn, jn); + + // There are 2 output views + moveDataToHostFromKokkosView(zroutdat, zroutdat_view, kn, jn); + moveDataToHostFromKokkosView(zzoutdat, zzoutdat_view, kn, jn); + +} + +} // end namespace lcals +} // end namespace rajaperf diff --git a/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp new file mode 100644 index 000000000..c9ef8c430 --- /dev/null +++ b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp @@ -0,0 +1,158 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INT_PREDICT.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + +void INT_PREDICT::runKokkosVariant(VariantID vid) +{ + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + INT_PREDICT_DATA_SETUP; + +/* + *#define INT_PREDICT_DATA_SETUP \ + Real_ptr px = m_px; \ + Real_type dm22 = m_dm22; \ + Real_type dm23 = m_dm23; \ + Real_type dm24 = m_dm24; \ + Real_type dm25 = m_dm25; \ + Real_type dm26 = m_dm26; \ + Real_type dm27 = m_dm27; \ + Real_type dm28 = m_dm28; \ + Real_type c0 = m_c0; \ + +*/ + + // Wrap pointer in Kokkos View + auto px_view = getViewFromPointer(px, iend*13); + + + auto intpredict_lam = [=](Index_type i) { + INT_PREDICT_BODY; + }; + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + INT_PREDICT_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + intpredict_lam(i); + } + + } + stopTimer(); + + break; + } +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), intpredict_lam); + + } + stopTimer(); + + break; + } +*/ + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // Declare variables in INT_PREDICT.hpp + Real_type dm22 = m_dm22; + Real_type dm23 = m_dm23; + Real_type dm24 = m_dm24; + Real_type dm25 = m_dm25; + Real_type dm26 = m_dm26; + Real_type dm27 = m_dm27; + Real_type dm28 = m_dm28; + + /* + #define INT_PREDICT_BODY \ + px[i] = dm28*px[i + 
offset * 12] + dm27*px[i + offset * 11] + \ + dm26*px[i + offset * 10] + dm25*px[i + offset * 9] + \ + dm24*px[i + offset * 8] + dm23*px[i + offset * 7] + \ + dm22*px[i + offset * 6] + \ + c0*( px[i + offset * 4] + px[i + offset * 5] ) + \ + px[i + offset * 2]; + */ + Kokkos::parallel_for("INT_PREDICT_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i){ + // #define INT_PREDICT_BODY + px_view[i] = dm28*px_view[i + offset * 12] + dm27*px_view[i + offset * 11] + \ + dm26*px_view[i + offset * 10] + dm25*px_view[i + offset * 9] + \ + dm24*px_view[i + offset * 8] + dm23*px_view[i + offset * 7] + \ + dm22*px_view[i + offset * 6] + \ + c0*( px_view[i + offset * 4] + px_view[i + offset * 5] ) + \ + px_view[i + offset * 2]; + }); + + } + Kokkos::fence(); + stopTimer(); + + break; + } + + + default : { + std::cout << "\n INT_PREDICT : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(px, px_view, iend*13); +} + +} // end namespace lcals +} // end namespace rajaperf diff --git a/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp new file mode 100644 index 000000000..5a850cae6 --- /dev/null +++ b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp @@ -0,0 +1,142 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "PLANCKIAN.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include + +namespace rajaperf +{ +namespace lcals +{ + + +void PLANCKIAN::runKokkosVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + PLANCKIAN_DATA_SETUP; + + /* +#define PLANCKIAN_DATA_SETUP \ + Real_ptr x = m_x; \ + Real_ptr y = m_y; \ + Real_ptr u = m_u; \ + Real_ptr v = m_v; \ + Real_ptr w = m_w; +*/ + + auto x_view = getViewFromPointer(x, iend); + auto y_view = getViewFromPointer(y, iend); + auto u_view = getViewFromPointer(u, iend); + auto v_view = getViewFromPointer(v, iend); + auto w_view = getViewFromPointer(w, iend); + + + auto planckian_lam = [=](Index_type i) { + PLANCKIAN_BODY; + }; + +# if defined (RUN_KOKKOS) + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + PLANCKIAN_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + planckian_lam(i); + } + + } + stopTimer(); + + break; + } +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), planckian_lam); + + } + stopTimer(); + + break; + } +*/ + + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Kokkos::parallel_for("PLANCKIAN_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i){ + /* #define PLANCKIAN_BODY \ + * y[i] = u[i] / v[i]; \ + * w[i] = x[i] / ( exp( y[i] ) - 1.0 ); + */ + y_view[i] = u_view[i] / v_view[i]; + w_view[i] = x_view[i] / ( exp( 
y_view[i] ) - 1.0 ); + }); + } + + Kokkos::fence(); + stopTimer(); + + break; + } + + default : { + std::cout << "\n PLANCKIAN : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(x, x_view, iend); + moveDataToHostFromKokkosView(y, y_view, iend); + moveDataToHostFromKokkosView(u, u_view, iend); + moveDataToHostFromKokkosView(v, v_view, iend); + moveDataToHostFromKokkosView(w, w_view, iend); + + + +} + +} // end namespace lcals +} // end namespace rajaperf diff --git a/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp b/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp new file mode 100644 index 000000000..92b6b46f4 --- /dev/null +++ b/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp @@ -0,0 +1,133 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIDIAG_ELIM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + +void TRIDIAG_ELIM::runKokkosVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = m_N; + + TRIDIAG_ELIM_DATA_SETUP; + +/* +#define TRIDIAG_ELIM_DATA_SETUP \ + Real_ptr xout = m_xout; \ + Real_ptr xin = m_xin; \ + Real_ptr y = m_y; \ + Real_ptr z = m_z; +*/ + + + auto xout_view = getViewFromPointer(xout, iend); + auto xin_view = getViewFromPointer(xin, iend); + auto y_view = getViewFromPointer(y, iend); + auto z_view = getViewFromPointer(z, iend); + + + auto tridiag_elim_lam = [=](Index_type i) { + TRIDIAG_ELIM_BODY; + }; + +#if defined(RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for 
(Index_type i = ibegin; i < iend; ++i ) { + TRIDIAG_ELIM_BODY; + } + + } + stopTimer(); + + break; + } + + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + tridiag_elim_lam(i); + } + + } + stopTimer(); + + break; + } +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), tridiag_elim_lam); + + } + stopTimer(); + + break; + } +*/ + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Kokkos::parallel_for("TRIDIAG_ELIM_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i){ + // #define TRIDIAG_ELIM_BODY + // xout[i] = z[i] * ( y[i] - xin[i-1] ); + xout_view[i] = z_view[i] * ( y_view[i] - xin_view[i-1] ); + }); + } + Kokkos::fence(); + stopTimer(); + + break; + } + + default : { + std::cout << "\n TRIDIAG_ELIM : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(xout, xout_view, iend); + moveDataToHostFromKokkosView(xin, xin_view, iend); + moveDataToHostFromKokkosView(y, y_view, iend); + moveDataToHostFromKokkosView(z, z_view, iend); + +} + +} // end namespace lcals +} // end namespace rajaperf From d90c1fa979fc37ce2c50c187a36c5aaaf2b8b483 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 29 Jun 2021 14:48:20 -0600 Subject: [PATCH 090/124] apps-kokkos: initial Kokkos-i-fication of apps kernels --- src/CMakeLists.txt | 61 ++++--- src/apps-kokkos/AppsData.cpp | 113 +++++++++++++ src/apps-kokkos/CMakeLists.txt | 24 +++ src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp | 112 +++++++++++++ src/apps-kokkos/ENERGY-Kokkos.cpp | 167 +++++++++++++++++++ src/apps-kokkos/FIR-Kokkos.cpp | 98 +++++++++++ src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp | 168 +++++++++++++++++++ src/apps-kokkos/LTIMES-Kokkos.cpp | 128 +++++++++++++++ 
src/apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp | 120 ++++++++++++++ src/apps-kokkos/PRESSURE-Kokkos.cpp | 111 +++++++++++++ src/apps-kokkos/VOL3D-Kokkos.cpp | 99 +++++++++++ src/apps-kokkos/WIP-COUPLE.cpp.kokkos.wip | 192 ++++++++++++++++++++++ src/common/RAJAPerfSuite.cpp | 13 +- 13 files changed, 1378 insertions(+), 28 deletions(-) create mode 100644 src/apps-kokkos/AppsData.cpp create mode 100644 src/apps-kokkos/CMakeLists.txt create mode 100644 src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp create mode 100644 src/apps-kokkos/ENERGY-Kokkos.cpp create mode 100644 src/apps-kokkos/FIR-Kokkos.cpp create mode 100644 src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp create mode 100644 src/apps-kokkos/LTIMES-Kokkos.cpp create mode 100644 src/apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp create mode 100644 src/apps-kokkos/PRESSURE-Kokkos.cpp create mode 100644 src/apps-kokkos/VOL3D-Kokkos.cpp create mode 100644 src/apps-kokkos/WIP-COUPLE.cpp.kokkos.wip diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 34f7b8be5..97b2f221f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,7 +9,8 @@ include_directories(.) 
add_subdirectory(common) -#add_subdirectory(apps) +add_subdirectory(apps) +add_subdirectory(apps-kokkos) add_subdirectory(basic) add_subdirectory(basic-kokkos) #add_subdirectory(kokkos-mechanics) @@ -22,7 +23,8 @@ add_subdirectory(stream-kokkos) set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS common - #apps + apps + apps-kokkos basic basic-kokkos #kokkos-mechanics @@ -41,29 +43,40 @@ include_directories(basic) blt_add_executable( NAME raja-perf-omptarget.exe SOURCES RAJAPerfSuiteDriver.cpp - #apps/AppsData.cpp - #apps/DEL_DOT_VEC_2D.cpp - #apps/DEL_DOT_VEC_2D-Seq.cpp - #apps/DEL_DOT_VEC_2D-OMPTarget.cpp - #apps/ENERGY.cpp - #apps/ENERGY-Seq.cpp - #apps/ENERGY-OMPTarget.cpp - #apps/FIR.cpp - #apps/FIR-Seq.cpp - #apps/FIR-OMPTarget.cpp - #apps/PRESSURE.cpp - #apps/PRESSURE-Seq.cpp - #apps/PRESSURE-OMPTarget.cpp - #apps/LTIMES.cpp - #apps/LTIMES-Seq.cpp - #apps/LTIMES-OMPTarget.cpp - #apps/LTIMES_NOVIEW.cpp - #apps/LTIMES_NOVIEW-Seq.cpp - #apps/LTIMES_NOVIEW-OMPTarget.cpp - #apps/VOL3D.cpp - #apps/VOL3D-Seq.cpp - #apps/VOL3D-OMPTarget.cpp + apps/AppsData.cpp + apps/DEL_DOT_VEC_2D.cpp + apps/DEL_DOT_VEC_2D-Seq.cpp + apps/DEL_DOT_VEC_2D-OMPTarget.cpp + apps/ENERGY.cpp + apps/ENERGY-Seq.cpp + apps/ENERGY-OMPTarget.cpp + apps/FIR.cpp + apps/FIR-Seq.cpp + apps/FIR-OMPTarget.cpp + apps/PRESSURE.cpp + apps/PRESSURE-Seq.cpp + apps/PRESSURE-OMPTarget.cpp + apps/LTIMES.cpp + apps/LTIMES-Seq.cpp + apps/LTIMES-OMPTarget.cpp + apps/LTIMES_NOVIEW.cpp + apps/LTIMES_NOVIEW-Seq.cpp + apps/LTIMES_NOVIEW-OMPTarget.cpp + apps/VOL3D.cpp + apps/VOL3D-Seq.cpp + apps/VOL3D-OMPTarget.cpp #apps/WIP-COUPLE.cpp + #Kokkos bloc + apps-kokkos/AppsData.cpp + apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp + apps-kokkos/ENERGY-Kokkos.cpp + apps-kokkos/FIR-Kokkos.cpp + apps-kokkos/HALOEXCHANGE-Kokkos.cpp + apps-kokkos/PRESSURE-Kokkos.cpp + apps-kokkos/LTIMES-Kokkos.cpp + apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp + apps-kokkos/VOL3D-Kokkos.cpp + #apps-kokkos/WIP-COUPLE.cpp basic/ATOMIC_PI.cpp basic/ATOMIC_PI-Seq.cpp 
basic/ATOMIC_PI-OMPTarget.cpp diff --git a/src/apps-kokkos/AppsData.cpp b/src/apps-kokkos/AppsData.cpp new file mode 100644 index 000000000..b3c042162 --- /dev/null +++ b/src/apps-kokkos/AppsData.cpp @@ -0,0 +1,113 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +// +// Set mesh positions for 2d mesh. +// +void setMeshPositions_2d(Real_ptr x, Real_type dx, + Real_ptr y, Real_type dy, + const ADomain& domain) +{ + if (domain.ndims != 2) { + std::cout << "\n******* ERROR!!! domain is not 2d *******" << std::endl; + return; + } + + Index_type imin = domain.imin; + Index_type imax = domain.imax; + Index_type jmin = domain.jmin; + Index_type jmax = domain.jmax; + + Index_type jp = domain.jp; + + Index_type npnl = domain.NPNL; + Index_type npnr = domain.NPNR; + + Real_ptr x1, x2, x3, x4; + Real_ptr y1, y2, y3, y4; + NDSET2D(domain.jp, x, x1,x2,x3,x4) ; + NDSET2D(domain.jp, y, y1,y2,y3,y4) ; + + for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { + for (Index_type i = imin - npnl; i < imax + npnr; i++) { + Index_type iz = i + j*jp ; + + x3[iz] = x4[iz] = i*dx; + x1[iz] = x2[iz] = (i+1)*dx; + + y1[iz] = y4[iz] = j*dy; + y2[iz] = y3[iz] = (j+1)*dy; + + } + } +} + + +// +// Set mesh positions for 2d mesh. +// +void setMeshPositions_3d(Real_ptr x, Real_type dx, + Real_ptr y, Real_type dy, + Real_ptr z, Real_type dz, + const ADomain& domain) +{ + if (domain.ndims != 3) { + std::cout << "\n******* ERROR!!! 
domain is not 3d *******" << std::endl; + return; + } + + Index_type imin = domain.imin; + Index_type imax = domain.imax; + Index_type jmin = domain.jmin; + Index_type jmax = domain.jmax; + Index_type kmin = domain.kmin; + Index_type kmax = domain.kmax; + + Index_type jp = domain.jp; + Index_type kp = domain.kp; + + Index_type npnl = domain.NPNL; + Index_type npnr = domain.NPNR; + + Real_ptr x0, x1, x2, x3, x4, x5, x6, x7; + Real_ptr y0, y1, y2, y3, y4, y5, y6, y7; + Real_ptr z0, z1, z2, z3, z4, z5, z6, z7; + NDPTRSET(domain.jp, domain.kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + NDPTRSET(domain.jp, domain.kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; + NDPTRSET(domain.jp, domain.kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; + + for (Index_type k = kmin - npnl; k < kmax + npnr; k++) { + for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { + for (Index_type i = imin - npnl; i < imax + npnr; i++) { + Index_type iz = i + j*jp + kp*k ; + + x0[iz] = x2[iz] = x4[iz] = x6[iz] = i*dx; + x1[iz] = x3[iz] = x5[iz] = x7[iz] = (i+1)*dx; + + y0[iz] = y1[iz] = y4[iz] = y5[iz] = j*dy; + y2[iz] = y3[iz] = y6[iz] = y7[iz] = (j+1)*dy; + + z0[iz] = z1[iz] = z2[iz] = z3[iz] = k*dz; + z4[iz] = z5[iz] = z6[iz] = z7[iz] = (k+1)*dz; + + } + } + } +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps-kokkos/CMakeLists.txt b/src/apps-kokkos/CMakeLists.txt new file mode 100644 index 000000000..832f8617d --- /dev/null +++ b/src/apps-kokkos/CMakeLists.txt @@ -0,0 +1,24 @@ +############################################################################### +# Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + + +include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../apps) + +blt_add_library( + NAME apps + SOURCES AppsData.cpp + DEL_DOT_VEC_2D-Kokkos.cpp + ENERGY-Kokkos.cpp + FIR-Kokkos.cpp + HALOEXCHANGE-Kokkos.cpp + LTIMES-Kokkos.cpp + LTIMES_NOVIEW-Kokkos.cpp + PRESSURE-Kokkos.cpp + VOL3D-Kokkos.cpp + DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} + ) diff --git a/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp b/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp new file mode 100644 index 000000000..83a4c7b3a --- /dev/null +++ b/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp @@ -0,0 +1,112 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DEL_DOT_VEC_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include "camp/resource.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void DEL_DOT_VEC_2D::runKokkosVariant(VariantID vid) +{ + //FIXME + return; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + DEL_DOT_VEC_2D_DATA_SETUP; + + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; + NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; + NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; + NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + DEL_DOT_VEC_2D_BODY_INDEX; + DEL_DOT_VEC_2D_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { 
+ + auto deldotvec2d_base_lam = [=](Index_type ii) { + DEL_DOT_VEC_2D_BODY_INDEX; + DEL_DOT_VEC_2D_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + deldotvec2d_base_lam(ii); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + camp::resources::Resource working_res{camp::resources::Host()}; + RAJA::TypedListSegment zones(m_domain->real_zones, + m_domain->n_real_zones, + working_res); + + auto deldotvec2d_lam = [=](Index_type i) { + DEL_DOT_VEC_2D_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall(zones, deldotvec2d_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps-kokkos/ENERGY-Kokkos.cpp b/src/apps-kokkos/ENERGY-Kokkos.cpp new file mode 100644 index 000000000..0b69cf129 --- /dev/null +++ b/src/apps-kokkos/ENERGY-Kokkos.cpp @@ -0,0 +1,167 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ENERGY.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void ENERGY::runKokkosVariant(VariantID vid) +{ + //FIXME + return; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + ENERGY_DATA_SETUP; + + auto energy_lam1 = [=](Index_type i) { + ENERGY_BODY1; + }; + auto energy_lam2 = [=](Index_type i) { + ENERGY_BODY2; + }; + auto energy_lam3 = [=](Index_type i) { + ENERGY_BODY3; + }; + auto energy_lam4 = [=](Index_type i) { + ENERGY_BODY4; + }; + auto energy_lam5 = [=](Index_type i) { + ENERGY_BODY5; + }; + auto energy_lam6 = [=](Index_type i) { + ENERGY_BODY6; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY1; + } + + for (Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY2; + } + + for (Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY3; + } + + for (Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY4; + } + + for (Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY5; + } + + for (Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY6; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + energy_lam1(i); + } + + for (Index_type i = ibegin; i < iend; ++i ) { + energy_lam2(i); + } + + for (Index_type i = ibegin; i < iend; ++i ) { + energy_lam3(i); + } + + for (Index_type i = ibegin; i < iend; ++i ) { + energy_lam4(i); + } + + for (Index_type i = ibegin; i < iend; ++i ) { + energy_lam5(i); + } + + for (Index_type i = ibegin; i < iend; ++i ) { + energy_lam6(i); + } + + } + stopTimer(); + + break; + } + + 
case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::region( [=]() { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), energy_lam1); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), energy_lam2); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), energy_lam3); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), energy_lam4); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), energy_lam5); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), energy_lam6); + + }); // end sequential region (for single-source code) + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n ENERGY : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps-kokkos/FIR-Kokkos.cpp b/src/apps-kokkos/FIR-Kokkos.cpp new file mode 100644 index 000000000..51d9d3e85 --- /dev/null +++ b/src/apps-kokkos/FIR-Kokkos.cpp @@ -0,0 +1,98 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIR.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include + +namespace rajaperf +{ +namespace apps +{ + + +void FIR::runKokkosVariant(VariantID vid) +{ + // FIXME + return; + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize() - m_coefflen; + + FIR_COEFF; + + FIR_DATA_SETUP; + + Real_type coeff[FIR_COEFFLEN]; + std::copy(std::begin(coeff_array), std::end(coeff_array), std::begin(coeff)); + + auto fir_lam = [=](Index_type i) { + FIR_BODY; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + FIR_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + fir_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), fir_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n FIR : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp b/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp new file mode 100644 index 000000000..70f4216a4 --- /dev/null +++ b/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp @@ -0,0 +1,168 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HALOEXCHANGE.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void HALOEXCHANGE::runKokkosVariant(VariantID vid) +{ + //FIXME + return; + + const Index_type run_reps = getRunReps(); + + HALOEXCHANGE_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_PACK_BODY; + } + buffer += len; + } + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + for (Index_type i = 0; i < len; i++) { + HALOEXCHANGE_UNPACK_BODY; + } + buffer += len; + } + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_pack_base_lam = [=](Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + for (Index_type i = 0; i < len; i++) { + haloexchange_pack_base_lam(i); + } + buffer += len; + } + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < 
num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_unpack_base_lam = [=](Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + for (Index_type i = 0; i < len; i++) { + haloexchange_unpack_base_lam(i); + } + buffer += len; + } + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + using EXEC_POL = RAJA::loop_exec; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_pack_base_lam = [=](Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_pack_base_lam ); + buffer += len; + } + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_unpack_base_lam = [=](Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + RAJA::forall( + RAJA::TypedRangeSegment(0, len), + haloexchange_unpack_base_lam ); + buffer += len; + } + } + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps-kokkos/LTIMES-Kokkos.cpp b/src/apps-kokkos/LTIMES-Kokkos.cpp new file mode 100644 index 000000000..39152d23d --- /dev/null +++ b/src/apps-kokkos/LTIMES-Kokkos.cpp @@ -0,0 +1,128 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void LTIMES::runKokkosVariant(VariantID vid) +{ + // FIXME + return; + + const Index_type run_reps = getRunReps(); + + LTIMES_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type z = 0; z < num_z; ++z ) { + for (Index_type g = 0; g < num_g; ++g ) { + for (Index_type m = 0; m < num_m; ++m ) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_BODY; + } + } + } + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto ltimes_base_lam = [=](Index_type d, Index_type z, + Index_type g, Index_type m) { + LTIMES_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type z = 0; z < num_z; ++z ) { + for (Index_type g = 0; g < num_g; ++g ) { + for (Index_type m = 0; m < num_m; ++m ) { + for (Index_type d = 0; d < num_d; ++d ) { + ltimes_base_lam(d, z, g, m); + } + } + } + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + LTIMES_VIEWS_RANGES_RAJA; + + auto ltimes_lam = [=](ID d, IZ z, IG g, IM m) { + LTIMES_BODY_RAJA; + }; + + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<1, RAJA::loop_exec, // z + RAJA::statement::For<2, RAJA::loop_exec, // g + RAJA::statement::For<3, RAJA::loop_exec, // m + RAJA::statement::For<0, RAJA::loop_exec, // d + RAJA::statement::Lambda<0> + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + ltimes_lam + ); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n LTIMES : Unknown variant id = " 
<< vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp b/src/apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp new file mode 100644 index 000000000..daa9cf46c --- /dev/null +++ b/src/apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp @@ -0,0 +1,120 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES_NOVIEW.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void LTIMES_NOVIEW::runKokkosVariant(VariantID vid) +{ + // FIXME + return; + const Index_type run_reps = getRunReps(); + + LTIMES_NOVIEW_DATA_SETUP; + + auto ltimesnoview_lam = [=](Index_type d, Index_type z, + Index_type g, Index_type m) { + LTIMES_NOVIEW_BODY; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type z = 0; z < num_z; ++z ) { + for (Index_type g = 0; g < num_g; ++g ) { + for (Index_type m = 0; m < num_m; ++m ) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_NOVIEW_BODY; + } + } + } + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type z = 0; z < num_z; ++z ) { + for (Index_type g = 0; g < num_g; ++g ) { + for (Index_type m = 0; m < num_m; ++m ) { + for (Index_type d = 0; d < num_d; ++d ) { + ltimesnoview_lam(d, z, g, m); + } + } + } + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<1, RAJA::loop_exec, // z + RAJA::statement::For<2, 
RAJA::loop_exec, // g + RAJA::statement::For<3, RAJA::loop_exec, // m + RAJA::statement::For<0, RAJA::loop_exec, // d + RAJA::statement::Lambda<0> + > + > + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, num_d), + RAJA::RangeSegment(0, num_z), + RAJA::RangeSegment(0, num_g), + RAJA::RangeSegment(0, num_m)), + ltimesnoview_lam + ); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n LTIMES_NOVIEW : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps-kokkos/PRESSURE-Kokkos.cpp b/src/apps-kokkos/PRESSURE-Kokkos.cpp new file mode 100644 index 000000000..ea09713b6 --- /dev/null +++ b/src/apps-kokkos/PRESSURE-Kokkos.cpp @@ -0,0 +1,111 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "PRESSURE.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void PRESSURE::runKokkosVariant(VariantID vid) +{ + // FIXME + return; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + PRESSURE_DATA_SETUP; + + auto pressure_lam1 = [=](Index_type i) { + PRESSURE_BODY1; + }; + auto pressure_lam2 = [=](Index_type i) { + PRESSURE_BODY2; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + PRESSURE_BODY1; + } + + for (Index_type i = ibegin; i < iend; ++i ) { + PRESSURE_BODY2; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + pressure_lam1(i); + } + + for (Index_type i = ibegin; i < iend; ++i ) { + pressure_lam2(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::region( [=]() { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), pressure_lam1); + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), pressure_lam2); + + }); // end sequential region (for single-source code) + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n PRESSURE : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps-kokkos/VOL3D-Kokkos.cpp b/src/apps-kokkos/VOL3D-Kokkos.cpp new file mode 100644 index 000000000..7194cce22 --- /dev/null +++ b/src/apps-kokkos/VOL3D-Kokkos.cpp @@ -0,0 +1,99 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "VOL3D.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void VOL3D::runKokkosVariant(VariantID vid) +{ + // FIXME + return; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = m_domain->fpz; + const Index_type iend = m_domain->lpz+1; + + VOL3D_DATA_SETUP; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; + NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; + + auto vol3d_lam = [=](Index_type i) { + VOL3D_BODY; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin ; i < iend ; ++i ) { + VOL3D_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin ; i < iend ; ++i ) { + vol3d_lam(i); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), vol3d_lam); + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n VOL3D : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps-kokkos/WIP-COUPLE.cpp.kokkos.wip b/src/apps-kokkos/WIP-COUPLE.cpp.kokkos.wip new file mode 100644 index 000000000..ed205e08e 
--- /dev/null +++ b/src/apps-kokkos/WIP-COUPLE.cpp.kokkos.wip @@ -0,0 +1,192 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "WIP-COUPLE.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +COUPLE::COUPLE(const RunParams& params) + : KernelBase(rajaperf::Apps_COUPLE, params) +{ + setDefaultSize(64); // See rzmax in ADomain struct + setDefaultReps(60); + + m_domain = new ADomain(getRunSize(), /* ndims = */ 3); + + m_imin = m_domain->imin; + m_imax = m_domain->imax; + m_jmin = m_domain->jmin; + m_jmax = m_domain->jmax; + m_kmin = m_domain->kmin; + m_kmax = m_domain->kmax; +} + +COUPLE::~COUPLE() +{ + delete m_domain; +} + +Index_type COUPLE::getItsPerRep() const +{ + return ( (m_imax - m_imin) * (m_jmax - m_jmin) * (m_kmax - m_kmin) ); +} + +void COUPLE::setUp(VariantID vid) +{ + Index_type max_loop_index = m_domain->lrn; + + allocAndInitData(m_t0, max_loop_index, vid); + allocAndInitData(m_t1, max_loop_index, vid); + allocAndInitData(m_t2, max_loop_index, vid); + allocAndInitData(m_denac, max_loop_index, vid); + allocAndInitData(m_denlw, max_loop_index, vid); + + m_clight = 3.e+10; + m_csound = 3.09e+7; + m_omega0 = 0.9; + m_omegar = 0.9; + m_dt = 0.208; + m_c10 = 0.25 * (m_clight / m_csound); + m_fratio = sqrt(m_omegar / m_omega0); + m_r_fratio = 1.0/m_fratio; + m_c20 = 0.25 * (m_clight / m_csound) * m_r_fratio; + m_ireal = Complex_type(0.0, 1.0); +} + +void COUPLE::runKernel(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + COUPLE_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type k = kmin ; k < kmax ; ++k ) { + COUPLE_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { + COUPLE_BODY; + }); + + } + stopTimer(); + + break; + } +#endif + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type k = kmin ; k < kmax ; ++k ) { + COUPLE_BODY; + } + + } + stopTimer(); + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { + COUPLE_BODY; + }); + + } + stopTimer(); + + break; + } +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) && 0 + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); + break; + } +#endif + +#if defined(RAJA_ENABLE_CUDA) && 0 + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); + break; + } +#endif + + default : { + std::cout << "\n COUPLE : Unknown variant id = " << vid << std::endl; + } + + } +} + +void COUPLE::updateChecksum(VariantID vid) +{ + Index_type max_loop_index = m_domain->lrn; + + checksum[vid] += calcChecksum(m_t0, max_loop_index); + checksum[vid] += calcChecksum(m_t1, max_loop_index); + checksum[vid] += calcChecksum(m_t2, max_loop_index); +} + +void COUPLE::tearDown(VariantID vid) +{ + (void) vid; + + deallocData(m_t0); + deallocData(m_t1); + deallocData(m_t2); + deallocData(m_denac); + deallocData(m_denlw); +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 7046c6b47..4b1238f17 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ 
-145,9 +145,9 @@ namespace rajaperf { free_register_kernel(exec, "Stream", new stream::DOT(run_params)); free_register_kernel(exec, "Stream", new stream::MUL(run_params)); free_register_kernel(exec, "Stream", new stream::TRIAD(run_params)); -/** // Apps - free_register_kernel(exec, "Apps", new apps::COUPLE(run_params)); + // Item below is a WIP from the RPS side + // free_register_kernel(exec, "Apps", new apps::COUPLE(run_params)); free_register_kernel(exec, "Apps", new apps::DEL_DOT_VEC_2D(run_params)); free_register_kernel(exec, "Apps", new apps::ENERGY(run_params)); free_register_kernel(exec, "Apps", new apps::FIR(run_params)); @@ -157,6 +157,7 @@ namespace rajaperf { free_register_kernel(exec, "Apps", new apps::PRESSURE(run_params)); free_register_kernel(exec, "Apps", new apps::VOL3D(run_params)); +/** // Algorithm free_register_kernel(exec, "Algorithm", new algorithm::SORT(run_params)); free_register_kernel(exec, "Algorithm", new algorithm::SORTPAIRS(run_params)); @@ -625,12 +626,16 @@ namespace rajaperf { break; } +//////////////////////////////////////////////////////////////// // Apps kernels... -// +/* case Apps_COUPLE : { kernel = new apps::COUPLE(run_params); break; } + + */ + case Apps_DEL_DOT_VEC_2D : { kernel = new apps::DEL_DOT_VEC_2D(run_params); break; @@ -666,7 +671,7 @@ namespace rajaperf { // // Algorithm kernels... 
-// +/* case Algorithm_SORT: { kernel = new algorithm::SORT(run_params); break; From 2a65ef70acd396beead619a1877f99eac4636dc2 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 19 Jul 2021 15:12:37 -0600 Subject: [PATCH 091/124] CMakeLists.txt: apps-kokkos fix up --- src/apps-kokkos/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apps-kokkos/CMakeLists.txt b/src/apps-kokkos/CMakeLists.txt index 832f8617d..daa85c881 100644 --- a/src/apps-kokkos/CMakeLists.txt +++ b/src/apps-kokkos/CMakeLists.txt @@ -10,7 +10,7 @@ include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../apps) blt_add_library( - NAME apps + NAME apps-kokkos SOURCES AppsData.cpp DEL_DOT_VEC_2D-Kokkos.cpp ENERGY-Kokkos.cpp From 4a24d9ad364a5e88974dd46d1cce04fb12b859bf Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 19 Jul 2021 15:13:42 -0600 Subject: [PATCH 092/124] apps header files: add runKokkosVariant --- src/apps/DEL_DOT_VEC_2D.hpp | 1 + src/apps/ENERGY.hpp | 2 ++ src/apps/FIR.hpp | 2 ++ src/apps/HALOEXCHANGE.hpp | 2 ++ src/apps/LTIMES.hpp | 2 ++ src/apps/LTIMES_NOVIEW.hpp | 2 ++ src/apps/PRESSURE.hpp | 2 ++ src/apps/VOL3D.hpp | 2 ++ src/apps/WIP-COUPLE.hpp | 2 ++ 9 files changed, 17 insertions(+) diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index abf9bfea1..fa878bc0f 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -115,6 +115,7 @@ class DEL_DOT_VEC_2D : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); private: Real_ptr m_x; diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index bb7eb6c9b..8d802ce68 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -204,6 +204,8 @@ class ENERGY : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Real_ptr m_e_new; Real_ptr m_e_old; 
diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index f8b46b3bd..e155b6c25 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -81,6 +81,8 @@ class FIR : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Real_ptr m_in; Real_ptr m_out; diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index dcfa367dc..2564bf4cf 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -94,6 +94,8 @@ class HALOEXCHANGE : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: static const int s_num_neighbors = 26; diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index cebece76b..23b5951a8 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -117,6 +117,8 @@ class LTIMES : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Real_ptr m_phidat; Real_ptr m_elldat; diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index fb015f7e4..beae673cf 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -67,6 +67,8 @@ class LTIMES_NOVIEW : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Real_ptr m_phidat; Real_ptr m_elldat; diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 2119a0c45..de27d0f58 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -73,6 +73,8 @@ class PRESSURE : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Real_ptr m_compression; Real_ptr m_bvc; diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index 7b1267081..549bf02fa 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ 
-172,6 +172,8 @@ class VOL3D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Real_ptr m_x; Real_ptr m_y; diff --git a/src/apps/WIP-COUPLE.hpp b/src/apps/WIP-COUPLE.hpp index 42ab11b59..e972cbfa8 100644 --- a/src/apps/WIP-COUPLE.hpp +++ b/src/apps/WIP-COUPLE.hpp @@ -174,6 +174,8 @@ class COUPLE : public KernelBase void runHipVariant(VariantID vid) {(void) vid;} void runOpenMPTargetVariant(VariantID vid) {(void) vid;} + void runKokkosVariant(VariantID vid) {(void) vid;} + private: Complex_ptr m_t0; Complex_ptr m_t1; From 061169686c58c1908e92210678a2905e945f5de4 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 19 Jul 2021 15:14:57 -0600 Subject: [PATCH 093/124] RAJAPerfSuite: fix up for apps kernels --- src/common/RAJAPerfSuite.cpp | 18 ++++++++---------- src/common/RAJAPerfSuite.hpp | 18 +++++++++--------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 4b1238f17..404d704bd 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -205,7 +205,6 @@ namespace rajaperf { static const std::string KernelNames[] = { -// // Basic kernels... // std::string("Basic_ATOMIC_PI"), @@ -234,7 +233,6 @@ namespace rajaperf { std::string("Lcals_PLANCKIAN"), std::string("Lcals_TRIDIAG_ELIM"), // -//// //// Polybench kernels... //// // std::string("Polybench_2MM"), @@ -263,14 +261,14 @@ namespace rajaperf { // Apps kernels... 
// // std::string("Apps_COUPLE"), -// std::string("Apps_DEL_DOT_VEC_2D"), -// std::string("Apps_ENERGY"), -// std::string("Apps_FIR"), -// std::string("Apps_HALOEXCHANGE"), -// std::string("Apps_LTIMES"), -// std::string("Apps_LTIMES_NOVIEW"), -// std::string("Apps_PRESSURE"), -// std::string("Apps_VOL3D"), + std::string("Apps_DEL_DOT_VEC_2D"), + std::string("Apps_ENERGY"), + std::string("Apps_FIR"), + std::string("Apps_HALOEXCHANGE"), + std::string("Apps_LTIMES"), + std::string("Apps_LTIMES_NOVIEW"), + std::string("Apps_PRESSURE"), + std::string("Apps_VOL3D"), // // Algorithm kernels... diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 9c3e45aba..e34fbeff9 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -373,15 +373,15 @@ enum KernelID { // // Apps kernels... // -//Apps_COUPLE, -//Apps_DEL_DOT_VEC_2D, -//Apps_ENERGY, -//Apps_FIR, -//Apps_HALOEXCHANGE, -//Apps_LTIMES, -//Apps_LTIMES_NOVIEW, -//Apps_PRESSURE, -//Apps_VOL3D, +// Apps_COUPLE, +Apps_DEL_DOT_VEC_2D, +Apps_ENERGY, +Apps_FIR, +Apps_HALOEXCHANGE, +Apps_LTIMES, +Apps_LTIMES_NOVIEW, +Apps_PRESSURE, +Apps_VOL3D, // // Algorithm kernels... 
From fa7f247db2b03480b6366b571f58327f4f0250b2 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 19 Jul 2021 15:18:52 -0600 Subject: [PATCH 094/124] WIP-COUPLE.cpp: temp fix comment body --- src/apps/WIP-COUPLE.cpp | 334 ++++++++++++++++++++-------------------- 1 file changed, 168 insertions(+), 166 deletions(-) diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index ed205e08e..34c1fb255 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -20,173 +20,175 @@ namespace rajaperf namespace apps { +// +//COUPLE::COUPLE(const RunParams& params) +// : KernelBase(rajaperf::Apps_COUPLE, params) +//{ +// +// setDefaultSize(64); // See rzmax in ADomain struct +// setDefaultReps(60); +// +// m_domain = new ADomain(getRunSize(), /* ndims = */ 3); +// +// m_imin = m_domain->imin; +// m_imax = m_domain->imax; +// m_jmin = m_domain->jmin; +// m_jmax = m_domain->jmax; +// m_kmin = m_domain->kmin; +// m_kmax = m_domain->kmax; +//} +// -COUPLE::COUPLE(const RunParams& params) - : KernelBase(rajaperf::Apps_COUPLE, params) -{ - setDefaultSize(64); // See rzmax in ADomain struct - setDefaultReps(60); - - m_domain = new ADomain(getRunSize(), /* ndims = */ 3); - - m_imin = m_domain->imin; - m_imax = m_domain->imax; - m_jmin = m_domain->jmin; - m_jmax = m_domain->jmax; - m_kmin = m_domain->kmin; - m_kmax = m_domain->kmax; -} - -COUPLE::~COUPLE() -{ - delete m_domain; -} - -Index_type COUPLE::getItsPerRep() const -{ - return ( (m_imax - m_imin) * (m_jmax - m_jmin) * (m_kmax - m_kmin) ); -} - -void COUPLE::setUp(VariantID vid) -{ - Index_type max_loop_index = m_domain->lrn; - - allocAndInitData(m_t0, max_loop_index, vid); - allocAndInitData(m_t1, max_loop_index, vid); - allocAndInitData(m_t2, max_loop_index, vid); - allocAndInitData(m_denac, max_loop_index, vid); - allocAndInitData(m_denlw, max_loop_index, vid); - - m_clight = 3.e+10; - m_csound = 3.09e+7; - m_omega0 = 0.9; - m_omegar = 0.9; - m_dt = 0.208; - m_c10 = 0.25 * (m_clight / m_csound); - m_fratio = 
sqrt(m_omegar / m_omega0); - m_r_fratio = 1.0/m_fratio; - m_c20 = 0.25 * (m_clight / m_csound) * m_r_fratio; - m_ireal = Complex_type(0.0, 1.0); -} - -void COUPLE::runKernel(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - - COUPLE_DATA_SETUP; - - switch ( vid ) { - - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type k = kmin ; k < kmax ; ++k ) { - COUPLE_BODY; - } - - } - stopTimer(); - - break; - } - -#if defined(RUN_RAJA_SEQ) - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { - COUPLE_BODY; - }); - - } - stopTimer(); - - break; - } -#endif - -#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) - case Base_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp parallel for - for (Index_type k = kmin ; k < kmax ; ++k ) { - COUPLE_BODY; - } - - } - stopTimer(); - break; - } - - case RAJA_OpenMP : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { - COUPLE_BODY; - }); - - } - stopTimer(); - - break; - } -#endif - -#if defined(RAJA_ENABLE_TARGET_OPENMP) && 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : - { - runOpenMPTargetVariant(vid); - break; - } -#endif - -#if defined(RAJA_ENABLE_CUDA) && 0 - case Base_CUDA : - case RAJA_CUDA : - { - runCudaVariant(vid); - break; - } -#endif - - default : { - std::cout << "\n COUPLE : Unknown variant id = " << vid << std::endl; - } - - } -} - -void COUPLE::updateChecksum(VariantID vid) -{ - Index_type max_loop_index = m_domain->lrn; - - checksum[vid] += calcChecksum(m_t0, max_loop_index); - checksum[vid] += calcChecksum(m_t1, max_loop_index); - checksum[vid] += calcChecksum(m_t2, max_loop_index); -} - -void COUPLE::tearDown(VariantID vid) -{ - (void) vid; - - deallocData(m_t0); - 
deallocData(m_t1); - deallocData(m_t2); - deallocData(m_denac); - deallocData(m_denlw); -} +//COUPLE::~COUPLE() +//{ +// delete m_domain; +//} +// +//Index_type COUPLE::getItsPerRep() const +//{ +// return ( (m_imax - m_imin) * (m_jmax - m_jmin) * (m_kmax - m_kmin) ); +//} +// +//void COUPLE::setUp(VariantID vid) +//{ +// Index_type max_loop_index = m_domain->lrn; +// +// allocAndInitData(m_t0, max_loop_index, vid); +// allocAndInitData(m_t1, max_loop_index, vid); +// allocAndInitData(m_t2, max_loop_index, vid); +// allocAndInitData(m_denac, max_loop_index, vid); +// allocAndInitData(m_denlw, max_loop_index, vid); +// +// m_clight = 3.e+10; +// m_csound = 3.09e+7; +// m_omega0 = 0.9; +// m_omegar = 0.9; +// m_dt = 0.208; +// m_c10 = 0.25 * (m_clight / m_csound); +// m_fratio = sqrt(m_omegar / m_omega0); +// m_r_fratio = 1.0/m_fratio; +// m_c20 = 0.25 * (m_clight / m_csound) * m_r_fratio; +// m_ireal = Complex_type(0.0, 1.0); +//} +// +//void COUPLE::runKernel(VariantID vid) +//{ +// const Index_type run_reps = getRunReps(); +// +// COUPLE_DATA_SETUP; +// +// switch ( vid ) { +// +// case Base_Seq : { +// +// startTimer(); +// for (RepIndex_type irep = 0; irep < run_reps; ++irep) { +// +// for (Index_type k = kmin ; k < kmax ; ++k ) { +// COUPLE_BODY; +// } +// +// } +// stopTimer(); +// +// break; +// } +// +//#if defined(RUN_RAJA_SEQ) +// case RAJA_Seq : { +// +// startTimer(); +// for (RepIndex_type irep = 0; irep < run_reps; ++irep) { +// +// RAJA::forall( +// RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { +// COUPLE_BODY; +// }); +// +// } +// stopTimer(); +// +// break; +// } +//#endif +// +//#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) +// case Base_OpenMP : { +// +// startTimer(); +// for (RepIndex_type irep = 0; irep < run_reps; ++irep) { +// +// #pragma omp parallel for +// for (Index_type k = kmin ; k < kmax ; ++k ) { +// COUPLE_BODY; +// } +// +// } +// stopTimer(); +// break; +// } +// +// case RAJA_OpenMP : { +// +// startTimer(); +// 
for (RepIndex_type irep = 0; irep < run_reps; ++irep) { +// +// RAJA::forall( +// RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { +// COUPLE_BODY; +// }); +// +// } +// stopTimer(); +// +// break; +// } +//#endif +// +//#if defined(RAJA_ENABLE_TARGET_OPENMP) && 0 +// case Base_OpenMPTarget : +// case RAJA_OpenMPTarget : +// { +// runOpenMPTargetVariant(vid); +// break; +// } +//#endif +// +//#if defined(RAJA_ENABLE_CUDA) && 0 +// case Base_CUDA : +// case RAJA_CUDA : +// { +// runCudaVariant(vid); +// break; +// } +//#endif +// +// default : { +// std::cout << "\n COUPLE : Unknown variant id = " << vid << std::endl; +// } +// +// } +//} +// +//void COUPLE::updateChecksum(VariantID vid) +//{ +// Index_type max_loop_index = m_domain->lrn; +// +// checksum[vid] += calcChecksum(m_t0, max_loop_index); +// checksum[vid] += calcChecksum(m_t1, max_loop_index); +// checksum[vid] += calcChecksum(m_t2, max_loop_index); +//} +// +//void COUPLE::tearDown(VariantID vid) +//{ +// (void) vid; +// +// deallocData(m_t0); +// deallocData(m_t1); +// deallocData(m_t2); +// deallocData(m_denac); +// deallocData(m_denlw); +//} } // end namespace apps } // end namespace rajaperf From a8a7553e31a6c1701290e6700089ec342e464c85 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Tue, 20 Jul 2021 11:27:07 -0600 Subject: [PATCH 095/124] Attempt at merge conflict resolution --- src/apps-kokkos/CMakeLists.txt | 2 +- src/apps-kokkos/ENERGY-Kokkos.cpp | 2 +- src/apps-kokkos/FIR-Kokkos.cpp | 2 +- src/apps-kokkos/PRESSURE-Kokkos.cpp | 2 +- src/apps/DEL_DOT_VEC_2D.hpp | 1 + src/apps/ENERGY.hpp | 1 + src/apps/FIR.hpp | 1 + src/apps/HALOEXCHANGE.hpp | 1 + src/apps/LTIMES.hpp | 1 + src/apps/LTIMES_NOVIEW.hpp | 1 + src/apps/PRESSURE.hpp | 1 + src/apps/VOL3D.hpp | 1 + src/apps/WIP-COUPLE.hpp | 1 + src/basic-kokkos/CMakeLists.txt | 2 +- src/basic-kokkos/DAXPY-Kokkos.cpp | 2 +- src/basic-kokkos/IF_QUAD-Kokkos.cpp | 2 +- src/basic-kokkos/INIT3-Kokkos.cpp | 2 +- src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp | 2 
+- .../INIT_VIEW1D_OFFSET-Kokkos.cpp | 2 +- src/basic-kokkos/MULADDSUB-Kokkos.cpp | 2 +- ...MIC_PI-Kokkos.cpp => PI_ATOMIC-Kokkos.cpp} | 20 +- src/basic-kokkos/REDUCE3_INT-Kokkos.cpp | 2 +- src/basic-kokkos/TRAP_INT-Kokkos.cpp | 2 +- src/common/Executor.cpp | 1518 ++++++++--------- src/common/KernelBase.cpp | 4 +- src/common/QuickKernelBase.hpp | 6 +- src/common/RAJAPerfSuite.cpp | 503 +++--- src/common/RAJAPerfSuite.hpp | 3 - src/common/RunParams.cpp | 80 +- src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp | 2 +- src/lcals-kokkos/EOS-Kokkos.cpp | 2 +- src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp | 2 +- src/lcals-kokkos/FIRST_MIN-Kokkos.cpp | 2 +- src/lcals-kokkos/FIRST_SUM-Kokkos.cpp | 2 +- src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp | 2 +- src/lcals-kokkos/HYDRO_1D-Kokkos.cpp | 4 +- src/lcals-kokkos/INT_PREDICT-Kokkos.cpp | 2 +- src/lcals-kokkos/PLANCKIAN-Kokkos.cpp | 2 +- src/stream-kokkos/ADD-Kokkos.cpp | 4 +- src/stream-kokkos/COPY-Kokkos.cpp | 4 +- src/stream-kokkos/DOT-Kokkos.cpp | 4 +- src/stream-kokkos/MUL-Kokkos.cpp | 4 +- src/stream-kokkos/TRIAD-Kokkos.cpp | 4 +- 43 files changed, 1024 insertions(+), 1185 deletions(-) rename src/basic-kokkos/{ATOMIC_PI-Kokkos.cpp => PI_ATOMIC-Kokkos.cpp} (87%) diff --git a/src/apps-kokkos/CMakeLists.txt b/src/apps-kokkos/CMakeLists.txt index 832f8617d..daa85c881 100644 --- a/src/apps-kokkos/CMakeLists.txt +++ b/src/apps-kokkos/CMakeLists.txt @@ -10,7 +10,7 @@ include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../apps) blt_add_library( - NAME apps + NAME apps-kokkos SOURCES AppsData.cpp DEL_DOT_VEC_2D-Kokkos.cpp ENERGY-Kokkos.cpp diff --git a/src/apps-kokkos/ENERGY-Kokkos.cpp b/src/apps-kokkos/ENERGY-Kokkos.cpp index 0b69cf129..39a2f64e4 100644 --- a/src/apps-kokkos/ENERGY-Kokkos.cpp +++ b/src/apps-kokkos/ENERGY-Kokkos.cpp @@ -25,7 +25,7 @@ void ENERGY::runKokkosVariant(VariantID vid) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = 
getActualProblemSize(); ENERGY_DATA_SETUP; diff --git a/src/apps-kokkos/FIR-Kokkos.cpp b/src/apps-kokkos/FIR-Kokkos.cpp index 51d9d3e85..322bd0210 100644 --- a/src/apps-kokkos/FIR-Kokkos.cpp +++ b/src/apps-kokkos/FIR-Kokkos.cpp @@ -25,7 +25,7 @@ void FIR::runKokkosVariant(VariantID vid) return; const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize() - m_coefflen; + const Index_type iend = getActualProblemSize() - m_coefflen; FIR_COEFF; diff --git a/src/apps-kokkos/PRESSURE-Kokkos.cpp b/src/apps-kokkos/PRESSURE-Kokkos.cpp index ea09713b6..c3b7cad38 100644 --- a/src/apps-kokkos/PRESSURE-Kokkos.cpp +++ b/src/apps-kokkos/PRESSURE-Kokkos.cpp @@ -25,7 +25,7 @@ void PRESSURE::runKokkosVariant(VariantID vid) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); PRESSURE_DATA_SETUP; diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index 5b5b77dcb..4968590df 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -113,6 +113,7 @@ class DEL_DOT_VEC_2D : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); private: Real_ptr m_x; diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index d7e53bf22..6025b0761 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -203,6 +203,7 @@ class ENERGY : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); private: Real_ptr m_e_new; diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index fa900d7ac..51c234702 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -78,6 +78,7 @@ class FIR : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void 
runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); private: Real_ptr m_in; diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index 1d3739d5e..7d246ccdc 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -93,6 +93,7 @@ class HALOEXCHANGE : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); private: static const int s_num_neighbors = 26; diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index 18707982e..919fb4d9b 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -116,6 +116,7 @@ class LTIMES : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); private: Real_ptr m_phidat; diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 596cdd1d1..d05606f7c 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -66,6 +66,7 @@ class LTIMES_NOVIEW : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); private: Real_ptr m_phidat; diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 927aaafdd..13c2cf840 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -72,6 +72,7 @@ class PRESSURE : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); private: Real_ptr m_compression; diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index edabcfe0d..75fc75f98 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -169,6 +169,7 @@ class VOL3D : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void 
runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); private: Real_ptr m_x; diff --git a/src/apps/WIP-COUPLE.hpp b/src/apps/WIP-COUPLE.hpp index ae374a848..8136e5f2f 100644 --- a/src/apps/WIP-COUPLE.hpp +++ b/src/apps/WIP-COUPLE.hpp @@ -171,6 +171,7 @@ class COUPLE : public KernelBase void runCudaVariant(VariantID vid) {(void) vid;} void runHipVariant(VariantID vid) {(void) vid;} void runOpenMPTargetVariant(VariantID vid) {(void) vid;} + void runKokkosVariant(VariantID vid); private: Complex_ptr m_t0; diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index 9a0f9a7bd..6744c3662 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -11,7 +11,7 @@ include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../basic) blt_add_library( NAME basic-kokkos SOURCES - ATOMIC_PI-Kokkos.cpp + PI_ATOMIC-Kokkos.cpp DAXPY-Kokkos.cpp IF_QUAD-Kokkos.cpp INIT3-Kokkos.cpp diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp index fd6569200..b21e0ce16 100644 --- a/src/basic-kokkos/DAXPY-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY-Kokkos.cpp @@ -29,7 +29,7 @@ void DAXPY::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); DAXPY_DATA_SETUP; diff --git a/src/basic-kokkos/IF_QUAD-Kokkos.cpp b/src/basic-kokkos/IF_QUAD-Kokkos.cpp index b515d67e5..a1d25d3fe 100644 --- a/src/basic-kokkos/IF_QUAD-Kokkos.cpp +++ b/src/basic-kokkos/IF_QUAD-Kokkos.cpp @@ -25,7 +25,7 @@ void IF_QUAD::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); IF_QUAD_DATA_SETUP; diff --git a/src/basic-kokkos/INIT3-Kokkos.cpp b/src/basic-kokkos/INIT3-Kokkos.cpp index 9a13476da..1ceb5174c 100644 --- a/src/basic-kokkos/INIT3-Kokkos.cpp +++ 
b/src/basic-kokkos/INIT3-Kokkos.cpp @@ -22,7 +22,7 @@ void INIT3::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); INIT3_DATA_SETUP; diff --git a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp index efba110c1..7afb010ee 100644 --- a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp @@ -22,7 +22,7 @@ void INIT_VIEW1D::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); INIT_VIEW1D_DATA_SETUP; diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp index 5f010597b..d47ed1462 100644 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp @@ -23,7 +23,7 @@ void INIT_VIEW1D_OFFSET::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; - const Index_type iend = getRunSize()+1; + const Index_type iend = getActualProblemSize()+1; INIT_VIEW1D_OFFSET_DATA_SETUP; diff --git a/src/basic-kokkos/MULADDSUB-Kokkos.cpp b/src/basic-kokkos/MULADDSUB-Kokkos.cpp index 9efcc2c39..edb54ae3b 100644 --- a/src/basic-kokkos/MULADDSUB-Kokkos.cpp +++ b/src/basic-kokkos/MULADDSUB-Kokkos.cpp @@ -22,7 +22,7 @@ void MULADDSUB::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); MULADDSUB_DATA_SETUP; diff --git a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp b/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp similarity index 87% rename from src/basic-kokkos/ATOMIC_PI-Kokkos.cpp rename to src/basic-kokkos/PI_ATOMIC-Kokkos.cpp index 58cccdf06..8bb5b4bad 100644 --- 
a/src/basic-kokkos/ATOMIC_PI-Kokkos.cpp +++ b/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#include "ATOMIC_PI.hpp" +#include "PI_ATOMIC.hpp" #include "RAJA/RAJA.hpp" @@ -15,14 +15,14 @@ namespace rajaperf { namespace basic { -void ATOMIC_PI::runKokkosVariant(VariantID vid) { +void PI_ATOMIC::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); - ATOMIC_PI_DATA_SETUP; + PI_ATOMIC_DATA_SETUP; - // Declare Kokkos View that will wrap the pointer defined in ATOMIC_PI.hpp + // Declare Kokkos View that will wrap the pointer defined in PI_ATOMIC.hpp auto pi_view = getViewFromPointer(pi, 1); #if defined(RUN_KOKKOS) @@ -74,7 +74,7 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - // Here, making a pointer of pi defined in ATOMIC_PI.hpp; we will use a + // Here, making a pointer of pi defined in PI_ATOMIC.hpp; we will use a // KokkosView instead // *pi = m_pi_init; // RAJA::forall( RAJA::RangeSegment(ibegin, iend), @@ -94,11 +94,11 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) { pi_view = getViewFromPointer(pi, 1); Kokkos::parallel_for( - "ATOMIC_PI-Kokkos Kokkos_Lambda", + "PI_ATOMIC-Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(Index_type i) { - // Original ATOMIC_PI kernel reference implementation - // defined in ATOMIC_PI.hpp + // Original PI_ATOMIC kernel reference implementation + // defined in PI_ATOMIC.hpp double x = (double(i) + 0.5) * dx; // Make a reference to the 0th element of a 1D view with one // element @@ -124,7 +124,7 @@ void ATOMIC_PI::runKokkosVariant(VariantID vid) { } default: { - std::cout << "\n ATOMIC_PI : Unknown variant id = " << vid << std::endl; + std::cout << "\n 
PI_ATOMIC : Unknown variant id = " << vid << std::endl; } } #endif // RUN_KOKKOS diff --git a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp index a37f36036..4f340a919 100644 --- a/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp +++ b/src/basic-kokkos/REDUCE3_INT-Kokkos.cpp @@ -23,7 +23,7 @@ void REDUCE3_INT::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); REDUCE3_INT_DATA_SETUP; diff --git a/src/basic-kokkos/TRAP_INT-Kokkos.cpp b/src/basic-kokkos/TRAP_INT-Kokkos.cpp index bf882da0d..45e822015 100644 --- a/src/basic-kokkos/TRAP_INT-Kokkos.cpp +++ b/src/basic-kokkos/TRAP_INT-Kokkos.cpp @@ -38,7 +38,7 @@ void TRAP_INT::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); TRAP_INT_DATA_SETUP; diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 9e5b384b6..f57fa62d0 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - +// status: functions copied over, need to reconcile #include "Executor.hpp" #include "common/KernelBase.hpp" @@ -15,7 +15,6 @@ // Warmup kernel to run first to remove startup overheads in timings #include "basic/DAXPY.hpp" -// Standard library includes #include #include #include @@ -31,432 +30,245 @@ namespace rajaperf { - using namespace std; - - Executor::Executor(int argc, char **argv) - : run_params(argc, argv), - reference_vid(NumVariants) { - } - -/* - * https://www.delftstack.com/howto/cpp/cpp-tilde-operator/ - * - * The destructor is a special member function that handles the deallocation of the class object’s resources. 
- * AS opposed to the class constructors, it has only one destructor function for a given class. - * The class destructor is declared with the same name as the class plus the prefix ~ tilde operator. - *... - * Generally, the class members are destroyed after the destructor function code is run; - * thus, we can demonstrate how the StringArray class instance goes out of scope and hence - * printing to the console the corresponding text. - * - */ - -// Destructor for resource de-allocation - - Executor::~Executor() { - for (size_t ik = 0; ik < kernels.size(); ++ik) { - delete kernels[ik]; - } - - } - -// New functions for Kokkos to register new group and kernel IDs -// The return type is Executor::groupID - - - Executor::groupID Executor::registerGroup(std::string groupName) { - // find() method searches the string for the first occurrence of the sequence specified by its arguments. - // Recall, "kernelsPerGroup" is a mapping of kernel groups (e.g., basic) and their constituent kernels (e.g., DAXPY) - auto checkIfGroupExists = kernelsPerGroup.find(groupName); - - - /* Recall, these items are defined in Executor.hpp: - using groupID = int; - using kernelID = int; - using kernelSet = std::set; // data type: set of KernelBase* instances - using kernelMap = std::map; // data type: map of string kernel names to instances of KernelBase* - using groupMap = std::map; // data type: map of groupNames to sets of kernels - ... - // "allKernels" is an instance of kernelMap, which is a "map" of all kernels and their ID's - kernelMap allKernels; - - // "kernelsPerGroup" is an instance of "groupMap;" "kernelsPerGroup" maps kernels to their categories (e.g., basic, polybench, etc.) - groupMap kernelsPerGroup; - - */ - - /* end() - * Return iterator to end - * Returns an iterator referring to the past-the-end element in the vector container. - * The past-the-end element is the theoretical element that would follow the last element in the vector. 
- * It does not point to any element, and thus shall not be de-referenced. - * Because the ranges used by functions of the standard library do not include - * the element pointed by their closing iterator, - * this function is often used in combination with vector::begin to specify a range including all the elements in the container. - * If the container is empty, this function returns the same as vector::begin. - * - */ - - - // HERE, WE ARE CHECKING THE CASE THAT THE groupNAME **IS NOT** IN THE MAP OBJECT - // Using the .end() idiom to check if I've fallen off the edge of the container without finding a match - if (checkIfGroupExists == kernelsPerGroup.end()) { - // If groupName not found, set that groupName in kernelsPerGroup to an empty kernelSet obj - kernelsPerGroup[groupName] = kernelSet(); - } else { - // ERROR CONDITION: DUPLICATING GROUPS - // Error lists exsiting group, and kills program. - - std::cout << "The Group Name " << groupName << " already exists. Program is exiting." 
<< std::endl; - - // In kernelsPerGroup, the Group Name is the first position / key value, and the second position / value type in the set - auto fullKernelSet = checkIfGroupExists->second; - - // fullKernelSet is of type std::set - - for (auto kernel: fullKernelSet) { - - std::cout << kernel->getName() << std::endl; - - } - - exit(1); - - } - // getNewGroupID() is an object of type Executor::groupID, an int - return getNewGroupID(); - - - } - -// New function with return type Executor::kernelID, returning getNewKernelID(); registerKernel is a new function in the Executor class -// - - Executor::kernelID Executor::registerKernel(std::string groupName, KernelBase *kernel) { - // declaring and setting kernelName to de-referenced kernel pointer obj, an instance of KernelBase* - auto kernelName = kernel->getName(); - // Recall, "allKernels" maps named kernels to their IDs - auto checkIfKernelExists = allKernels.find(kernelName); - // Check if checkKernelExists value IS NOT in the map of all kernels - if (checkIfKernelExists == allKernels.end()) { - // if the kernel name IS NOT in the allKernels map, set kernelName to kernel, the KernelBase* instance - allKernels[kernelName] = kernel; - } else { - // ERROR CONDITION: if the kernel is found / exists, make the program exit - - std::cout << "Kernel " << checkIfKernelExists->first << " already exists. Program is exiting." 
- << std::endl; - - exit(1); - } - ////////////////////////////////////////////////////////////////////////////// - // This error condition : adding a groupName before checking if the group associated with the kernel exists - // Declare and set checkIfGroupExists to the value of the string-type groupName in the kernelsPerGroup map - auto checkIfGroupExists = kernelsPerGroup.find(groupName); - // LOGIC: Check if checkIfGroupExists value is the same as the past-the-end element in the vector container, which - // does not have a value - // i.e., check for the case that the groupName DOES NOT exist with the ".end()" idiom; - if (checkIfGroupExists == kernelsPerGroup.end()) { +using namespace std; - } else { - // If the groupName DOES EXIST, then insert the kernel (instance of KernelBase*) at the second position of the - // allKernels map to associate the kernel and its groupNAme - - checkIfGroupExists->second.insert(kernel); - - } - - // getNewKernelID is an obj of type Executor::kernelID - return getNewKernelID(); - } - -// AJP & DZP new function -// AJP GOAL: return a vector of all kernelBase* objects to be run by - - std::vector Executor::lookUpKernelByName(std::string kernelOrGroupName) { - - // The vector / list return type, std::vector will contain - // either all of the kernels with a given kernel name or group name - // We have two maps (defined in Executor.hpp): kernelMap allKernels, groupMap kernelsPerGroup, - // STEPS: - // 1) declare new vector that will contain the string data: - // 2) LOGIC: - // i) check to see if the kernel / group requested on the - // "./rajaperf.exe -k" line (you can pass either a specific kernel or a - // kernel groupName, e.g., "Basic" +Executor::Executor(int argc, char** argv) + : run_params(argc, argv), + reference_vid(NumVariants) +{ +} - // Declaring the vector kernelsByNameVect of type std::vector; - // This variable will contain the set of kernels to run - std::vector kernelsByNameVect; - // CONDITIONS TO INCLUDE: - // 1) If 
kernelName is groupName , then add that set of kernels in the - // group to the vector +Executor::~Executor() +{ + for (size_t ik = 0; ik < kernels.size(); ++ik) { + delete kernels[ik]; + } +} - // 2) else if kernelName is kernel, then add the kernel to the vector - // 3) else if kernelName is horse stuff, then say so - // HINT: Declare iterator against which you can test equivalence +void Executor::setupSuite() +{ + RunParams::InputOpt in_state = run_params.getInputState(); + if ( in_state == RunParams::InfoRequest || in_state == RunParams::BadInput ) { + return; + } - auto checkLookUpGroupNameIterator = kernelsPerGroup.find(kernelOrGroupName); - auto checkLookUpKernelNameIterator = allKernels.find(kernelOrGroupName); + cout << "\nSetting up suite based on input..." << endl; - // Check to see if groupName NOT in kernelsPerGroup; - // end() iterates to the end - if (checkLookUpGroupNameIterator != kernelsPerGroup.end()) { - //cout << " STEP 1" << endl; + using Slist = list; + using Svector = vector; + using KIDset = set; + using VIDset = set; - // when using the arrow, you get a key, value pair. - // You can access either member by "first" or "second" // + // Determine which kernels to execute from input. + // run_kern will be non-duplicated ordered set of IDs of kernel to run. 
+ // + const Svector& kernel_input = run_params.getKernelInput(); + const Svector& feature_input = run_params.getFeatureInput(); - // we have std::set of KernelBase* - auto groupSetForTests = checkLookUpGroupNameIterator->second; - - for (auto item: groupSetForTests) { - kernelsByNameVect.push_back(item); - } - } else if (checkLookUpKernelNameIterator != allKernels.end()) { - - auto kernel = checkLookUpKernelNameIterator->second; - - kernelsByNameVect.push_back(kernel); - - - } - - - // kernelsByNameVect is an object of type std::vector that will be used by - return kernelsByNameVect; + KIDset run_kern; + if ( kernel_input.empty() && feature_input.empty() ) { + // + // No kernels or fatures specified in input, run them all... + // + for (auto iter_input: allKernels) { + kernels.push_back(iter_input.second); } - const RunParams &Executor::getRunParams() { + } else { + + // + // Need to parse input to determine which kernels to run + // + // + // Look for kernels using features if such input provided + // + if ( !feature_input.empty() ) { - return run_params; - } + // First, check for invalid feature input. + // Assemble invalid input for warning message. + // +/** TODO: reimplement + Svector invalid; + for (size_t i = 0; i < feature_input.size(); ++i) { + bool found_it = false; -////////////////////////////////////////////////////////////////////////////////////// -// * AJP TASK: change the setupSuite to use the allKernels (type: kernelMap) and kernelsPerGroup (type: groupMap) -// * maps; -// * The goal here is to make a vector of the different instances of KernelBase*, kernel, that are to be run; -// * The vector you'll need already exists! 
-// * Hint: see line 375-ish for kernels.push_back; -// */ -///////////////////////////////////////////////////////////////////////////////////// - void Executor::setupSuite() { - // Initial handling of run parameters input - RunParams::InputOpt in_state = run_params.getInputState(); - // QUESTION -- In this first step, are we doing nothing (initially) if we have bad input? - // Should there be an else condition for this conditional? - if (in_state == RunParams::InfoRequest || in_state == RunParams::BadInput) { - return; + for (size_t fid = 0; fid < NumFeatures && !found_it; ++fid) { + FeatureID tfid = static_cast(fid); + if ( getFeatureName(tfid) == feature_input[i] ) { + found_it = true; + } } - cout << "\nSetting up suite based on input..." << endl; - - - //////////////////////////////////////////////////////////////////////////////////// - // Declaring function type aliases - - using Slist = list; - using Svector = vector; - // Set of kernel IDs, e.g., DAXPY, IF_QUAD - using KIDset = set; - // "variants" include CUDA, OpenMPTarget, OpenMP, HIP, Serial - using VIDset = set; - /////////////////////////////////////////////////////////////////////////////////// - // Determine which kernels to execute from input. - // run_kern will be non-duplicated ordered set of IDs of kernel to run. 
- // kernel_input is an object of type reference to Svector; - // kernel_input will contain the input for the kernels to run - const Svector &kernel_input = run_params.getKernelInput(); - - // Declare run_kern of type KIDset; contains the set of kernels (KernelBase* instances to run) - KIDset run_kern; - - /* LOGIC - 1) check if each of the inputs in matches a groupName; - 2) if a match, add every kernel in that group to the vector that will be run; - 3) if no match, check existing kernels - 4) if a match, add that kernel - 5) if no match, add that kernel to set the set of invalid kernels - */ + if ( !found_it ) invalid.push_back( feature_input[i] ); + } + run_params.setInvalidFeatureInput(invalid); - Svector invalid; + // + // If feature input is valid, determine which kernels use + // input-specified features and add to set of kernels to run. + // + if ( run_params.getInvalidFeatureInput().empty() ) { - // The case when the executable is passed no args - if (kernel_input.empty()) { - // your iterator does the deferencing for you, thus you don't need the input arrow, which is - // necessary for dereferencing + for (size_t i = 0; i < feature_input.size(); ++i) { - for (auto iter_input: allKernels) { - kernels.push_back(iter_input.second); - } - } else { + const string& feature = feature_input[i]; - for (auto kernelName: kernel_input) { - std::vector matchingKernelsVec = lookUpKernelByName(kernelName); - // if everything that matched is in the vector, and nothing matched, i.e., an empty vector, - // i.e., the kernel name was invalid + bool found_it = false; + for (size_t fid = 0; fid < NumFeatures && !found_it; ++fid) { + FeatureID tfid = static_cast(fid); + if ( getFeatureName(tfid) == feature ) { + found_it = true; - if (matchingKernelsVec.empty()) { - invalid.push_back(kernelName); - } else { + for (int kid = 0; kid < NumKernels; ++kid) { + KernelID tkid = static_cast(kid); + KernelBase* kern = getKernelObject(tkid, run_params); + if ( kern->usesFeature(tfid) ) { 
+ run_kern.insert( tkid ); + } + delete kern; + } // loop over kernels - for (auto iter_kern: matchingKernelsVec) { - kernels.push_back(iter_kern); + } // if input feature name matches feature id + } // loop over feature ids until name match is found - } - } - } - } + } // loop over feature name input -/* - if ( kernel_input.empty() ) { + } // if feature name input is valid +*/ + } // if !feature_input.empty() + /** + // Make list copy of kernel name input to manipulate for + // processing potential group names and/or kernel names, next + Slist kern_names(kernel_input.begin(), kernel_input.end()); // - // if No kernels specified in input, run them all... + // Search kern_names for matching group names. + // groups2run will contain names of groups to run. // - for (size_t ik = 0; ik < NumKernels; ++ik) { - // here, inserting kernels to run; you must cast ik (of type size_t), the indexing variable, as a KernelID type - run_kern.insert( static_cast(ik) ); - } - - } else { - - - // Parse input to determine which kernels to run - // Make list of strings copy of kernel input for the parsing - // (need to process potential group names and/or kernel names) - - // Slist is a type alias for list - // Populate list with the kernel_input, from the beginning index to the end - Slist input(kernel_input.begin(), kernel_input.end()); - - // AJP code addition -- print list of inputs - - for (auto idx: input ) - - std::cout << "Input parameters list: " << idx << std:: endl; - - // Search input for matching group names. - // groups2run is a vector of strings (of type Svector, a type alias of vector) containing names - // of groups to run if passed in as input. 
- Svector groups2run; - // Outer loop: Iterate through the list of strings from the first to the last item - for (Slist::iterator it = input.begin(); it != input.end(); ++it) { - // inner loop: iterate over NumGroups, a member of GroupID enum defined in RAJAPerfSuite.hpp - + for (Slist::iterator it = kern_names.begin(); it != kern_names.end(); ++it) + { for (size_t ig = 0; ig < NumGroups; ++ig) { - // declare a constant (immutable) string reference "group_name" - // Store the value at the the ig(th) index as a GroupID in group_name const string& group_name = getGroupName(static_cast(ig)); - // if group_name is equal to the value the it(th)* index points to, - // push_back / append that group_name to groups2run vector of strings if ( group_name == *it ) { groups2run.push_back(group_name); } } } + // + // If group name(s) found in kern_names, assemble kernels in group(s) + // to run and remove those group name(s) from kern_names list. + // + for (size_t ig = 0; ig < groups2run.size(); ++ig) { const string& gname(groups2run[ig]); - // NumKernels is always the last member of KernelID, an enum, declared in RAJAPerfSuite.hpp - // Iterate over NumKernels, casting the index ik to a KernelID type, and setting it to kid - // for (size_t ik = 0; ik < NumKernels; ++ik) { KernelID kid = static_cast(ik); - // if the group name DOES occur within the string full kernel name (npos means until the end of the string), - // insert the kid (of KernelID type) into the run_kern (of type KIDset) if ( getFullKernelName(kid).find(gname) != string::npos ) { run_kern.insert(kid); } } - // remember, gname is a const/immutable string reference containing group names as a string - input.remove(gname); - } - + kern_names.remove(gname); + } - // Look for matching names of individual kernels in remaining input. + // + // Look for matching names of individual kernels in remaining kern_names. + // // Assemble invalid input for warning message. 
- // Declare the vector "invalid" of type Svector (type alias for vector) to hold ... - // Iterate over the input from beginning to the end item; - for (Slist::iterator it = input.begin(); it != input.end(); ++it) { - // initialize a boolean, "found_it" to false; - // why do we need this variable? AJP -- ANSWER HERE + // + Svector invalid; + + for (Slist::iterator it = kern_names.begin(); it != kern_names.end(); ++it) + { bool found_it = false; - // Iterate ik over NumKernels & TRUE; - // Iterate until you hit the end of the list , or until you find what you're looking for. + for (size_t ik = 0; ik < NumKernels && !found_it; ++ik) { - // cast the ik(th) value to a KernelID, and set equal to kid KernelID kid = static_cast(ik); - // if the kernel name (for a kid, of type KernelID) is equal to the value pointed at at the it(th) index - // OR if the full kernel name (for a kid) is equal to the value pointed at at the it(th) index - // insert that kid into the run_kern (of type KIDset) and set found_it boolean to true if ( getKernelName(kid) == *it || getFullKernelName(kid) == *it ) { run_kern.insert(kid); found_it = true; } } - // ATTN: found_it depend on whether or not the kernel was found; - // if the kernel was NOT found, we want to push it back to the set of invalid; - // if found_it = false, push back the value pointed at at the it(th) index to the vector of strings, "&invalid," - // which is of type Svector (a type alias) - if ( !found_it ) invalid.push_back(*it); + + if ( !found_it ) invalid.push_back(*it); } - // Update the run_params obj with data in the invalid vector reference + run_params.setInvalidKernelInput(invalid); + */ + Svector invalid; + for (auto kernelName: kernel_input) { + std::vector matchingKernelsVec = lookUpKernelByName(kernelName); + // if everything that matched is in the vector, and nothing matched, i.e., an empty vector, + // i.e., the kernel name was invalid + if (matchingKernelsVec.empty()) { + invalid.push_back(kernelName); + } else { 
+ + for (auto iter_kern: matchingKernelsVec) { + kernels.push_back(iter_kern); + + } + } + } + run_params.setInvalidKernelInput(invalid); } + // // Assemble set of available variants to run // (based on compile-time configuration). - // Recall, a variant will be: base_seq, base_CUDA, Raja_lambda, kokkos_lambda, etc. + // + VIDset available_var; + for (size_t iv = 0; iv < NumVariants; ++iv) { + VariantID vid = static_cast(iv); + if ( isVariantAvailable( vid ) ) { + available_var.insert( vid ); + } + } -*/ + // + // Determine variants to execute from input. + // run_var will be non-duplicated ordered set of IDs of variants to run. + // + const Svector& variant_names = run_params.getVariantInput(); - run_params.setInvalidKernelInput(invalid); + VIDset run_var; - VIDset available_var; - // iterate the NumVariants & static_cast value at iv(th) index to VariantID - // if the variant is available, insert vid into the VIDset - for (size_t iv = 0; iv < NumVariants; ++iv) { - VariantID vid = static_cast(iv); - if (isVariantAvailable(vid)) { - available_var.insert(vid); - } - } + if ( variant_names.empty() ) { - // - // Determine variants to execute from input. - // run_var will be non-duplicated ordered set of IDs of variants to run. - // - const Svector &variant_input = run_params.getVariantInput(); + // + // No variants specified in input options, run all available. + // Also, set reference variant if specified. + // + for (VIDset::iterator vid_it = available_var.begin(); + vid_it != available_var.end(); ++vid_it) { + VariantID vid = *vid_it; + run_var.insert( vid ); + if ( getVariantName(vid) == run_params.getReferenceVariant() ) { + reference_vid = vid; + } + } - VIDset run_var; + // + // Set reference variant if not specified. + // + if ( run_params.getReferenceVariant().empty() && !run_var.empty() ) { + reference_vid = *run_var.begin(); + } - if (variant_input.empty()) { + } else { - // - // No variants specified in input options, run all available. 
- // Also, set reference variant if specified. - // - for (VIDset::iterator vid_it = available_var.begin(); - vid_it != available_var.end(); ++vid_it) { - VariantID vid = *vid_it; - run_var.insert(vid); - if (getVariantName(vid) == run_params.getReferenceVariant()) { - reference_vid = vid; - } - } // // Parse input to determine which variants to run: // - variants to run will be the intersection of available variants @@ -467,58 +279,36 @@ namespace rajaperf { // Assemble invalid input for warning message. // - // - // Set reference variant if not specified. - // Here, this is where base_seq is set as the default baseline; - // the baseline that is used can be changed! - // e.g., kokkos_lambda - - if (run_params.getReferenceVariant().empty() && !run_var.empty()) { - reference_vid = *run_var.begin(); - } - - } else { - - // - // Parse input to determine which variants to run: - // - variants to run will be the intersection of available variants - // and those specified in input - // - reference variant will be set to specified input if available - // and variant will be run; else first variant that will be run. - // - // Assemble invalid input for warning message. 
- // - - Svector invalid; + Svector invalid; - for (size_t it = 0; it < variant_input.size(); ++it) { - bool found_it = false; + for (size_t it = 0; it < variant_names.size(); ++it) { + bool found_it = false; - for (VIDset::iterator vid_it = available_var.begin(); - vid_it != available_var.end(); ++vid_it) { - VariantID vid = *vid_it; - if (getVariantName(vid) == variant_input[it]) { - run_var.insert(vid); - if (getVariantName(vid) == run_params.getReferenceVariant()) { - reference_vid = vid; - } - found_it = true; - } - } + for (VIDset::iterator vid_it = available_var.begin(); + vid_it != available_var.end(); ++vid_it) { + VariantID vid = *vid_it; + if ( getVariantName(vid) == variant_names[it] ) { + run_var.insert(vid); + if ( getVariantName(vid) == run_params.getReferenceVariant() ) { + reference_vid = vid; + } + found_it = true; + } + } - if (!found_it) invalid.push_back(variant_input[it]); - } + if ( !found_it ) invalid.push_back(variant_names[it]); + } - // - // Set reference variant if not specified. - // - if (run_params.getReferenceVariant().empty() && !run_var.empty()) { - reference_vid = *run_var.begin(); - } + // + // Set reference variant if not specified. + // + if ( run_params.getReferenceVariant().empty() && !run_var.empty() ) { + reference_vid = *run_var.begin(); + } - run_params.setInvalidVariantInput(invalid); + run_params.setInvalidVariantInput(invalid); - } + } // // Create kernel objects and variants to execute. If invalid input is not @@ -527,45 +317,36 @@ namespace rajaperf { // A message will be emitted later so user can sort it out... 
// - if (!(run_params.getInvalidKernelInput().empty())) { + if ( !(run_params.getInvalidKernelInput().empty()) ) { + + run_params.setInputState(RunParams::BadInput); - run_params.setInputState(RunParams::BadInput); + } else if ( !(run_params.getInvalidFeatureInput().empty()) ) { - } else { // kernel input looks good + run_params.setInputState(RunParams::BadInput); - // Get lists using David and Amy's new maps! + } else { // kernel and feature input looks good -/* for (KIDset::iterator kid = run_kern.begin(); + for (KIDset::iterator kid = run_kern.begin(); kid != run_kern.end(); ++kid) { -/// RDH DISABLE COUPLE KERNEL until we find a reasonable way to do -/// complex numbers in GPU code - if ( true ) { - kernels.push_back( getKernelObject(*kid, run_params) ); - } +/// RDH DISABLE COUPLE KERNEL until we find a reasonable way to do +/// complex numbers in GPU code + //if ( *kid != Apps_COUPLE ) { + // kernels.push_back( getKernelObject(*kid, run_params) ); + //} } -*/ - if (!(run_params.getInvalidVariantInput().empty())) { - run_params.setInputState(RunParams::BadInput); + if ( !(run_params.getInvalidVariantInput().empty()) ) { - } else { // variant input lools good + run_params.setInputState(RunParams::BadInput); - for (VIDset::iterator vid = run_var.begin(); - vid != run_var.end(); ++vid) { - variant_ids.push_back(*vid); - } + } else { // variant input lools good - // - // If we've gotten to this point, we have good input to run. - // - if (run_params.getInputState() != RunParams::DryRun && - run_params.getInputState() != RunParams::CheckRun) { - run_params.setInputState(RunParams::PerfRun); - } + for (VIDset::iterator vid = run_var.begin(); + vid != run_var.end(); ++vid) { + variant_ids.push_back( *vid ); + } - } // kernel and variant input both look good -/* - ======= // // If we've gotten to this point, we have good input to run. 
// @@ -573,85 +354,77 @@ namespace rajaperf { run_params.getInputState() != RunParams::CheckRun ) { run_params.setInputState(RunParams::PerfRun); } ->>>>>>> develop -*/ - } // if kernel input looks good - } -//////////////////////////////////////////////////////////////////////////////////// + } // kernel and variant input both look good - void Executor::reportRunSummary(ostream &str) const { - RunParams::InputOpt in_state = run_params.getInputState(); + } // if kernel input looks good - if (in_state == RunParams::BadInput) { +} - str << "\nRunParams state:\n"; - str << "----------------"; - run_params.print(str); - str << "\n\nSuite will not be run now due to bad input." - << "\n See run parameters or option messages above.\n" - << endl; +void Executor::reportRunSummary(ostream& str) const +{ + RunParams::InputOpt in_state = run_params.getInputState(); - } else if (in_state == RunParams::PerfRun || - in_state == RunParams::DryRun || - in_state == RunParams::CheckRun) { + if ( in_state == RunParams::BadInput ) { - if (in_state == RunParams::DryRun) { + str << "\nRunParams state:\n"; + str << "----------------"; + run_params.print(str); - str << "\n\nRAJA performance suite dry run summary...." - << "\n--------------------------------------" << endl; - str << "\nInput state:"; - str << "\n------------"; - run_params.print(str); + str << "\n\nSuite will not be run now due to bad input." + << "\n See run parameters or option messages above.\n" + << endl; - } + } else if ( in_state == RunParams::PerfRun || + in_state == RunParams::DryRun || + in_state == RunParams::CheckRun ) { - if (in_state == RunParams::PerfRun || - in_state == RunParams::CheckRun) { + if ( in_state == RunParams::DryRun ) { - str << "\n\nRAJA performance suite run summary...." - << "\n--------------------------------------" << endl; + str << "\n\nRAJA performance suite dry run summary...." 
+ << "\n--------------------------------------" << endl; - } + str << "\nInput state:"; + str << "\n------------"; + run_params.print(str); - string ofiles; - if (!run_params.getOutputDirName().empty()) { - ofiles = run_params.getOutputDirName(); - } else { - ofiles = string("."); - } - ofiles += string("/") + run_params.getOutputFilePrefix() + - string("*"); - - str << "\nHow suite will be run:" << endl; - str << "\t # passes = " << run_params.getNumPasses() << endl; - str << "\t Kernel size factor = " << run_params.getSizeFactor() << endl; - str << "\t Kernel rep factor = " << run_params.getRepFactor() << endl; - str << "\t Output files will be named " << ofiles << endl; - -#if defined(RUN_KOKKOS) - Kokkos::Tools::declareMetadata("replication_factor", std::to_string(run_params.getRepFactor())); - Kokkos::Tools::declareMetadata("size_factor", std::to_string(run_params.getSizeFactor())); -#endif + } - str << "\nThe following kernels and variants (when available) will be run:\n"; + if ( in_state == RunParams::PerfRun || + in_state == RunParams::CheckRun ) { - str << "\nVariants" - << "\n--------\n"; - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - str << getVariantName(variant_ids[iv]) << endl; - } + str << "\n\nRAJA performance suite run summary...." 
+ << "\n--------------------------------------" << endl; - str << "\nKernels(iterations/rep , reps)" - << "\n-----------------------------\n"; - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase *kern = kernels[ik]; - str << kern->getName() - << " (" << kern->getItsPerRep() << " , " - << kern->getRunReps() << ")" << endl; - } - } + } + + string ofiles; + if ( !run_params.getOutputDirName().empty() ) { + ofiles = run_params.getOutputDirName(); + } else { + ofiles = string("."); + } + ofiles += string("/") + run_params.getOutputFilePrefix() + + string("*"); + + str << "\nHow suite will be run:" << endl; + str << "\t # passes = " << run_params.getNumPasses() << endl; + if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Factor) { + str << "\t Kernel size factor = " << run_params.getSize() << endl; + } else if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Direct) { + str << "\t Kernel size = " << run_params.getSize() << endl; + } + str << "\t Kernel rep factor = " << run_params.getRepFactor() << endl; + str << "\t Output files will be named " << ofiles << endl; + + str << "\nThe following kernels and variants (when available for a kernel) will be run:" << endl; + + str << "\nVariants" + << "\n--------\n"; + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + str << getVariantName(variant_ids[iv]) << endl; + } str << endl; @@ -769,80 +542,73 @@ void Executor::runSuite() return; } - KernelBase *warmup_kernel = new basic::DAXPY(run_params); - - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - if (run_params.showProgress()) { - if (warmup_kernel->hasVariantToRun(vid)) { - cout << " Running "; - } else { - cout << " No "; - } - cout << getVariantName(vid) << " variant" << endl; - } - if (warmup_kernel->hasVariantToRun(vid)) { - warmup_kernel->execute(vid); - } - } - - delete warmup_kernel; + cout << "\n\nRun warmup kernel...\n"; + KernelBase* warmup_kernel = new basic::DAXPY(run_params); - cout << 
"\n\nRunning specified kernels and variants...\n"; + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + VariantID vid = variant_ids[iv]; + if ( run_params.showProgress() ) { + if ( warmup_kernel->hasVariantDefined(vid) ) { + cout << " Running "; + } else { + cout << " No "; + } + cout << getVariantName(vid) << " variant" << endl; + } + if ( warmup_kernel->hasVariantDefined(vid) ) { + warmup_kernel->execute(vid); + } + } - const int npasses = run_params.getNumPasses(); - for (int ip = 0; ip < npasses; ++ip) { - if (run_params.showProgress()) { - std::cout << "\nPass through suite # " << ip << "\n"; - } + delete warmup_kernel; - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase *kernel = kernels[ik]; - if (run_params.showProgress()) { - std::cout << "\nRun kernel -- " << kernel->getName() << "\n"; - } - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - KernelBase *kern = kernels[ik]; - if (run_params.showProgress()) { - if (kern->hasVariantToRun(vid)) { - cout << " Running "; - } else { - cout << " No "; - } - cout << getVariantName(vid) << " variant" << endl; - } - if (kern->hasVariantToRun(vid)) { - kernels[ik]->execute(vid); - } - } // loop over variants + cout << "\n\nRunning specified kernels and variants...\n"; - } // loop over kernels + const int npasses = run_params.getNumPasses(); + for (int ip = 0; ip < npasses; ++ip) { + if ( run_params.showProgress() ) { + std::cout << "\nPass through suite # " << ip << "\n"; + } - } // loop over passes through suite + for (size_t ik = 0; ik < kernels.size(); ++ik) { + KernelBase* kernel = kernels[ik]; + if ( run_params.showProgress() ) { + std::cout << "\nRun kernel -- " << kernel->getName() << "\n"; + } - } + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + VariantID vid = variant_ids[iv]; + KernelBase* kern = kernels[ik]; + if ( run_params.showProgress() ) { + if ( kern->hasVariantDefined(vid) ) { + cout << " Running "; + } else { + cout << " No "; + } + 
cout << getVariantName(vid) << " variant" << endl; + } + if ( kern->hasVariantDefined(vid) ) { + kernels[ik]->execute(vid); + } + } // loop over variants + + } // loop over kernels + + } // loop over passes through suite - void Executor::outputRunData() { - RunParams::InputOpt in_state = run_params.getInputState(); - if (in_state != RunParams::PerfRun && - in_state != RunParams::CheckRun) { - return; - } +} - cout << "\n\nGenerate run report files...\n"; +void Executor::outputRunData() +{ + RunParams::InputOpt in_state = run_params.getInputState(); + if ( in_state != RunParams::PerfRun && + in_state != RunParams::CheckRun ) { + return; + } - // - // Generate output file prefix (including directory path). - // - string out_fprefix; - string outdir = recursiveMkdir(run_params.getOutputDirName()); - if (!outdir.empty()) { - chdir(outdir.c_str()); - } - out_fprefix = "./" + run_params.getOutputFilePrefix(); + cout << "\n\nGenerate run report files...\n"; // // Generate output file prefix (including directory path). 
@@ -854,41 +620,48 @@ void Executor::runSuite() } out_fprefix = "./" + run_params.getOutputFilePrefix(); - if (haveReferenceVariant()) { - filename = out_fprefix + "-speedup.csv"; - writeCSVReport(filename, CSVRepMode::Speedup, 3 /* prec */); - } + string filename = out_fprefix + "-timing.csv"; + writeCSVReport(filename, CSVRepMode::Timing, 6 /* prec */); if ( haveReferenceVariant() ) { filename = out_fprefix + "-speedup.csv"; writeCSVReport(filename, CSVRepMode::Speedup, 3 /* prec */); } - filename = out_fprefix + "-fom.csv"; - writeFOMReport(filename); - } + filename = out_fprefix + "-checksum.txt"; + writeChecksumReport(filename); + filename = out_fprefix + "-fom.csv"; + writeFOMReport(filename); - void Executor::writeCSVReport(const string &filename, CSVRepMode mode, - size_t prec) { - ofstream file(filename.c_str(), ios::out | ios::trunc); - if (!file) { - cout << " ERROR: Can't open output file " << filename << endl; - } + filename = out_fprefix + "-kernels.csv"; + ofstream file(filename.c_str(), ios::out | ios::trunc); + if ( !file ) { + cout << " ERROR: Can't open output file " << filename << endl; + } - if (file) { + if ( file ) { + bool to_file = true; + writeKernelInfoSummary(file, to_file); + } +} - // - // Set basic table formatting parameters. - // - const string kernel_col_name("Kernel "); - const string sepchr(" , "); - size_t kercol_width = kernel_col_name.size(); - for (size_t ik = 0; ik < kernels.size(); ++ik) { - kercol_width = max(kercol_width, kernels[ik]->getName().size()); - } - kercol_width++; +void Executor::writeCSVReport(const string& filename, CSVRepMode mode, + size_t prec) +{ + ofstream file(filename.c_str(), ios::out | ios::trunc); + if ( !file ) { + cout << " ERROR: Can't open output file " << filename << endl; + } + + if ( file ) { + + // + // Set basic table formatting parameters. 
+ // + const string kernel_col_name("Kernel "); + const string sepchr(" , "); size_t kercol_width = kernel_col_name.size(); for (size_t ik = 0; ik < kernels.size(); ++ik) { @@ -901,48 +674,29 @@ void Executor::runSuite() varcol_width[iv] = max(prec+2, getVariantName(variant_ids[iv]).size()); } + // + // Print title line. + // + file << getReportTitle(mode); // // Wrtie CSV file contents for report. + // - // - // Print column title line. - // - file << left << setw(kercol_width) << kernel_col_name; - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - file << sepchr << left << setw(varcol_width[iv]) - << getVariantName(variant_ids[iv]); - } - file << endl; - - // - // Print row of data for variants of each kernel. - // - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase *kern = kernels[ik]; - file << left << setw(kercol_width) << kern->getName(); - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - file << sepchr << right << setw(varcol_width[iv]); - if ((mode == CSVRepMode::Speedup) && - (!kern->hasVariantToRun(reference_vid) || - !kern->hasVariantToRun(vid))) { - file << "Not run"; - } else if ((mode == CSVRepMode::Timing) && - !kern->hasVariantToRun(vid)) { - file << "Not run"; - } else { - file << setprecision(prec) << std::fixed - << getReportDataEntry(mode, kern, vid); - } - } - file << endl; - } - - file.flush(); + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + file << sepchr; + } + file << endl; - } // note file will be closed when file stream goes out of scope + // + // Print column title line. + // + file < col_exec_count(ncols, 0); - vector col_min(ncols, numeric_limits::max()); - vector col_max(ncols, -numeric_limits::max()); - vector col_avg(ncols, 0.0); - vector col_stddev(ncols, 0.0); - vector > pct_diff(kernels.size()); - for (size_t ik = 0; ik < kernels.size(); ++ik) { - pct_diff[ik] = vector(ncols, 0.0); - } + // + // Set basic table formatting parameters. 
+ // + const string kernel_col_name("Kernel "); + const string sepchr(" , "); + size_t prec = 2; size_t kercol_width = kernel_col_name.size(); for (size_t ik = 0; ik < kernels.size(); ++ik) { @@ -1009,11 +756,7 @@ void Executor::writeFOMReport(const string& filename) } kercol_width++; - file << "'OVER_TOL' in column to right if RAJA speedup is over tolerance"; - for (size_t iv = 0; iv < ncols * 2; ++iv) { - file << sepchr; - } - file << endl; + size_t fom_col_width = prec+14; size_t ncols = 0; for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { @@ -1022,29 +765,33 @@ void Executor::writeFOMReport(const string& filename) // to each PM baseline } - // - // Print column title line. - // - file << left << setw(kercol_width) << kernel_col_name; - for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { - const FOMGroup &group = fom_groups[ifg]; - for (size_t gv = 0; gv < group.variants.size(); ++gv) { - string name = getVariantName(group.variants[gv]); - file << sepchr << left << setw(fom_col_width) << name << pass; - } - } - file << endl; + vector col_exec_count(ncols, 0); + vector col_min(ncols, numeric_limits::max()); + vector col_max(ncols, -numeric_limits::max()); + vector col_avg(ncols, 0.0); + vector col_stddev(ncols, 0.0); + vector< vector > pct_diff(kernels.size()); + for (size_t ik = 0; ik < kernels.size(); ++ik) { + pct_diff[ik] = vector(ncols, 0.0); + } + // + // Print title line. + // + file << "FOM Report : signed speedup(-)/slowdown(+) for each PM (base vs. RAJA) -> (T_RAJA - T_base) / T_base )"; + for (size_t iv = 0; iv < ncols*2; ++iv) { + file << sepchr; + } + file << endl; - // - // Write CSV file contents for FOM report. - // + file << "'OVER_TOL' in column to right if RAJA speedup is over tolerance"; + for (size_t iv = 0; iv < ncols*2; ++iv) { + file << sepchr; + } + file << endl; - // - // Print row of FOM data for each kernel. 
- // - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase *kern = kernels[ik]; + string pass(", "); + string fail(",OVER_TOL"); // // Print column title line. @@ -1059,9 +806,6 @@ void Executor::writeFOMReport(const string& filename) } file << endl; - int col = 0; - for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { - const FOMGroup &group = fom_groups[ifg]; // // Write CSV file contents for FOM report. @@ -1079,14 +823,10 @@ void Executor::writeFOMReport(const string& filename) for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { const FOMGroup& group = fom_groups[ifg]; - pct_diff[ik][col] = - (kern->getTotTime(comp_vid) - kern->getTotTime(base_vid)) / - kern->getTotTime(base_vid); + VariantID base_vid = group.base; - string pfstring(pass); - if (pct_diff[ik][col] > run_params.getPFTolerance()) { - pfstring = fail; - } + for (size_t gv = 0; gv < group.variants.size(); ++gv) { + VariantID comp_vid = group.variants[gv]; // // If kernel variant was run, generate data for it and @@ -1099,10 +839,13 @@ void Executor::writeFOMReport(const string& filename) (kern->getTotTime(comp_vid) - kern->getTotTime(base_vid)) / kern->getTotTime(base_vid); - } else { // variant was not run, print a big fat goose egg... + string pfstring(pass); + if (pct_diff[ik][col] > run_params.getPFTolerance()) { + pfstring = fail; + } - file << sepchr << left << setw(fom_col_width) << setprecision(prec) - << 0.0 << pass; + file << sepchr << setw(fom_col_width) << setprecision(prec) + < 0) { - col_avg[col] /= col_exec_count[col]; - } else { - col_avg[col] = 0.0; - } - } + } // loop over kernels - // Column standard deviaation... - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase *kern = kernels[ik]; // // Compute column summary data. 
@@ -1156,33 +889,23 @@ void Executor::writeFOMReport(const string& filename) for (size_t ik = 0; ik < kernels.size(); ++ik) { KernelBase* kern = kernels[ik]; - col++; + int col = 0; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + const FOMGroup& group = fom_groups[ifg]; - } // loop over group variants + for (size_t gv = 0; gv < group.variants.size(); ++gv) { + VariantID comp_vid = group.variants[gv]; if ( kern->wasVariantRun(comp_vid) ) { col_stddev[col] += ( pct_diff[ik][col] - col_avg[col] ) * ( pct_diff[ik][col] - col_avg[col] ); } - } // loop over kernels + col++; - for (size_t col = 0; col < ncols; ++col) { - if (col_exec_count[col] > 0) { - col_stddev[col] /= col_exec_count[col]; - } else { - col_stddev[col] = 0.0; - } - } + } // loop over group variants - // - // Print column summaries. - // - file << left << setw(kercol_width) << " "; - for (size_t iv = 0; iv < ncols; ++iv) { - file << sepchr << setw(fom_col_width) << left << " " << right << pass; - } - file << endl; + } // loop over groups } // loop over kernels @@ -1229,105 +952,33 @@ void Executor::writeFOMReport(const string& filename) file << sepchr <getName().size()); - } - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - namecol_width = max(namecol_width, - getVariantName(variant_ids[iv]).size()); - } - namecol_width++; - - - // - // Print title. - // - file << equal_line << endl; - file << "Checksum Report " << endl; - file << equal_line << endl; - - // - // Print column title line. - // - file << left << setw(namecol_width) << "Kernel " << endl; - file << dot_line << endl; - file << left << setw(namecol_width) << "Variants " - << left << setw(checksum_width) << "Checksum " - << left << setw(checksum_width) - << "Checksum Diff (vs. first variant listed)"; - file << endl; - file << dash_line << endl; + } // note file will be closed when file stream goes out of scope +} - // - // Print checksum and diff against baseline for each kernel variant. 
- // - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase *kern = kernels[ik]; - - file << left << setw(namecol_width) << kern->getName() << endl; - file << dot_line << endl; - - Checksum_type cksum_ref = 0.0; - size_t ivck = 0; - bool found_ref = false; - while (ivck < variant_ids.size() && !found_ref) { - VariantID vid = variant_ids[ivck]; - if (kern->wasVariantRun(vid)) { - cksum_ref = kern->getChecksum(vid); - found_ref = true; - } - ++ivck; - } - for (size_t iv = 0; iv < variant_ids.size(); ++iv) { - VariantID vid = variant_ids[iv]; - - if (kern->wasVariantRun(vid)) { - Checksum_type vcheck_sum = kern->getChecksum(vid); - Checksum_type diff = cksum_ref - kern->getChecksum(vid); - - file << left << setw(namecol_width) << getVariantName(vid) - << showpoint << setprecision(prec) - << left << setw(checksum_width) << vcheck_sum - << left << setw(checksum_width) << diff << endl; - } else { - file << left << setw(namecol_width) << getVariantName(vid) - << left << setw(checksum_width) << "Not Run" - << left << setw(checksum_width) << "Not Run" << endl; - } +void Executor::writeChecksumReport(const string& filename) +{ + ofstream file(filename.c_str(), ios::out | ios::trunc); + if ( !file ) { + cout << " ERROR: Can't open output file " << filename << endl; + } - } + if ( file ) { - file << endl; - file << dash_line_short << endl; - } + // + // Set basic table formatting parameters. 
+ // + const string equal_line("==================================================================================================="); + const string dash_line("----------------------------------------------------------------------------------------"); + const string dash_line_short("-------------------------------------------------------"); + string dot_line("........................................................"); - file.flush(); + size_t prec = 20; + size_t checksum_width = prec + 8; size_t namecol_width = 0; for (size_t ik = 0; ik < kernels.size(); ++ik) { @@ -1337,43 +988,15 @@ void Executor::writeFOMReport(const string& filename) namecol_width = max(namecol_width, getVariantName(variant_ids[iv]).size()); } + namecol_width++; - long double Executor::getReportDataEntry(CSVRepMode mode, - KernelBase *kern, - VariantID vid) { - long double retval = 0.0; - switch (mode) { - case CSVRepMode::Timing : { - retval = kern->getTotTime(vid) / run_params.getNumPasses(); - break; - } - case CSVRepMode::Speedup : { - if (haveReferenceVariant()) { - if (kern->hasVariantToRun(reference_vid) && - kern->hasVariantToRun(vid)) { - retval = kern->getTotTime(reference_vid) / kern->getTotTime(vid); - } else { - retval = 0.0; - } -#if 0 // RDH DEBUG (leave this here, it's useful for debugging!) - cout << "Kernel(iv): " << kern->getName() << "(" << vid << ")" << endl; - cout << "\tref_time, tot_time, retval = " - << kern->getTotTime(reference_vid) << " , " - << kern->getTotTime(vid) << " , " - << retval << endl; -#endif - } - break; - } - default : { - cout << "\n Unknown CSV report mode = " << mode << endl; - } - }; - return retval; - } - void Executor::getFOMGroups(vector &fom_groups) { - fom_groups.clear(); + // + // Print title. + // + file << equal_line << endl; + file << "Checksum Report " << endl; + file << equal_line << endl; // // Print column title line. 
@@ -1387,7 +1010,26 @@ void Executor::writeFOMReport(const string& filename) file << endl; file << dash_line << endl; - if (vname.find("Base") != string::npos) { + // + // Print checksum and diff against baseline for each kernel variant. + // + for (size_t ik = 0; ik < kernels.size(); ++ik) { + KernelBase* kern = kernels[ik]; + + file <getName() << endl; + file << dot_line << endl; + + Checksum_type cksum_ref = 0.0; + size_t ivck = 0; + bool found_ref = false; + while ( ivck < variant_ids.size() && !found_ref ) { + VariantID vid = variant_ids[ivck]; + if ( kern->wasVariantRun(vid) ) { + cksum_ref = kern->getChecksum(vid); + found_ref = true; + } + ++ivck; + } for (size_t iv = 0; iv < variant_ids.size(); ++iv) { VariantID vid = variant_ids[iv]; @@ -1406,8 +1048,7 @@ void Executor::writeFOMReport(const string& filename) <& fom_groups) } // iterate over variant ids to run +#if 0 // RDH DEBUG (leave this here, it's useful for debugging!) + cout << "\nFOMGroups..." << endl; + for (size_t ifg = 0; ifg < fom_groups.size(); ++ifg) { + const FOMGroup& group = fom_groups[ifg]; + cout << "\tBase : " << getVariantName(group.base) << endl; + for (size_t iv = 0; iv < group.variants.size(); ++iv) { + cout << "\t\t " << getVariantName(group.variants[iv]) << endl; + } + } +#endif +} + + +// New functions for Kokkos to register new group and kernel IDs +// The return type is Executor::groupID + + + Executor::groupID Executor::registerGroup(std::string groupName) { + // find() method searches the string for the first occurrence of the sequence specified by its arguments. 
+ // Recall, "kernelsPerGroup" is a mapping of kernel groups (e.g., basic) and their constituent kernels (e.g., DAXPY) + auto checkIfGroupExists = kernelsPerGroup.find(groupName); + + + /* Recall, these items are defined in Executor.hpp: + using groupID = int; + using kernelID = int; + using kernelSet = std::set; // data type: set of KernelBase* instances + using kernelMap = std::map; // data type: map of string kernel names to instances of KernelBase* + using groupMap = std::map; // data type: map of groupNames to sets of kernels + ... + // "allKernels" is an instance of kernelMap, which is a "map" of all kernels and their ID's + kernelMap allKernels; + + // "kernelsPerGroup" is an instance of "groupMap;" "kernelsPerGroup" maps kernels to their categories (e.g., basic, polybench, etc.) + groupMap kernelsPerGroup; + + */ + + /* end() + * Return iterator to end + * Returns an iterator referring to the past-the-end element in the vector container. + * The past-the-end element is the theoretical element that would follow the last element in the vector. + * It does not point to any element, and thus shall not be de-referenced. + * Because the ranges used by functions of the standard library do not include + * the element pointed by their closing iterator, + * this function is often used in combination with vector::begin to specify a range including all the elements in the container. + * If the container is empty, this function returns the same as vector::begin. + * + */ + + + // HERE, WE ARE CHECKING THE CASE THAT THE groupNAME **IS NOT** IN THE MAP OBJECT + // Using the .end() idiom to check if I've fallen off the edge of the container without finding a match + if (checkIfGroupExists == kernelsPerGroup.end()) { + // If groupName not found, set that groupName in kernelsPerGroup to an empty kernelSet obj + kernelsPerGroup[groupName] = kernelSet(); + } else { + // ERROR CONDITION: DUPLICATING GROUPS + // Error lists exsiting group, and kills program. 
+ + std::cout << "The Group Name " << groupName << " already exists. Program is exiting." << std::endl; + + // In kernelsPerGroup, the Group Name is the first position / key value, and the second position / value type in the set + auto fullKernelSet = checkIfGroupExists->second; + + // fullKernelSet is of type std::set + + for (auto kernel: fullKernelSet) { + + std::cout << kernel->getName() << std::endl; + + } + + exit(1); + + } + // getNewGroupID() is an object of type Executor::groupID, an int + return getNewGroupID(); + + + } + +// New function with return type Executor::kernelID, returning getNewKernelID(); registerKernel is a new function in the Executor class +// + + Executor::kernelID Executor::registerKernel(std::string groupName, KernelBase *kernel) { + // declaring and setting kernelName to de-referenced kernel pointer obj, an instance of KernelBase* + auto kernelName = kernel->getName(); + // Recall, "allKernels" maps named kernels to their IDs + auto checkIfKernelExists = allKernels.find(kernelName); + // Check if checkKernelExists value IS NOT in the map of all kernels + if (checkIfKernelExists == allKernels.end()) { + // if the kernel name IS NOT in the allKernels map, set kernelName to kernel, the KernelBase* instance + allKernels[kernelName] = kernel; + } else { + // ERROR CONDITION: if the kernel is found / exists, make the program exit + + std::cout << "Kernel " << checkIfKernelExists->first << " already exists. Program is exiting." 
+ << std::endl; + + exit(1); + } + ////////////////////////////////////////////////////////////////////////////// + // This error condition : adding a groupName before checking if the group associated with the kernel exists + // Declare and set checkIfGroupExists to the value of the string-type groupName in the kernelsPerGroup map + auto checkIfGroupExists = kernelsPerGroup.find(groupName); + // LOGIC: Check if checkIfGroupExists value is the same as the past-the-end element in the vector container, which + // does not have a value + // i.e., check for the case that the groupName DOES NOT exist with the ".end()" idiom; + if (checkIfGroupExists == kernelsPerGroup.end()) { + + } else { + // If the groupName DOES EXIST, then insert the kernel (instance of KernelBase*) at the second position of the + // allKernels map to associate the kernel and its groupNAme + + checkIfGroupExists->second.insert(kernel); + + } + + // getNewKernelID is an obj of type Executor::kernelID + return getNewKernelID(); + } +// AJP & DZP new function +// AJP GOAL: return a vector of all kernelBase* objects to be run by + + std::vector Executor::lookUpKernelByName(std::string kernelOrGroupName) { + + // The vector / list return type, std::vector will contain + // either all of the kernels with a given kernel name or group name + // We have two maps (defined in Executor.hpp): kernelMap allKernels, groupMap kernelsPerGroup, + // STEPS: + // 1) declare new vector that will contain the string data: + // 2) LOGIC: + // i) check to see if the kernel / group requested on the + // "./rajaperf.exe -k" line (you can pass either a specific kernel or a + // kernel groupName, e.g., "Basic" + + // Declaring the vector kernelsByNameVect of type std::vector; + // This variable will contain the set of kernels to run + std::vector kernelsByNameVect; + + // CONDITIONS TO INCLUDE: + // 1) If kernelName is groupName , then add that set of kernels in the + // group to the vector + + // 2) else if kernelName is 
kernel, then add the kernel to the vector + // 3) else if kernelName is horse stuff, then say so + + // HINT: Declare iterator against which you can test equivalence + + auto checkLookUpGroupNameIterator = kernelsPerGroup.find(kernelOrGroupName); + auto checkLookUpKernelNameIterator = allKernels.find(kernelOrGroupName); + + // Check to see if groupName NOT in kernelsPerGroup; + // end() iterates to the end + if (checkLookUpGroupNameIterator != kernelsPerGroup.end()) { + //cout << " STEP 1" << endl; + + // when using the arrow, you get a key, value pair. + // You can access either member by "first" or "second" + // + + // we have std::set of KernelBase* + auto groupSetForTests = checkLookUpGroupNameIterator->second; + + for (auto item: groupSetForTests) { + kernelsByNameVect.push_back(item); + } + } else if (checkLookUpKernelNameIterator != allKernels.end()) { + + auto kernel = checkLookUpKernelNameIterator->second; + + kernelsByNameVect.push_back(kernel); + + + } + + + // kernelsByNameVect is an object of type std::vector that will be used by + return kernelsByNameVect; + + + } + + const RunParams &Executor::getRunParams() { + + + return run_params; + } + void free_register_group(Executor *exec, std::string groupName) { exec->registerGroup(groupName); } @@ -1513,7 +1342,6 @@ void Executor::getFOMGroups(vector& fom_groups) void free_register_kernel(Executor *exec, std::string groupName, KernelBase *kernel) { exec->registerKernel(groupName, kernel); } - const RunParams& getRunParams(Executor* exec){ return exec->getRunParams(); diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 3414fa86b..ed0ec9d87 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -53,7 +53,7 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params) : : run_params(params), kernel_id(Basic_DAXPY), // TODO DZP: better name(name), - default_size(0), + default_prob_size(0), default_reps(0), running_variant(NumVariants) { @@ -63,7 +63,7 @@ 
KernelBase::KernelBase(KernelID kid, const RunParams& params) : min_time[ivar] = std::numeric_limits::max(); max_time[ivar] = -std::numeric_limits::max(); tot_time[ivar] = 0.0; - has_variant_to_run[ivar] = false; + has_variant_defined[ivar] = false; } } diff --git a/src/common/QuickKernelBase.hpp b/src/common/QuickKernelBase.hpp index a9cea9dac..70289de13 100644 --- a/src/common/QuickKernelBase.hpp +++ b/src/common/QuickKernelBase.hpp @@ -62,7 +62,7 @@ namespace rajaperf { #if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) void runOpenMPVariant(VariantID vid) override { - auto size = getRunSize(); + auto size = getActualProblemSize(); for(int x =0; x< getRunReps(); ++x){ m_execute(x, size); } @@ -82,14 +82,14 @@ namespace rajaperf { template void rkv_helper(std::index_sequence) { - auto size = getRunSize(); + auto size = getActualProblemSize(); for (int x = 0; x < getRunReps(); ++x) { m_execute(x, size, std::get(rd)...); } } void rkv_helper(empty em) { - auto size = getRunSize(); + auto size = getActualProblemSize(); for (int x = 0; x < getRunReps(); ++x) { m_execute(x, size); } diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 3ec7d8a69..c6f87d8af 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -103,7 +103,7 @@ namespace rajaperf { // Basic - free_register_kernel(exec, "Basic", new basic::ATOMIC_PI(run_params)); + free_register_kernel(exec, "Basic", new basic::PI_ATOMIC(run_params)); free_register_kernel(exec, "Basic", new basic::DAXPY(run_params)); free_register_kernel(exec, "Basic", new basic::IF_QUAD(run_params)); free_register_kernel(exec, "Basic", new basic::INIT3(run_params)); @@ -497,261 +497,262 @@ const std::string& getFeatureName(FeatureID fid) * ******************************************************************************* */ -KernelBase* getKernelObject(KernelID kid, - const RunParams& run_params) -{ - KernelBase* kernel = 0; - - switch ( kid ) { - - // - // Basic kernels... 
- // - case Basic_DAXPY : { - kernel = new basic::DAXPY(run_params); - break; - } - case Basic_IF_QUAD : { - kernel = new basic::IF_QUAD(run_params); - break; - } - case Basic_INIT3 : { - kernel = new basic::INIT3(run_params); - break; - } - case Basic_INIT_VIEW1D : { - kernel = new basic::INIT_VIEW1D(run_params); - break; - } - case Basic_INIT_VIEW1D_OFFSET : { - kernel = new basic::INIT_VIEW1D_OFFSET(run_params); - break; - } - case Basic_MULADDSUB : { - kernel = new basic::MULADDSUB(run_params); - break; - } - case Basic_NESTED_INIT : { - kernel = new basic::NESTED_INIT(run_params); - break; - } - case Basic_PI_ATOMIC : { - kernel = new basic::PI_ATOMIC(run_params); - break; - } - case Basic_PI_REDUCE : { - kernel = new basic::PI_REDUCE(run_params); - break; - } - case Basic_REDUCE3_INT : { - kernel = new basic::REDUCE3_INT(run_params); - break; - } - case Basic_TRAP_INT : { - kernel = new basic::TRAP_INT(run_params); - break; - } - +//KernelBase* getKernelObject(KernelID kid, +// const RunParams& run_params) +//{ // -// Lcals kernels... 
- - case Lcals_DIFF_PREDICT : { - kernel = new lcals::DIFF_PREDICT(run_params); - break; - } - case Lcals_EOS : { - kernel = new lcals::EOS(run_params); - break; - } - case Lcals_FIRST_DIFF : { - kernel = new lcals::FIRST_DIFF(run_params); - break; - } - - case Lcals_FIRST_MIN : { - kernel = new lcals::FIRST_MIN(run_params); - break; - } - case Lcals_FIRST_SUM : { - kernel = new lcals::FIRST_SUM(run_params); - break; - } - case Lcals_GEN_LIN_RECUR : { - kernel = new lcals::GEN_LIN_RECUR(run_params); - break; - } - case Lcals_HYDRO_1D : { - kernel = new lcals::HYDRO_1D(run_params); - break; - } - case Lcals_HYDRO_2D : { - kernel = new lcals::HYDRO_2D(run_params); - break; - } - case Lcals_INT_PREDICT : { - kernel = new lcals::INT_PREDICT(run_params); - break; - } - case Lcals_PLANCKIAN : { - kernel = new lcals::PLANCKIAN(run_params); - break; - } - case Lcals_TRIDIAG_ELIM : { - kernel = new lcals::TRIDIAG_ELIM(run_params); - break; - } - - -// Stream kernels... +// KernelBase* kernel = 0; // - case Stream_ADD : { - kernel = new stream::ADD(run_params); - break; - } - case Stream_COPY : { - kernel = new stream::COPY(run_params); - break; - } - case Stream_DOT : { - kernel = new stream::DOT(run_params); - break; - } - case Stream_MUL : { - kernel = new stream::MUL(run_params); - break; - } - case Stream_TRIAD : { - kernel = new stream::TRIAD(run_params); - break; - } +// switch ( kid ) { // +// // +// // Basic kernels... 
+// // +// case Basic_DAXPY : { +// kernel = new basic::DAXPY(run_params); +// break; +// } +// case Basic_IF_QUAD : { +// kernel = new basic::IF_QUAD(run_params); +// break; +// } +// case Basic_INIT3 : { +// kernel = new basic::INIT3(run_params); +// break; +// } +// case Basic_INIT_VIEW1D : { +// kernel = new basic::INIT_VIEW1D(run_params); +// break; +// } +// case Basic_INIT_VIEW1D_OFFSET : { +// kernel = new basic::INIT_VIEW1D_OFFSET(run_params); +// break; +// } +// case Basic_MULADDSUB : { +// kernel = new basic::MULADDSUB(run_params); +// break; +// } +// case Basic_NESTED_INIT : { +// kernel = new basic::NESTED_INIT(run_params); +// break; +// } +// case Basic_PI_ATOMIC : { +// kernel = new basic::PI_ATOMIC(run_params); +// break; +// } +// case Basic_PI_REDUCE : { +// kernel = new basic::PI_REDUCE(run_params); +// break; +// } +// case Basic_REDUCE3_INT : { +// kernel = new basic::REDUCE3_INT(run_params); +// break; +// } +// case Basic_TRAP_INT : { +// kernel = new basic::TRAP_INT(run_params); +// break; +// } // -/** DZP: big comment block for unimplemented -// Polybench kernels... +//// +//// Lcals kernels... 
// - case Polybench_2MM : { - kernel = new polybench::POLYBENCH_2MM(run_params); - break; - } - case Polybench_3MM : { - kernel = new polybench::POLYBENCH_3MM(run_params); - break; - } - case Polybench_ADI : { - kernel = new polybench::POLYBENCH_ADI(run_params); - break; - } - case Polybench_ATAX : { - kernel = new polybench::POLYBENCH_ATAX(run_params); - break; - } - case Polybench_FDTD_2D : { - kernel = new polybench::POLYBENCH_FDTD_2D(run_params); - break; - } - case Polybench_FLOYD_WARSHALL : { - kernel = new polybench::POLYBENCH_FLOYD_WARSHALL(run_params); - break; - } - case Polybench_GEMM : { - kernel = new polybench::POLYBENCH_GEMM(run_params); - break; - } - case Polybench_GEMVER : { - kernel = new polybench::POLYBENCH_GEMVER(run_params); - break; - } - case Polybench_GESUMMV : { - kernel = new polybench::POLYBENCH_GESUMMV(run_params); - break; - } - case Polybench_HEAT_3D : { - kernel = new polybench::POLYBENCH_HEAT_3D(run_params); - break; - } - case Polybench_JACOBI_1D : { - kernel = new polybench::POLYBENCH_JACOBI_1D(run_params); - break; - } - case Polybench_JACOBI_2D : { - kernel = new polybench::POLYBENCH_JACOBI_2D(run_params); - break; - } - case Polybench_MVT : { - kernel = new polybench::POLYBENCH_MVT(run_params); - break; - } - -//////////////////////////////////////////////////////////////// -// Apps kernels... 
-/* - case Apps_COUPLE : { - kernel = new apps::COUPLE(run_params); - break; - } - - */ - - case Apps_DEL_DOT_VEC_2D : { - kernel = new apps::DEL_DOT_VEC_2D(run_params); - break; - } - case Apps_ENERGY : { - kernel = new apps::ENERGY(run_params); - break; - } - case Apps_FIR : { - kernel = new apps::FIR(run_params); - break; - } - case Apps_HALOEXCHANGE : { - kernel = new apps::HALOEXCHANGE(run_params); - break; - } - case Apps_HALOEXCHANGE_FUSED : { - kernel = new apps::HALOEXCHANGE_FUSED(run_params); - break; - } - case Apps_LTIMES : { - kernel = new apps::LTIMES(run_params); - break; - } - case Apps_LTIMES_NOVIEW : { - kernel = new apps::LTIMES_NOVIEW(run_params); - break; - } - case Apps_MASS3DPA : { - kernel = new apps::MASS3DPA(run_params); - break; - } - case Apps_PRESSURE : { - kernel = new apps::PRESSURE(run_params); - break; - } - case Apps_VOL3D : { - kernel = new apps::VOL3D(run_params); - break; - } - +// case Lcals_DIFF_PREDICT : { +// kernel = new lcals::DIFF_PREDICT(run_params); +// break; +// } +// case Lcals_EOS : { +// kernel = new lcals::EOS(run_params); +// break; +// } +// case Lcals_FIRST_DIFF : { +// kernel = new lcals::FIRST_DIFF(run_params); +// break; +// } // -// Algorithm kernels... 
-/* - case Algorithm_SORT: { - kernel = new algorithm::SORT(run_params); - break; - } - case Algorithm_SORTPAIRS: { - kernel = new algorithm::SORTPAIRS(run_params); - break; - } -*/ - default: { - std::cout << "\n Unknown Kernel ID = " << kid << std::endl; - } - - } // end switch on kernel id - - return kernel; - } +// case Lcals_FIRST_MIN : { +// kernel = new lcals::FIRST_MIN(run_params); +// break; +// } +// case Lcals_FIRST_SUM : { +// kernel = new lcals::FIRST_SUM(run_params); +// break; +// } +// case Lcals_GEN_LIN_RECUR : { +// kernel = new lcals::GEN_LIN_RECUR(run_params); +// break; +// } +// case Lcals_HYDRO_1D : { +// kernel = new lcals::HYDRO_1D(run_params); +// break; +// } +// case Lcals_HYDRO_2D : { +// kernel = new lcals::HYDRO_2D(run_params); +// break; +// } +// case Lcals_INT_PREDICT : { +// kernel = new lcals::INT_PREDICT(run_params); +// break; +// } +// case Lcals_PLANCKIAN : { +// kernel = new lcals::PLANCKIAN(run_params); +// break; +// } +// case Lcals_TRIDIAG_ELIM : { +// kernel = new lcals::TRIDIAG_ELIM(run_params); +// break; +// } +// +// +//// Stream kernels... +//// +// case Stream_ADD : { +// kernel = new stream::ADD(run_params); +// break; +// } +// case Stream_COPY : { +// kernel = new stream::COPY(run_params); +// break; +// } +// case Stream_DOT : { +// kernel = new stream::DOT(run_params); +// break; +// } +// case Stream_MUL : { +// kernel = new stream::MUL(run_params); +// break; +// } +// case Stream_TRIAD : { +// kernel = new stream::TRIAD(run_params); +// break; +// } +//// +//// +///** DZP: big comment block for unimplemented +//// Polybench kernels... 
+//// +// case Polybench_2MM : { +// kernel = new polybench::POLYBENCH_2MM(run_params); +// break; +// } +// case Polybench_3MM : { +// kernel = new polybench::POLYBENCH_3MM(run_params); +// break; +// } +// case Polybench_ADI : { +// kernel = new polybench::POLYBENCH_ADI(run_params); +// break; +// } +// case Polybench_ATAX : { +// kernel = new polybench::POLYBENCH_ATAX(run_params); +// break; +// } +// case Polybench_FDTD_2D : { +// kernel = new polybench::POLYBENCH_FDTD_2D(run_params); +// break; +// } +// case Polybench_FLOYD_WARSHALL : { +// kernel = new polybench::POLYBENCH_FLOYD_WARSHALL(run_params); +// break; +// } +// case Polybench_GEMM : { +// kernel = new polybench::POLYBENCH_GEMM(run_params); +// break; +// } +// case Polybench_GEMVER : { +// kernel = new polybench::POLYBENCH_GEMVER(run_params); +// break; +// } +// case Polybench_GESUMMV : { +// kernel = new polybench::POLYBENCH_GESUMMV(run_params); +// break; +// } +// case Polybench_HEAT_3D : { +// kernel = new polybench::POLYBENCH_HEAT_3D(run_params); +// break; +// } +// case Polybench_JACOBI_1D : { +// kernel = new polybench::POLYBENCH_JACOBI_1D(run_params); +// break; +// } +// case Polybench_JACOBI_2D : { +// kernel = new polybench::POLYBENCH_JACOBI_2D(run_params); +// break; +// } +// case Polybench_MVT : { +// kernel = new polybench::POLYBENCH_MVT(run_params); +// break; +// } +// +////////////////////////////////////////////////////////////////// +//// Apps kernels... 
+///* +// case Apps_COUPLE : { +// kernel = new apps::COUPLE(run_params); +// break; +// } +// +// */ +// +// case Apps_DEL_DOT_VEC_2D : { +// kernel = new apps::DEL_DOT_VEC_2D(run_params); +// break; +// } +// case Apps_ENERGY : { +// kernel = new apps::ENERGY(run_params); +// break; +// } +// case Apps_FIR : { +// kernel = new apps::FIR(run_params); +// break; +// } +// case Apps_HALOEXCHANGE : { +// kernel = new apps::HALOEXCHANGE(run_params); +// break; +// } +// case Apps_HALOEXCHANGE_FUSED : { +// kernel = new apps::HALOEXCHANGE_FUSED(run_params); +// break; +// } +// case Apps_LTIMES : { +// kernel = new apps::LTIMES(run_params); +// break; +// } +// case Apps_LTIMES_NOVIEW : { +// kernel = new apps::LTIMES_NOVIEW(run_params); +// break; +// } +// case Apps_MASS3DPA : { +// kernel = new apps::MASS3DPA(run_params); +// break; +// } +// case Apps_PRESSURE : { +// kernel = new apps::PRESSURE(run_params); +// break; +// } +// case Apps_VOL3D : { +// kernel = new apps::VOL3D(run_params); +// break; +// } +// +//// +//// Algorithm kernels... 
+///* +// case Algorithm_SORT: { +// kernel = new algorithm::SORT(run_params); +// break; +// } +// case Algorithm_SORTPAIRS: { +// kernel = new algorithm::SORTPAIRS(run_params); +// break; +// } +//*/ +// default: { +// std::cout << "\n Unknown Kernel ID = " << kid << std::endl; +// } +// +// } // end switch on kernel id +// +// return kernel; +// } } // closing brace for rajaperf namespace diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 28aaadda3..8c4b489b0 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -31,7 +31,6 @@ void free_register_group(Executor*, std::string); // forward declaration void free_register_kernel(Executor*, std::string, KernelBase*); // forward declaration void make_perfsuite_executor(Executor* exec, int argc, char* argv[]); #if defined(RUN_KOKKOS) - // Kokkos Design Spirit: // WE NEED: // 1) Use KokkosViews --> a wrapper around pointers for host and device memory @@ -373,7 +372,6 @@ enum KernelID { // // Apps kernels... -/* Apps_COUPLE, Apps_DEL_DOT_VEC_2D, Apps_ENERGY, @@ -385,7 +383,6 @@ enum KernelID { Apps_MASS3DPA, Apps_PRESSURE, Apps_VOL3D, -*/ // // Algorithm kernels... 
diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index de20e6e4d..69640aaaa 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -507,12 +507,13 @@ void RunParams::printKernelNames(std::ostream& str) const { str << "\nAvailable kernels:"; str << "\n------------------\n"; - for (int kid = 0; kid < NumKernels; ++kid) { -/// RDH DISABLE COUPLE KERNEL - if (static_cast(kid) != Apps_COUPLE) { - str << getKernelName(static_cast(kid)) << std::endl; - } - } +// TODO DZP reimplement +// for (int kid = 0; kid < NumKernels; ++kid) { +///// RDH DISABLE COUPLE KERNEL +// if (static_cast(kid) != Apps_COUPLE) { +// str << getKernelName(static_cast(kid)) << std::endl; +// } +// } str.flush(); } @@ -521,12 +522,13 @@ void RunParams::printFullKernelNames(std::ostream& str) const { str << "\nAvailable kernels (_):"; str << "\n-----------------------------------------\n"; - for (int kid = 0; kid < NumKernels; ++kid) { -/// RDH DISABLE COUPLE KERNEL - if (static_cast(kid) != Apps_COUPLE) { - str << getFullKernelName(static_cast(kid)) << std::endl; - } - } +// TODO DZP: reimplement +// for (int kid = 0; kid < NumKernels; ++kid) { +///// RDH DISABLE COUPLE KERNEL +// if (static_cast(kid) != Apps_COUPLE) { +// str << getFullKernelName(static_cast(kid)) << std::endl; +// } +// } str.flush(); } @@ -569,17 +571,18 @@ void RunParams::printFeatureKernels(std::ostream& str) const for (int fid = 0; fid < NumFeatures; ++fid) { FeatureID tfid = static_cast(fid); str << getFeatureName(tfid) << std::endl; - for (int kid = 0; kid < NumKernels; ++kid) { - KernelID tkid = static_cast(kid); -/// RDH DISABLE COUPLE KERNEL - if (tkid != Apps_COUPLE) { - KernelBase* kern = getKernelObject(tkid, *this); - if ( kern->usesFeature(tfid) ) { - str << "\t" << getFullKernelName(tkid) << std::endl; - } - delete kern; - } - } // loop over kernels +// TODO DZP: reimplement +// for (int kid = 0; kid < NumKernels; ++kid) { +// KernelID tkid = static_cast(kid); +///// RDH DISABLE 
COUPLE KERNEL +// if (tkid != Apps_COUPLE) { +// KernelBase* kern = getKernelObject(tkid, *this); +// if ( kern->usesFeature(tfid) ) { +// str << "\t" << getFullKernelName(tkid) << std::endl; +// } +// delete kern; +// } +// } // loop over kernels str << std::endl; } // loop over features str.flush(); @@ -589,21 +592,22 @@ void RunParams::printKernelFeatures(std::ostream& str) const { str << "\nAvailable kernels and features each uses:"; str << "\n-----------------------------------------\n"; - for (int kid = 0; kid < NumKernels; ++kid) { - KernelID tkid = static_cast(kid); -/// RDH DISABLE COUPLE KERNEL - if (tkid != Apps_COUPLE) { - str << getFullKernelName(tkid) << std::endl; - KernelBase* kern = getKernelObject(tkid, *this); - for (int fid = 0; fid < NumFeatures; ++fid) { - FeatureID tfid = static_cast(fid); - if ( kern->usesFeature(tfid) ) { - str << "\t" << getFeatureName(tfid) << std::endl; - } - } // loop over features - delete kern; - } - } // loop over kernels +// TODO DZP: reimplement +// for (int kid = 0; kid < NumKernels; ++kid) { +// KernelID tkid = static_cast(kid); +///// RDH DISABLE COUPLE KERNEL +// if (tkid != Apps_COUPLE) { +// str << getFullKernelName(tkid) << std::endl; +// KernelBase* kern = getKernelObject(tkid, *this); +// for (int fid = 0; fid < NumFeatures; ++fid) { +// FeatureID tfid = static_cast(fid); +// if ( kern->usesFeature(tfid) ) { +// str << "\t" << getFeatureName(tfid) << std::endl; +// } +// } // loop over features +// delete kern; +// } +// } // loop over kernels str.flush(); } diff --git a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp index eefda95a4..8cfae031b 100644 --- a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp +++ b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp @@ -22,7 +22,7 @@ void DIFF_PREDICT::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); 
DIFF_PREDICT_DATA_SETUP; diff --git a/src/lcals-kokkos/EOS-Kokkos.cpp b/src/lcals-kokkos/EOS-Kokkos.cpp index 9fc824c83..b0b1f7403 100644 --- a/src/lcals-kokkos/EOS-Kokkos.cpp +++ b/src/lcals-kokkos/EOS-Kokkos.cpp @@ -22,7 +22,7 @@ void EOS::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); EOS_DATA_SETUP; diff --git a/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp index 68e312495..a1714a382 100644 --- a/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp @@ -23,7 +23,7 @@ void FIRST_DIFF::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); FIRST_DIFF_DATA_SETUP; diff --git a/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp index 73ea40504..6d9502638 100644 --- a/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp @@ -23,7 +23,7 @@ void FIRST_MIN::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); FIRST_MIN_DATA_SETUP; diff --git a/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp index 653cac0d1..f4a5d4b77 100644 --- a/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp @@ -22,7 +22,7 @@ void FIRST_SUM::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); FIRST_SUM_DATA_SETUP; diff --git a/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp index 9bf21a15c..8120687c9 100644 --- 
a/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp +++ b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp @@ -22,7 +22,7 @@ void GEN_LIN_RECUR::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 1; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); GEN_LIN_RECUR_DATA_SETUP; diff --git a/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp index 5b3c3a544..b8188a995 100644 --- a/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp +++ b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp @@ -22,7 +22,7 @@ void HYDRO_1D::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); HYDRO_1D_DATA_SETUP; @@ -129,7 +129,7 @@ void HYDRO_1D::runKokkosVariant(VariantID vid) // ATTN: Adjust arr dimensions to be congruent with the setup // in the .cpp file: - // m_array_length = getRunSize() + 12; + // m_array_length = getActualProblemSize() + 12; moveDataToHostFromKokkosView(x, x_view, iend + 12); diff --git a/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp index c9ef8c430..04c49ff5f 100644 --- a/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp +++ b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp @@ -23,7 +23,7 @@ void INT_PREDICT::runKokkosVariant(VariantID vid) const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); INT_PREDICT_DATA_SETUP; diff --git a/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp index 5a850cae6..0c74ad017 100644 --- a/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp +++ b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp @@ -23,7 +23,7 @@ void PLANCKIAN::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = 
getActualProblemSize(); PLANCKIAN_DATA_SETUP; diff --git a/src/stream-kokkos/ADD-Kokkos.cpp b/src/stream-kokkos/ADD-Kokkos.cpp index 48d5bd20a..10171b1dc 100644 --- a/src/stream-kokkos/ADD-Kokkos.cpp +++ b/src/stream-kokkos/ADD-Kokkos.cpp @@ -24,14 +24,14 @@ void ADD::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); */ void ADD::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); ADD_DATA_SETUP; diff --git a/src/stream-kokkos/COPY-Kokkos.cpp b/src/stream-kokkos/COPY-Kokkos.cpp index 01b1f19b5..b896ea4e2 100644 --- a/src/stream-kokkos/COPY-Kokkos.cpp +++ b/src/stream-kokkos/COPY-Kokkos.cpp @@ -22,7 +22,7 @@ void COPY::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); */ void COPY::runKokkosVariant(VariantID vid) @@ -30,7 +30,7 @@ void COPY::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); COPY_DATA_SETUP; diff --git a/src/stream-kokkos/DOT-Kokkos.cpp b/src/stream-kokkos/DOT-Kokkos.cpp index 06a2040b8..7b6519768 100644 --- a/src/stream-kokkos/DOT-Kokkos.cpp +++ b/src/stream-kokkos/DOT-Kokkos.cpp @@ -22,7 +22,7 @@ void DOT::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); */ @@ -30,7 +30,7 @@ void DOT::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend 
= getActualProblemSize(); diff --git a/src/stream-kokkos/MUL-Kokkos.cpp b/src/stream-kokkos/MUL-Kokkos.cpp index 24b74b4ea..10809b74d 100644 --- a/src/stream-kokkos/MUL-Kokkos.cpp +++ b/src/stream-kokkos/MUL-Kokkos.cpp @@ -22,14 +22,14 @@ void MUL::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); */ void MUL::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); MUL_DATA_SETUP; diff --git a/src/stream-kokkos/TRIAD-Kokkos.cpp b/src/stream-kokkos/TRIAD-Kokkos.cpp index ffac1e2c5..9fbef444f 100644 --- a/src/stream-kokkos/TRIAD-Kokkos.cpp +++ b/src/stream-kokkos/TRIAD-Kokkos.cpp @@ -22,14 +22,14 @@ void TRIAD::runSeqVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); */ void TRIAD::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; - const Index_type iend = getRunSize(); + const Index_type iend = getActualProblemSize(); TRIAD_DATA_SETUP; From c13b61c6c9444dfc9e0ac0ec36570cd738687012 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 21 Jul 2021 16:52:29 -0600 Subject: [PATCH 096/124] CMakeLists.txt: fix amd arch --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fc9069999..bed34c79b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,7 +127,7 @@ if(ENABLE_KOKKOS) add_definitions(-DRUN_KOKKOS) if(ENABLE_HIP) set(Kokkos_ENABLE_HIP ON CACHE BOOL "Kokkos builds with AMD HIP require a ... 
build...AJP FINISH") - set(Kokkos_ARCH_VEGA900 ON CACHE BOOL "Docstring") #TODO: better + #set(Kokkos_ARCH_VEGA900 ON CACHE BOOL "Docstring") #TODO: better #set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE /ascldap/users/ajpowel/RAJAPerf/amd_build/compiler_unscrewer) endif() if(ENABLE_TARGET_OPENMP) From 298cd4c0d09e311a2e55a4b55ecb1246f775709b Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 27 Jul 2021 10:39:15 -0600 Subject: [PATCH 097/124] watchr_KokkosConfig.json: updated script --- scripts/config/watchr_KokkosConfig.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/config/watchr_KokkosConfig.json b/scripts/config/watchr_KokkosConfig.json index c34a3e262..a056b8b69 100755 --- a/scripts/config/watchr_KokkosConfig.json +++ b/scripts/config/watchr_KokkosConfig.json @@ -25,6 +25,7 @@ "dataLines" : [ { "name" : "Data Line", + "template" : "Line_Template", "x" : { "getPath": "*", "getElement" : "performance-report", @@ -50,6 +51,7 @@ "category" : "Kokkos_Lambda_Seq", "dataLines" : [ { + "inherit" : "Line_Template", "y" : { "getKey" : "Kokkos_Lambda_Seq" } @@ -60,6 +62,7 @@ "category" : "Base_CUDA", "dataLines" : [ { + "inherit" : "Line_Template", "y" : { "getKey" : "Base_CUDA" } @@ -70,6 +73,7 @@ "category" : "Base_Seq", "dataLines" : [ { + "inherit" : "Line_Template", "y" : { "getKey" : "Base_Seq" } @@ -80,6 +84,7 @@ "category" : "Lambda_Seq", "dataLines" : [ { + "inherit" : "Line_Template", "y" : { "getKey" : "Lambda_Seq" } @@ -90,6 +95,7 @@ "category" : "RAJA_CUDA", "dataLines" : [ { + "inherit" : "Line_Template", "y" : { "getKey" : "RAJA_CUDA" } @@ -100,6 +106,7 @@ "category" : "RAJA_Seq", "dataLines" : [ { + "inherit" : "Line_Template", "y" : { "getKey" : "RAJA_Seq" } From 92f8f4d84b8e7aacf63fc0ea6446579eb09ac1c3 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 27 Jul 2021 11:38:08 -0600 Subject: [PATCH 098/124] deconflicting merge: apps --- src/apps/ENERGY.hpp | 1 - src/apps/FIR.hpp | 1 - src/apps/HALOEXCHANGE.hpp | 1 - 
src/apps/LTIMES.hpp | 1 - src/apps/LTIMES_NOVIEW.hpp | 1 - src/apps/PRESSURE.hpp | 1 - src/apps/VOL3D.hpp | 1 - src/apps/WIP-COUPLE.hpp | 2 +- 8 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 767cc95aa..c34dd80f7 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -203,7 +203,6 @@ class ENERGY : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosVariant(VariantID vid); void runKokkosVariant(VariantID vid); diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index f6ffc98c5..0f8c04d1a 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -80,7 +80,6 @@ class FIR : public KernelBase void runOpenMPTargetVariant(VariantID vid); void runKokkosVariant(VariantID vid); - void runKokkosVariant(VariantID vid); private: Real_ptr m_in; diff --git a/src/apps/HALOEXCHANGE.hpp b/src/apps/HALOEXCHANGE.hpp index aa6cd8fa6..d3967a881 100644 --- a/src/apps/HALOEXCHANGE.hpp +++ b/src/apps/HALOEXCHANGE.hpp @@ -95,7 +95,6 @@ class HALOEXCHANGE : public KernelBase void runOpenMPTargetVariant(VariantID vid); void runKokkosVariant(VariantID vid); - void runKokkosVariant(VariantID vid); private: static const int s_num_neighbors = 26; diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index b68e34323..c79ffcb7c 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -116,7 +116,6 @@ class LTIMES : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosVariant(VariantID vid); void runKokkosVariant(VariantID vid); diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index aa502a1bb..de2b15c34 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -66,7 +66,6 @@ class LTIMES_NOVIEW : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void 
runOpenMPTargetVariant(VariantID vid); - void runKokkosVariant(VariantID vid); void runKokkosVariant(VariantID vid); diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 9975ea7ea..091e912d4 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -72,7 +72,6 @@ class PRESSURE : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosVariant(VariantID vid); void runKokkosVariant(VariantID vid); diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index b90cfa95d..cb385401b 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -169,7 +169,6 @@ class VOL3D : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - void runKokkosVariant(VariantID vid); void runKokkosVariant(VariantID vid); diff --git a/src/apps/WIP-COUPLE.hpp b/src/apps/WIP-COUPLE.hpp index d9738e12d..399d98aa9 100644 --- a/src/apps/WIP-COUPLE.hpp +++ b/src/apps/WIP-COUPLE.hpp @@ -171,7 +171,7 @@ class COUPLE : public KernelBase void runCudaVariant(VariantID vid) {(void) vid;} void runHipVariant(VariantID vid) {(void) vid;} void runOpenMPTargetVariant(VariantID vid) {(void) vid;} - void runKokkosVariant(VariantID vid); + //void runKokkosVariant(VariantID vid); void runKokkosVariant(VariantID vid) {(void) vid;} From efbfb4fdfe6a0896601532888d205670b39ad705 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 27 Jul 2021 17:13:49 -0600 Subject: [PATCH 099/124] watchr_KokkosConfig.json: fix RAJAPerf name --- scripts/config/watchr_KokkosConfig.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/config/watchr_KokkosConfig.json b/scripts/config/watchr_KokkosConfig.json index a056b8b69..3649444c9 100755 --- a/scripts/config/watchr_KokkosConfig.json +++ b/scripts/config/watchr_KokkosConfig.json @@ -1,7 +1,7 @@ { "plots" : { "files" : { - "fileName": "RAJAPerfSuite_*", + "fileName": 
"RAJAPerf*", "type" : "xml", "ignoreOldFiles" : true, "recurseDirectories" : true From cf7e93ab844a9d58c36ddd251e71f084c563adc1 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 28 Jul 2021 11:26:38 -0600 Subject: [PATCH 100/124] watchr_KokkosConfig.json: fix up config --- scripts/config/watchr_KokkosConfig.json | 44 ++++++++++++------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/scripts/config/watchr_KokkosConfig.json b/scripts/config/watchr_KokkosConfig.json index 3649444c9..888531393 100755 --- a/scripts/config/watchr_KokkosConfig.json +++ b/scripts/config/watchr_KokkosConfig.json @@ -7,20 +7,20 @@ "recurseDirectories" : true }, "categories": [ - "Kokkos_Lambda_CUDA", - "Kokkos_Lambda_Seq", - "Base_CUDA", - "Base_Seq", - "Lambda_Seq", - "RAJA_CUDA", - "RAJA_Seq" + "kokkos_lambda", + "lambda_cuda", + "base_cuda", + "base_seq", + "lambda_seq", + "raja_cuda", + "raja_seq" ], "plot" : [ { "autoname" : { "useProperty" : "y/path" }, - "category" : "Kokkos_Lambda_CUDA", + "category" : "kokkos_lambda", "template" : "kokkos_template", "dataLines" : [ { @@ -36,7 +36,7 @@ "getElement" : "performance-report|timing", "getPath": "*/kokkos_perf_suite/*", "getPathAttribute": "name", - "getKey" : "Kokkos_Lambda_CUDA", + "getKey" : "kokkos_lambda", "unit" : "seconds", "strategy" : { "getFirstMatchOnly" : "false", @@ -48,67 +48,67 @@ ] }, { "inherit" : "kokkos_template", - "category" : "Kokkos_Lambda_Seq", + "category" : "kokkos_lambda", "dataLines" : [ { "inherit" : "Line_Template", "y" : { - "getKey" : "Kokkos_Lambda_Seq" + "getKey" : "kokkos_lambda" } } ] }, { "inherit" : "kokkos_template", - "category" : "Base_CUDA", + "category" : "base_cuda", "dataLines" : [ { "inherit" : "Line_Template", "y" : { - "getKey" : "Base_CUDA" + "getKey" : "base_cuda" } } ] }, { "inherit" : "kokkos_template", - "category" : "Base_Seq", + "category" : "base_seq", "dataLines" : [ { "inherit" : "Line_Template", "y" : { - "getKey" : "Base_Seq" + "getKey" : "base_seq" } } ] 
}, { "inherit" : "kokkos_template", - "category" : "Lambda_Seq", + "category" : "lambda_seq", "dataLines" : [ { "inherit" : "Line_Template", "y" : { - "getKey" : "Lambda_Seq" + "getKey" : "lambda_seq" } } ] }, { "inherit" : "kokkos_template", - "category" : "RAJA_CUDA", + "category" : "raja_cuda", "dataLines" : [ { "inherit" : "Line_Template", "y" : { - "getKey" : "RAJA_CUDA" + "getKey" : "raja_cuda" } } ] }, { "inherit" : "kokkos_template", - "category" : "RAJA_Seq", + "category" : "raja_seq", "dataLines" : [ { "inherit" : "Line_Template", "y" : { - "getKey" : "RAJA_Seq" + "getKey" : "raja_seq" } } ] @@ -118,7 +118,7 @@ "graphDisplay": { "dbLocation" : "root", "page" : 1, - "displayCategory" : "Kokkos_Lambda_CUDA", + "displayCategory" : "kokkos_lambda", "displayRange" : 30, "graphWidth" : 450, "graphHeight" : 450, From da4f6864ff8afd53c91678f851fddbc28138221c Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 29 Jul 2021 10:46:06 -0600 Subject: [PATCH 101/124] watchr_KokkosConfig.json:rm timestamp for plotting --- scripts/config/watchr_KokkosConfig.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/config/watchr_KokkosConfig.json b/scripts/config/watchr_KokkosConfig.json index 888531393..3cfd66053 100755 --- a/scripts/config/watchr_KokkosConfig.json +++ b/scripts/config/watchr_KokkosConfig.json @@ -4,7 +4,8 @@ "fileName": "RAJAPerf*", "type" : "xml", "ignoreOldFiles" : true, - "recurseDirectories" : true + "recurseDirectories" : true, + "formatByRemovingPrefix" : "\\/RAJAPerf\\d{4}\\-\\d{2}\\-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.xml\\/" }, "categories": [ "kokkos_lambda", From 59ecd95a0c1799fb14bb05048153cb6baa185e80 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 29 Jul 2021 12:47:18 -0600 Subject: [PATCH 102/124] watchr_KokkosConfig.json: regex fix --- scripts/config/watchr_KokkosConfig.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/config/watchr_KokkosConfig.json 
b/scripts/config/watchr_KokkosConfig.json index 3cfd66053..613ecefa7 100755 --- a/scripts/config/watchr_KokkosConfig.json +++ b/scripts/config/watchr_KokkosConfig.json @@ -1,11 +1,11 @@ { "plots" : { "files" : { - "fileName": "RAJAPerf*", + "fileName": "RAJAPerf-timing_*", "type" : "xml", "ignoreOldFiles" : true, "recurseDirectories" : true, - "formatByRemovingPrefix" : "\\/RAJAPerf\\d{4}\\-\\d{2}\\-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.xml\\/" + "formatByRemovingPrefix" : "\\/RAJAPerf-timing_\\d{4}\\_\\d{2}\\_\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.xml\\/" }, "categories": [ "kokkos_lambda", From 8f7e27b3eab2001fe7009b21d8c8253afc40d349 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 29 Jul 2021 13:55:53 -0600 Subject: [PATCH 103/124] watchr_KokkosConfig.json: fixup regex --- scripts/config/watchr_KokkosConfig.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/config/watchr_KokkosConfig.json b/scripts/config/watchr_KokkosConfig.json index 613ecefa7..e8e6d50b2 100755 --- a/scripts/config/watchr_KokkosConfig.json +++ b/scripts/config/watchr_KokkosConfig.json @@ -1,11 +1,10 @@ { "plots" : { "files" : { - "fileName": "RAJAPerf-timing_*", + "fileName": "RAJAPerf*", "type" : "xml", "ignoreOldFiles" : true, - "recurseDirectories" : true, - "formatByRemovingPrefix" : "\\/RAJAPerf-timing_\\d{4}\\_\\d{2}\\_\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.xml\\/" + "recurseDirectories" : true }, "categories": [ "kokkos_lambda", @@ -19,7 +18,8 @@ "plot" : [ { "autoname" : { - "useProperty" : "y/path" + "useProperty" : "y/path", + "formatByRemovingPrefix" : "\\/RAJAPerf\\d{4}\\-\\d{2}\\-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.xml\\/" }, "category" : "kokkos_lambda", "template" : "kokkos_template", From 8d4723c5e918b77c9a4d7a4c314b354dbb7c15ed Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Fri, 30 Jul 2021 11:25:27 -0600 Subject: [PATCH 104/124] watchr_KokkosConfig.json: fix lambda_cuda --- scripts/config/watchr_KokkosConfig.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) 
diff --git a/scripts/config/watchr_KokkosConfig.json b/scripts/config/watchr_KokkosConfig.json index e8e6d50b2..dbd0c2353 100755 --- a/scripts/config/watchr_KokkosConfig.json +++ b/scripts/config/watchr_KokkosConfig.json @@ -81,6 +81,7 @@ } ] }, { + "inherit" : "kokkos_template", "category" : "lambda_seq", "dataLines" : [ @@ -92,6 +93,17 @@ } ] }, { + "inherit" : "kokkos_template", + "category" : "lambda_cuda", + "dataLines" : [ + { + "inherit" : "Line_Template", + "y" : { + "getKey" : "lambda_cuda" + } + } + ] + }, { "inherit" : "kokkos_template", "category" : "raja_cuda", "dataLines" : [ From 4c4f1035cedafd5afa2ed3fcc3fbcf07b7d6835b Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Fri, 30 Jul 2021 15:06:13 -0600 Subject: [PATCH 105/124] watchr_KokkosConfig.json: format lambda_cuda --- scripts/config/watchr_KokkosConfig.json | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/config/watchr_KokkosConfig.json b/scripts/config/watchr_KokkosConfig.json index dbd0c2353..1ab86b3e9 100755 --- a/scripts/config/watchr_KokkosConfig.json +++ b/scripts/config/watchr_KokkosConfig.json @@ -93,16 +93,16 @@ } ] }, { - "inherit" : "kokkos_template", - "category" : "lambda_cuda", - "dataLines" : [ - { - "inherit" : "Line_Template", - "y" : { - "getKey" : "lambda_cuda" - } - } - ] + "inherit" : "kokkos_template", + "category" : "lambda_cuda", + "dataLines" : [ + { + "inherit" : "Line_Template", + "y" : { + "getKey" : "lambda_cuda" + } + } + ] }, { "inherit" : "kokkos_template", "category" : "raja_cuda", From 170490d677314e9f46f92dc6a6f586eae5a5a5e1 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 2 Aug 2021 17:03:33 -0600 Subject: [PATCH 106/124] omptarget: fix up build errors --- src/CMakeLists.txt | 17 +++-- src/apps-kokkos/AppsData.cpp | 113 --------------------------------- src/apps-kokkos/CMakeLists.txt | 3 +- 3 files changed, 13 insertions(+), 120 deletions(-) delete mode 100644 src/apps-kokkos/AppsData.cpp diff --git 
a/src/CMakeLists.txt b/src/CMakeLists.txt index 8887a1ba2..6f1424f4b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -40,6 +40,11 @@ list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) if(ENABLE_TARGET_OPENMP) remove_definitions(-DRUN_RAJA_SEQ -DRUN_OPENMP ) include_directories(basic) +include_directories(lcals) +include_directories(apps) +include_directories(algorithm) +include_directories(stream) +include_directories(polybench) blt_add_executable( NAME raja-perf-omptarget.exe SOURCES RAJAPerfSuiteDriver.cpp @@ -76,7 +81,6 @@ blt_add_executable( apps/VOL3D-OMPTarget.cpp #apps/WIP-COUPLE.cpp #Kokkos bloc - apps-kokkos/AppsData.cpp apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp apps-kokkos/ENERGY-Kokkos.cpp apps-kokkos/FIR-Kokkos.cpp @@ -86,9 +90,12 @@ blt_add_executable( apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp apps-kokkos/VOL3D-Kokkos.cpp #apps-kokkos/WIP-COUPLE.cpp - basic/ATOMIC_PI.cpp - basic/ATOMIC_PI-Seq.cpp - basic/ATOMIC_PI-OMPTarget.cpp + #basic/ATOMIC_PI.cpp + #basic/ATOMIC_PI-Seq.cpp + #basic/ATOMIC_PI-OMPTarget.cpp + basic/PI_ATOMIC.cpp + basic/PI_ATOMIC-Seq.cpp + basic/PI_ATOMIC-OMPTarget.cpp basic/DAXPY.cpp basic/DAXPY-Seq.cpp basic/DAXPY-OMPTarget.cpp @@ -122,7 +129,7 @@ blt_add_executable( basic/TRAP_INT.cpp basic/TRAP_INT-Seq.cpp basic/TRAP_INT-OMPTarget.cpp - basic-kokkos/ATOMIC_PI-Kokkos.cpp + basic-kokkos/PI_ATOMIC-Kokkos.cpp basic-kokkos/DAXPY-Kokkos.cpp basic-kokkos/IF_QUAD-Kokkos.cpp basic-kokkos/INIT3-Kokkos.cpp diff --git a/src/apps-kokkos/AppsData.cpp b/src/apps-kokkos/AppsData.cpp deleted file mode 100644 index b3c042162..000000000 --- a/src/apps-kokkos/AppsData.cpp +++ /dev/null @@ -1,113 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC -// and RAJA Performance Suite project contributors. -// See the RAJAPerf/COPYRIGHT file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include "AppsData.hpp" - -#include - -namespace rajaperf -{ -namespace apps -{ - -// -// Set mesh positions for 2d mesh. -// -void setMeshPositions_2d(Real_ptr x, Real_type dx, - Real_ptr y, Real_type dy, - const ADomain& domain) -{ - if (domain.ndims != 2) { - std::cout << "\n******* ERROR!!! domain is not 2d *******" << std::endl; - return; - } - - Index_type imin = domain.imin; - Index_type imax = domain.imax; - Index_type jmin = domain.jmin; - Index_type jmax = domain.jmax; - - Index_type jp = domain.jp; - - Index_type npnl = domain.NPNL; - Index_type npnr = domain.NPNR; - - Real_ptr x1, x2, x3, x4; - Real_ptr y1, y2, y3, y4; - NDSET2D(domain.jp, x, x1,x2,x3,x4) ; - NDSET2D(domain.jp, y, y1,y2,y3,y4) ; - - for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { - for (Index_type i = imin - npnl; i < imax + npnr; i++) { - Index_type iz = i + j*jp ; - - x3[iz] = x4[iz] = i*dx; - x1[iz] = x2[iz] = (i+1)*dx; - - y1[iz] = y4[iz] = j*dy; - y2[iz] = y3[iz] = (j+1)*dy; - - } - } -} - - -// -// Set mesh positions for 2d mesh. -// -void setMeshPositions_3d(Real_ptr x, Real_type dx, - Real_ptr y, Real_type dy, - Real_ptr z, Real_type dz, - const ADomain& domain) -{ - if (domain.ndims != 3) { - std::cout << "\n******* ERROR!!! 
domain is not 3d *******" << std::endl; - return; - } - - Index_type imin = domain.imin; - Index_type imax = domain.imax; - Index_type jmin = domain.jmin; - Index_type jmax = domain.jmax; - Index_type kmin = domain.kmin; - Index_type kmax = domain.kmax; - - Index_type jp = domain.jp; - Index_type kp = domain.kp; - - Index_type npnl = domain.NPNL; - Index_type npnr = domain.NPNR; - - Real_ptr x0, x1, x2, x3, x4, x5, x6, x7; - Real_ptr y0, y1, y2, y3, y4, y5, y6, y7; - Real_ptr z0, z1, z2, z3, z4, z5, z6, z7; - NDPTRSET(domain.jp, domain.kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - NDPTRSET(domain.jp, domain.kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; - NDPTRSET(domain.jp, domain.kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; - - for (Index_type k = kmin - npnl; k < kmax + npnr; k++) { - for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { - for (Index_type i = imin - npnl; i < imax + npnr; i++) { - Index_type iz = i + j*jp + kp*k ; - - x0[iz] = x2[iz] = x4[iz] = x6[iz] = i*dx; - x1[iz] = x3[iz] = x5[iz] = x7[iz] = (i+1)*dx; - - y0[iz] = y1[iz] = y4[iz] = y5[iz] = j*dy; - y2[iz] = y3[iz] = y6[iz] = y7[iz] = (j+1)*dy; - - z0[iz] = z1[iz] = z2[iz] = z3[iz] = k*dz; - z4[iz] = z5[iz] = z6[iz] = z7[iz] = (k+1)*dz; - - } - } - } -} - -} // end namespace apps -} // end namespace rajaperf diff --git a/src/apps-kokkos/CMakeLists.txt b/src/apps-kokkos/CMakeLists.txt index daa85c881..07802d26e 100644 --- a/src/apps-kokkos/CMakeLists.txt +++ b/src/apps-kokkos/CMakeLists.txt @@ -11,8 +11,7 @@ include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../apps) blt_add_library( NAME apps-kokkos - SOURCES AppsData.cpp - DEL_DOT_VEC_2D-Kokkos.cpp + SOURCES DEL_DOT_VEC_2D-Kokkos.cpp ENERGY-Kokkos.cpp FIR-Kokkos.cpp HALOEXCHANGE-Kokkos.cpp From 4766ccb893e0f653be7b173ca62a6f96def213b4 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 3 Aug 2021 15:17:43 -0600 Subject: [PATCH 107/124] watchr_KokkosConfig.json: fix lambda_cuda --- scripts/config/watchr_KokkosConfig.json | 23 ++++++++++++----------- 1 file changed, 12 
insertions(+), 11 deletions(-) diff --git a/scripts/config/watchr_KokkosConfig.json b/scripts/config/watchr_KokkosConfig.json index 1ab86b3e9..a9cc9affa 100755 --- a/scripts/config/watchr_KokkosConfig.json +++ b/scripts/config/watchr_KokkosConfig.json @@ -47,52 +47,53 @@ "color" : "202,77,77" } ] - }, { + },{ "inherit" : "kokkos_template", - "category" : "kokkos_lambda", + "category" : "base_cuda", "dataLines" : [ { "inherit" : "Line_Template", "y" : { - "getKey" : "kokkos_lambda" + "getKey" : "base_cuda" } } ] }, { "inherit" : "kokkos_template", - "category" : "base_cuda", + "category" : "base_seq", "dataLines" : [ { "inherit" : "Line_Template", "y" : { - "getKey" : "base_cuda" + "getKey" : "base_seq" } } ] }, { + "inherit" : "kokkos_template", - "category" : "base_seq", + "category" : "lambda_seq", "dataLines" : [ { "inherit" : "Line_Template", "y" : { - "getKey" : "base_seq" + "getKey" : "lambda_seq" } } ] - }, { + },{ "inherit" : "kokkos_template", - "category" : "lambda_seq", + "category" : "lambda_cuda", "dataLines" : [ { "inherit" : "Line_Template", "y" : { - "getKey" : "lambda_seq" + "getKey" : "lambda_cuda" } } ] - }, { + }, { "inherit" : "kokkos_template", "category" : "lambda_cuda", "dataLines" : [ From 3e331dc669a77010b57bd0bccfcdbcfecb5cc8b6 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 3 Aug 2021 15:20:18 -0600 Subject: [PATCH 108/124] watchr_KokkosConfig.json: fix lambda_cuda --- scripts/config/watchr_KokkosConfig.json | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/scripts/config/watchr_KokkosConfig.json b/scripts/config/watchr_KokkosConfig.json index a9cc9affa..ec10e2e8a 100755 --- a/scripts/config/watchr_KokkosConfig.json +++ b/scripts/config/watchr_KokkosConfig.json @@ -93,18 +93,7 @@ } } ] - }, { - "inherit" : "kokkos_template", - "category" : "lambda_cuda", - "dataLines" : [ - { - "inherit" : "Line_Template", - "y" : { - "getKey" : "lambda_cuda" - } - } - ] - }, { + },{ "inherit" : "kokkos_template", 
"category" : "raja_cuda", "dataLines" : [ From 6af5a71645c319f523d7c429bb3758dda7b6dc06 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 14 Sep 2021 13:54:12 -0600 Subject: [PATCH 109/124] Kokkos translations: apps, algorithm --- src/CMakeLists.txt | 16 +- src/algorithm/SORT.cpp | 1 + src/algorithm/SORT.hpp | 3 + src/algorithm/SORTPAIRS.cpp | 1 + src/algorithm/SORTPAIRS.hpp | 2 + src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp | 324 +++++++++++++++++----- src/apps-kokkos/ENERGY-Kokkos.cpp | 167 +++++++++-- src/apps-kokkos/FIR-Kokkos.cpp | 55 +++- src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp | 60 +++- src/apps-kokkos/LTIMES-Kokkos.cpp | 53 +++- src/apps-kokkos/PRESSURE-Kokkos.cpp | 74 ++++- src/apps-kokkos/VOL3D-Kokkos.cpp | 162 ++++++++++- src/apps/AppsData.hpp | 4 +- src/apps/DEL_DOT_VEC_2D.cpp | 2 + src/apps/ENERGY.cpp | 1 + src/apps/FIR.cpp | 5 + src/apps/HALOEXCHANGE.cpp | 1 + src/apps/LTIMES.cpp | 2 + src/apps/PRESSURE.cpp | 2 + src/apps/VOL3D.cpp | 2 + src/common/RAJAPerfSuite.cpp | 9 +- src/common/RAJAPerfSuite.hpp | 4 +- src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp | 100 +++---- src/stream-kokkos/ADD-Kokkos.cpp | 7 - src/stream-kokkos/DOT-Kokkos.cpp | 23 +- 25 files changed, 866 insertions(+), 214 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8887a1ba2..c656586cc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,7 +19,8 @@ add_subdirectory(lcals-kokkos) #add_subdirectory(polybench) add_subdirectory(stream) add_subdirectory(stream-kokkos) -#add_subdirectory(algorithm) +add_subdirectory(algorithm) +add_subdirectory(algorithm-kokkos) set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS common @@ -33,7 +34,8 @@ set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS #polybench stream stream-kokkos - #algorithm + algorithm + algorithm-kokkos ) list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) @@ -236,6 +238,12 @@ blt_add_executable( stream-kokkos/DOT-Kokkos.cpp stream-kokkos/MUL-Kokkos.cpp stream-kokkos/TRIAD-Kokkos.cpp + algorithm/SORT.cpp + 
algorithm/SORT-Seq.cpp + algorithm/SORTPAIRS.cpp + algorithm/SORTPAIRS-Seq.cpp + algorithm-kokkos/SORT-Kokkos.cpp + algorithm-kokkos/SORTPAIRS-Kokkos.cpp common/DataUtils.cpp common/Executor.cpp common/KernelBase.cpp @@ -243,10 +251,6 @@ blt_add_executable( common/RAJAPerfSuite.cpp common/RPTypes.hpp common/RunParams.cpp - #algorithm/SORT.cpp - #algorithm/SORT-Seq.cpp - #algorithm/SORTPAIRS.cpp - #algorithm/SORTPAIRS-Seq.cpp DEPENDS_ON ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/algorithm/SORT.cpp b/src/algorithm/SORT.cpp index ca02173a1..242a5f149 100644 --- a/src/algorithm/SORT.cpp +++ b/src/algorithm/SORT.cpp @@ -41,6 +41,7 @@ SORT::SORT(const RunParams& params) setVariantDefined( RAJA_CUDA ); setVariantDefined( RAJA_HIP ); + setVariantDefined(Kokkos_Lambda); } SORT::~SORT() diff --git a/src/algorithm/SORT.hpp b/src/algorithm/SORT.hpp index 39c1d952f..b8b3c969e 100644 --- a/src/algorithm/SORT.hpp +++ b/src/algorithm/SORT.hpp @@ -46,6 +46,7 @@ class SORT : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runKokkosVariant(VariantID vid); void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); void runCudaVariant(VariantID vid); @@ -55,6 +56,8 @@ class SORT : public KernelBase std::cout << "\n SORT : Unknown OMP Target variant id = " << vid << std::endl; } + + private: Real_ptr m_x; }; diff --git a/src/algorithm/SORTPAIRS.cpp b/src/algorithm/SORTPAIRS.cpp index 41fa2ad05..8dc5d20d0 100644 --- a/src/algorithm/SORTPAIRS.cpp +++ b/src/algorithm/SORTPAIRS.cpp @@ -41,6 +41,7 @@ SORTPAIRS::SORTPAIRS(const RunParams& params) setVariantDefined( RAJA_CUDA ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Kokkos_Lambda ); } SORTPAIRS::~SORTPAIRS() diff --git a/src/algorithm/SORTPAIRS.hpp b/src/algorithm/SORTPAIRS.hpp index fe0b3a212..0fa7bb433 100644 --- a/src/algorithm/SORTPAIRS.hpp +++ b/src/algorithm/SORTPAIRS.hpp @@ -54,6 +54,8 @@ class SORTPAIRS : public KernelBase std::cout << "\n SORTPAIRS : Unknown OMP 
Target variant id = " << vid << std::endl; } + void runKokkosVariant(VariantID vid); + private: Real_ptr m_x; Real_ptr m_i; diff --git a/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp b/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp index 83a4c7b3a..ed66ca3cb 100644 --- a/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp +++ b/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp @@ -16,16 +16,32 @@ #include -namespace rajaperf -{ -namespace apps -{ - - -void DEL_DOT_VEC_2D::runKokkosVariant(VariantID vid) -{ - //FIXME - return; +namespace rajaperf { +namespace apps { + +struct arrayOffSetStruct { + using ViewType = Kokkos::View; // Real_ptr is equivalent to float* + + // v's are offsets; + ViewType v, v4, v1, v2, v3; + + // constructor + arrayOffSetStruct(const std::string& name, // we needed a name, for future efforts + Index_type num_elements, // alloc size of head; + Index_type jp, // their macro took in jp, so we're using it + Real_ptr head // v, approximately; + ): + // ":" = list of things to initialize + v (getViewFromPointer(head, num_elements)), + // Initializing v4 with v + v4(v), + v1(Kokkos::subview(v4, std::make_pair(static_cast(1), v4.extent(0)))), + v2(Kokkos::subview(v1, std::make_pair(static_cast(jp), v1.extent(0)))), + v3(Kokkos::subview(v4, std::make_pair(static_cast(jp), v4.extent(0)))){ + } +}; + +void DEL_DOT_VEC_2D::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -33,78 +49,260 @@ void DEL_DOT_VEC_2D::runKokkosVariant(VariantID vid) DEL_DOT_VEC_2D_DATA_SETUP; - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - - switch ( vid ) { - - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type ii = ibegin ; ii < iend ; ++ii ) { - DEL_DOT_VEC_2D_BODY_INDEX; - DEL_DOT_VEC_2D_BODY; - } - + NDSET2D(m_domain->jp, x, x1, x2, x3, x4); + 
NDSET2D(m_domain->jp, y, y1, y2, y3, y4); + NDSET2D(m_domain->jp, xdot, fx1, fx2, fx3, fx4); + NDSET2D(m_domain->jp, ydot, fy1, fy2, fy3, fy4); + + // Instantiating Kokkos Views with getViewFromPointer + //auto x_view = getViewFromPointer(x, m_domain->nnalls); + //auto y_view = getViewFromPointer(y, iend); + //auto xdot_view = getViewFromPointer(xdot, iend); + //auto ydot_view = getViewFromPointer(ydot, iend); + auto div_view = getViewFromPointer(div, m_domain->nnalls); + + arrayOffSetStruct x_offsets("x_offsets", m_domain->nnalls, m_domain->jp, x ); + arrayOffSetStruct y_offsets("y_offsets", m_domain->nnalls, m_domain->jp, y ); + arrayOffSetStruct xdot_offsets("xdot_offsets", m_domain->nnalls, m_domain->jp, xdot ); + arrayOffSetStruct ydot_offsets("ydot_offsets", m_domain->nnalls, m_domain->jp, ydot ); + + auto& x_view = x_offsets.v; + auto& x1_view = x_offsets.v1; + auto& x2_view = x_offsets.v2; + auto& x3_view = x_offsets.v3; + auto& x4_view = x_offsets.v4; + + + auto& y_view = y_offsets.v; + auto& y1_view = y_offsets.v1; + auto& y2_view = y_offsets.v2; + auto& y3_view = y_offsets.v3; + auto& y4_view = y_offsets.v4; + + + auto& xdot_view = xdot_offsets.v; + auto& fx1_view = xdot_offsets.v1; + auto& fx2_view = xdot_offsets.v2; + auto& fx3_view = xdot_offsets.v3; + auto& fx4_view = xdot_offsets.v4; + + + auto& ydot_view = ydot_offsets.v; + auto& fy1_view = ydot_offsets.v1; + auto& fy2_view = ydot_offsets.v2; + auto& fy3_view = ydot_offsets.v3; + auto& fy4_view = ydot_offsets.v4; + + // Use Kokkos::Subviews + /* + auto x1_view = getViewFromPointer(x1, iend); + auto x2_view = getViewFromPointer(x2, iend); + auto x3_view = getViewFromPointer(x3, iend); + auto x4_view = getViewFromPointer(x4, iend); + + auto y1_view = getViewFromPointer(y1, iend); + auto y2_view = getViewFromPointer(y2, iend); + auto y3_view = getViewFromPointer(y3, iend); + auto y4_view = getViewFromPointer(y4, iend); + + auto fx1_view = getViewFromPointer(fx1, iend); + auto fx2_view = 
getViewFromPointer(fx2, iend); + auto fx3_view = getViewFromPointer(fx3, iend); + auto fx4_view = getViewFromPointer(fx4, iend); + + auto fy1_view = getViewFromPointer(fy1, iend); + auto fy2_view = getViewFromPointer(fy2, iend); + auto fy3_view = getViewFromPointer(fy3, iend); + auto fy4_view = getViewFromPointer(fy4, iend); + +*/ + +#if defined(RUN_KOKKOS) + switch (vid) { + + case Base_Seq: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type ii = ibegin; ii < iend; ++ii) { + DEL_DOT_VEC_2D_BODY_INDEX; + DEL_DOT_VEC_2D_BODY; } - stopTimer(); + } + stopTimer(); - break; - } + break; + } -#if defined(RUN_RAJA_SEQ) - case Lambda_Seq : { + // #if defined(RUN_RAJA_SEQ) + case Lambda_Seq: { - auto deldotvec2d_base_lam = [=](Index_type ii) { - DEL_DOT_VEC_2D_BODY_INDEX; - DEL_DOT_VEC_2D_BODY; - }; + auto deldotvec2d_base_lam = [=](Index_type ii) { + DEL_DOT_VEC_2D_BODY_INDEX; + DEL_DOT_VEC_2D_BODY; + }; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type ii = ibegin ; ii < iend ; ++ii ) { - deldotvec2d_base_lam(ii); - } + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for (Index_type ii = ibegin; ii < iend; ++ii) { + deldotvec2d_base_lam(ii); } - stopTimer(); - - break; } + stopTimer(); - case RAJA_Seq : { + break; + } + /* + case RAJA_Seq : { - camp::resources::Resource working_res{camp::resources::Host()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); + camp::resources::Resource working_res{camp::resources::Host()}; + RAJA::TypedListSegment zones(m_domain->real_zones, + m_domain->n_real_zones, + working_res); - auto deldotvec2d_lam = [=](Index_type i) { - DEL_DOT_VEC_2D_BODY; - }; + auto deldotvec2d_lam = [=](Index_type i) { + DEL_DOT_VEC_2D_BODY; + }; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - 
RAJA::forall(zones, deldotvec2d_lam); + RAJA::forall(zones, deldotvec2d_lam); - } - stopTimer(); + } + stopTimer(); - break; + break; + } + */ + + case Kokkos_Lambda: { + + // Host resource will be used for loop execution + // camp::resources::Resource working_res{camp::resources::Host()}; + + // List segment = indices you're iterating over are contained in lists; + + /* RAJA::TypedListSegment zones(m_domain->real_zones, + m_domain->n_real_zones, + working_res); + */ + auto deldotvec2d_lam = [=](Index_type i) { DEL_DOT_VEC_2D_BODY; }; + + auto index_list = + getViewFromPointer(m_domain->real_zones, m_domain->n_real_zones); + startTimer(); // pairs with the stopTimer() after the rep loop + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // RAJA::forall(zones, deldotvec2d_lam); + Kokkos::parallel_for( + "DEL_DOT_VEC_2D Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type ii) { + // #define DEL_DOT_VEC_2D_BODY + int i = index_list[ii]; + + // Real_type xi = half * ( x1[i] + x2[i] - x3[i] - x4[i] ) ; + Real_type xi = + half * (x1_view[i] + x2_view[i] - x3_view[i] - + x4_view[i]); // Real_type xj = half * ( x2[i] + x3[i] + // - x4[i] - x1[i] ) ; + Real_type xj = + half * (x2_view[i] + x3_view[i] - x4_view[i] - + x1_view[i]); // Real_type yi = half * ( y1[i] + y2[i] + // - y3[i] - y4[i] ) ; + Real_type yi = + half * (y1_view[i] + y2_view[i] - y3_view[i] - + y4_view[i]); // Real_type yj = half * ( y2[i] + y3[i] + // - y4[i] - y1[i] ) ; + Real_type yj = + half * (y2_view[i] + y3_view[i] - y4_view[i] - + y1_view[i]); // Real_type fxi = half * ( fx1[i] + fx2[i] + // - fx3[i] - fx4[i] ) ; + Real_type fxi = + half * (fx1_view[i] + fx2_view[i] - fx3_view[i] - + fx4_view[i]); // Real_type fxj = half * ( fx2[i] + + // fx3[i] - fx4[i] - fx1[i] ) ; + Real_type fxj = + half * (fx2_view[i] + fx3_view[i] - fx4_view[i] - + fx1_view[i]); // Real_type fyi = half * ( fy1[i] + + // fy2[i] - fy3[i] - fy4[i] ) ; + Real_type fyi = + half * (fy1_view[i] + fy2_view[i] - fy3_view[i] - + fy4_view[i]); // Real_type fyj =
half * ( fy2[i] + + // fy3[i] - fy4[i] - fy1[i] ) ; + Real_type fyj = + half * (fy2_view[i] + fy3_view[i] - fy4_view[i] - + fy1_view[i]); // Real_type rarea = 1.0 / ( xi * yj - xj + // * yi + ptiny ) ; + Real_type rarea = + 1.0 / + (xi * yj - xj * yi + + ptiny); // Real_type dfxdx = rarea * ( fxi * yj - fxj * yi ) ; + Real_type dfxdx = + rarea * (fxi * yj - fxj * yi); // Real_type dfydy = rarea * ( + // fyj * xi - fyi * xj ) ; + Real_type dfydy = + rarea * (fyj * xi - fyi * xj); /* Real_type affine = ( fy1[i] + + fy2[i] + fy3[i] + fy4[i] ) / \ + ( y1[i] + + y2[i] + y3[i] + y4[i] ) ; \ + */ + Real_type affine = + (fy1_view[i] + fy2_view[i] + fy3_view[i] + fy4_view[i]) / + (y1_view[i] + y2_view[i] + y3_view[i] + + y4_view[i]); // div[i] = dfxdx + dfydy + affine ; + div_view[i] = dfxdx + dfydy + affine; + } + + ); } -#endif // RUN_RAJA_SEQ + stopTimer(); - default : { - std::cout << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid << std::endl; - } + break; + } + //#endif // RUN_RAJA_SEQ + default: { + std::cout << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid + << std::endl; } + +} + +#endif // RUN_KOKKOS + + // moveDataToHostFromKokkosView(a, a_view, iend); + + moveDataToHostFromKokkosView(x, x_view, m_domain->nnalls); + moveDataToHostFromKokkosView(y, y_view, m_domain->nnalls); + moveDataToHostFromKokkosView(xdot, xdot_view, m_domain->nnalls); + moveDataToHostFromKokkosView(ydot, ydot_view, m_domain->nnalls); + moveDataToHostFromKokkosView(div, div_view, m_domain->nnalls); +/* + moveDataToHostFromKokkosView(x1, x1_view, iend); + moveDataToHostFromKokkosView(x2, x2_view, iend); + moveDataToHostFromKokkosView(x3, x3_view, iend); + moveDataToHostFromKokkosView(x4, x4_view, iend); + + moveDataToHostFromKokkosView(y1, y1_view, iend); + moveDataToHostFromKokkosView(y2, y2_view, iend); + moveDataToHostFromKokkosView(y3, y3_view, iend); + moveDataToHostFromKokkosView(y4, y4_view, iend); + + moveDataToHostFromKokkosView(fx1, fx1_view, iend); + 
moveDataToHostFromKokkosView(fx2, fx2_view, iend); + moveDataToHostFromKokkosView(fx3, fx3_view, iend); + moveDataToHostFromKokkosView(fx4, fx4_view, iend); + + moveDataToHostFromKokkosView(fy1, fy1_view, iend); + moveDataToHostFromKokkosView(fy2, fy2_view, iend); + moveDataToHostFromKokkosView(fy3, fy3_view, iend); + moveDataToHostFromKokkosView(fy4, fy4_view, iend); +*/ + } diff --git a/src/apps-kokkos/ENERGY-Kokkos.cpp b/src/apps-kokkos/ENERGY-Kokkos.cpp index 39a2f64e4..2dd43ccbd 100644 --- a/src/apps-kokkos/ENERGY-Kokkos.cpp +++ b/src/apps-kokkos/ENERGY-Kokkos.cpp @@ -20,14 +20,32 @@ namespace apps void ENERGY::runKokkosVariant(VariantID vid) { - //FIXME - return; const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); ENERGY_DATA_SETUP; + + // Instantiate Kokkos::Views + //auto a_view = getViewFromPointer(a, iend); + + auto e_new_view = getViewFromPointer(e_new, iend); + auto e_old_view = getViewFromPointer(e_old, iend); + auto delvc_view = getViewFromPointer(delvc, iend); + auto p_new_view = getViewFromPointer(p_new, iend); + auto p_old_view = getViewFromPointer(p_old, iend); + auto q_new_view = getViewFromPointer(q_new, iend); + auto q_old_view = getViewFromPointer(q_old, iend); + auto work_view = getViewFromPointer(work, iend); + auto compHalfStep_view = getViewFromPointer(compHalfStep, iend); + auto pHalfStep_view = getViewFromPointer(pHalfStep, iend); + auto bvc_view = getViewFromPointer(bvc, iend); + auto pbvc_view = getViewFromPointer(pbvc, iend); + auto ql_old_view = getViewFromPointer(ql_old, iend); + auto qq_old_view = getViewFromPointer(qq_old, iend); + auto vnewc_view = getViewFromPointer(vnewc, iend); + auto energy_lam1 = [=](Index_type i) { ENERGY_BODY1; @@ -48,6 +66,7 @@ void ENERGY::runKokkosVariant(VariantID vid) ENERGY_BODY6; }; +#if defined(RUN_KOKKOS) switch ( vid ) { case Base_Seq : { @@ -85,7 +104,6 @@ void ENERGY::runKokkosVariant(VariantID vid) break; } -#if 
defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); @@ -121,39 +139,126 @@ void ENERGY::runKokkosVariant(VariantID vid) break; } - case RAJA_Seq : { + case Kokkos_Lambda : { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::region( [=]() { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), energy_lam1); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), energy_lam2); + Kokkos::parallel_for("ENERGY - lambda 1", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(const int64_t i){ + // Lamda Body 1 + e_new_view[i] = e_old_view[i] - 0.5 * delvc_view[i] * \ + (p_old_view[i] + q_old_view[i]) + 0.5 * work_view[i]; + + }); + + Kokkos::parallel_for("ENERGY - lambda 2", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(const int64_t i){ + //#define ENERGY_BODY2 + if ( delvc_view[i] > 0.0 ) { + q_new_view[i] = 0.0 ; + } \ + else { \ + Real_type vhalf = 1.0 / (1.0 + compHalfStep_view[i]) ; + Real_type ssc = ( pbvc_view[i] * e_new_view[i] + + vhalf * vhalf * bvc_view[i] * pHalfStep_view[i] ) / rho0 ; + if ( ssc <= 0.1111111e-36 ) { + ssc = 0.3333333e-18 ; + } else { + ssc = sqrt(ssc) ; + } + q_new_view[i] = (ssc*ql_old_view[i] + qq_old_view[i]) ; + } + }); + + + Kokkos::parallel_for("ENERGY - lambda 3", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(const int64_t i){ + //#define ENERGY_BODY3 + + e_new_view[i] = e_new_view[i] + 0.5 * delvc_view[i] \ + * ( 3.0*(p_old_view[i] + q_old_view[i]) \ + - 4.0*(pHalfStep_view[i] + q_new_view[i])) ; + + + }); + + + + Kokkos::parallel_for("ENERGY - lambda 4", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(const int64_t i){ + //#define ENERGY_BODY4 + + e_new_view[i] += 0.5 * work_view[i]; \ + if ( fabs(e_new_view[i]) < e_cut ) { e_new_view[i] = 0.0 ; } \ + if ( e_new_view[i] < emin ) { e_new_view[i] = emin ; } + + }); + + + Kokkos::parallel_for("ENERGY - lambda 5", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(const int64_t i){ + //#define ENERGY_BODY5 + Real_type q_tilde ; \ + + if
(delvc_view[i] > 0.0) { \ + q_tilde = 0. ; \ + } \ + else { \ + Real_type ssc = ( pbvc_view[i] * e_new_view[i] \ + + vnewc_view[i] * vnewc_view[i] * bvc_view[i] * p_new_view[i] ) / rho0 ; \ + if ( ssc <= 0.1111111e-36 ) { \ + ssc = 0.3333333e-18 ; \ + } else { \ + ssc = sqrt(ssc) ; \ + } \ + q_tilde = (ssc*ql_old_view[i] + qq_old_view[i]) ; \ + } \ + e_new_view[i] = e_new_view[i] - ( 7.0*(p_old_view[i] + q_old_view[i]) \ + - 8.0*(pHalfStep_view[i] + q_new_view[i]) \ + + (p_new_view[i] + q_tilde)) * delvc_view[i] / 6.0 ; \ + if ( fabs(e_new_view[i]) < e_cut ) { \ + e_new_view[i] = 0.0 ; \ + } \ + if ( e_new_view[i] < emin ) { \ + e_new_view[i] = emin ; \ + } + - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), energy_lam3); + }); - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), energy_lam4); - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), energy_lam5); + Kokkos::parallel_for("ENERGY - lambda 6", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(const int64_t i){ + //#define ENERGY_BODY6 + + if ( delvc_view[i] <= 0.0 ) { \ + Real_type ssc = ( pbvc_view[i] * e_new_view[i] \ + + vnewc_view[i] * vnewc_view[i] * bvc_view[i] * p_new_view[i] ) / rho0 ; \ + if ( ssc <= 0.1111111e-36 ) { \ + ssc = 0.3333333e-18 ; \ + } else { \ + ssc = sqrt(ssc) ; \ + } \ + q_new_view[i] = (ssc*ql_old_view[i] + qq_old_view[i]) ; \ + if (fabs(q_new_view[i]) < q_cut) q_new_view[i] = 0.0 ; \ + } - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), energy_lam6); - }); // end sequential region (for single-source code) + }); } stopTimer(); break; } -#endif // RUN_RAJA_SEQ + default : { std::cout << "\n ENERGY : Unknown variant id = " << vid << std::endl; @@ -161,6 +266,26 @@ void ENERGY::runKokkosVariant(VariantID vid) } + +#endif // RUN_KOKKOS + + //moveDataToHostFromKokkosView(a, a_view, iend); + moveDataToHostFromKokkosView(e_new, e_new_view, iend); + moveDataToHostFromKokkosView(e_old, e_old_view, iend); + moveDataToHostFromKokkosView(delvc, delvc_view, iend); +
moveDataToHostFromKokkosView(p_new, p_new_view, iend); + moveDataToHostFromKokkosView(p_old, p_old_view, iend); + moveDataToHostFromKokkosView(q_new, q_new_view, iend); + moveDataToHostFromKokkosView(q_old, q_old_view, iend); // each host array is copied back from its own view + moveDataToHostFromKokkosView(work, work_view, iend); + moveDataToHostFromKokkosView(compHalfStep, compHalfStep_view, iend); + moveDataToHostFromKokkosView(pHalfStep, pHalfStep_view, iend); + moveDataToHostFromKokkosView(bvc, bvc_view, iend); + moveDataToHostFromKokkosView(pbvc, pbvc_view, iend); + moveDataToHostFromKokkosView(ql_old, ql_old_view, iend); + moveDataToHostFromKokkosView(qq_old, qq_old_view, iend); + moveDataToHostFromKokkosView(vnewc, vnewc_view, iend); + } } // end namespace apps diff --git a/src/apps-kokkos/FIR-Kokkos.cpp b/src/apps-kokkos/FIR-Kokkos.cpp index 322bd0210..82cd49cb1 100644 --- a/src/apps-kokkos/FIR-Kokkos.cpp +++ b/src/apps-kokkos/FIR-Kokkos.cpp @@ -21,23 +21,35 @@ namespace apps void FIR::runKokkosVariant(VariantID vid) { - // FIXME - return; + const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; const Index_type iend = getActualProblemSize() - m_coefflen; + // Macro for 1D Array of defined length of coefficients FIR_COEFF; + // Declare & initialize pointers, coefflen FIR_DATA_SETUP; + // Declare coeff array Real_type coeff[FIR_COEFFLEN]; + + + // std::copy(iterator source_first, iterator source_end, iterator target_start); + // Copy the "coeff_array" (in FIR.hpp) into the "coeff" array; both are + // "Real_type" std::copy(std::begin(coeff_array), std::end(coeff_array), std::begin(coeff)); + auto in_view = getViewFromPointer(in, iend + m_coefflen); + auto out_view = getViewFromPointer(out, iend + m_coefflen); + auto fir_lam = [=](Index_type i) { FIR_BODY; }; +#if defined(RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -55,7 +67,6 @@ void FIR::runKokkosVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); @@ -70,7 +81,7 @@ void
FIR::runKokkosVariant(VariantID vid) break; } - +/* case RAJA_Seq : { startTimer(); @@ -84,7 +95,35 @@ void FIR::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ + + */ + + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Kokkos::parallel_for("FIR - Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + // #define FIR_BODY + Real_type sum = 0.0; + + for (Index_type j = 0; j < coefflen; ++j ) { + sum += coeff[j]*in_view[i+j]; + } + out_view[i] = sum; + }); + + } + Kokkos::fence(); + stopTimer(); + + break; + } default : { std::cout << "\n FIR : Unknown variant id = " << vid << std::endl; @@ -92,6 +131,12 @@ void FIR::runKokkosVariant(VariantID vid) } +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(in, in_view, iend + m_coefflen); + moveDataToHostFromKokkosView(out, out_view, iend + m_coefflen); + + } } // end namespace apps diff --git a/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp b/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp index 70f4216a4..097599425 100644 --- a/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp +++ b/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp @@ -21,12 +21,14 @@ namespace apps void HALOEXCHANGE::runKokkosVariant(VariantID vid) { //FIXME - return; + //return; const Index_type run_reps = getRunReps(); HALOEXCHANGE_DATA_SETUP; +#if defined(RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -66,7 +68,6 @@ void HALOEXCHANGE::runKokkosVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); @@ -109,7 +110,7 @@ void HALOEXCHANGE::runKokkosVariant(VariantID vid) break; } - +/* case RAJA_Seq : { using EXEC_POL = RAJA::loop_exec; @@ -154,7 +155,54 @@ void HALOEXCHANGE::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ + +*/ + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type l = 0; l < num_neighbors; ++l) { + 
Real_ptr buffer = buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_pack_base_lam = KOKKOS_LAMBDA(Index_type i) { + HALOEXCHANGE_PACK_BODY; + }; + +Kokkos::parallel_for("HALOEXCHANGE - Pack Body - Kokkos Lambda", + Kokkos::RangePolicy(0, len), + haloexchange_pack_base_lam); + buffer += len; + } + } + + for (Index_type l = 0; l < num_neighbors; ++l) { + Real_ptr buffer = buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + for (Index_type v = 0; v < num_vars; ++v) { + Real_ptr var = vars[v]; + auto haloexchange_unpack_base_lam = KOKKOS_LAMBDA(Index_type i) { + HALOEXCHANGE_UNPACK_BODY; + }; + + Kokkos::parallel_for("HALOEXCHANGE - Unpack Body - Kokkos Lambda", + Kokkos::RangePolicy(0, len), + haloexchange_unpack_base_lam); + buffer += len; + } + } + + } + Kokkos::fence(); + stopTimer(); + break; + } default : { std::cout << "\n HALOEXCHANGE : Unknown variant id = " << vid << std::endl; @@ -162,6 +210,10 @@ void HALOEXCHANGE::runKokkosVariant(VariantID vid) } +#endif // RUN_KOKKOS + + + } } // end namespace apps diff --git a/src/apps-kokkos/LTIMES-Kokkos.cpp b/src/apps-kokkos/LTIMES-Kokkos.cpp index 39152d23d..f2a8cd65c 100644 --- a/src/apps-kokkos/LTIMES-Kokkos.cpp +++ b/src/apps-kokkos/LTIMES-Kokkos.cpp @@ -21,12 +21,18 @@ namespace apps void LTIMES::runKokkosVariant(VariantID vid) { // FIXME - return; + //return; const Index_type run_reps = getRunReps(); LTIMES_DATA_SETUP; + auto phi = getViewFromPointer(phidat, num_z, num_g, num_m); + auto psi = getViewFromPointer(psidat, num_z, num_g, num_d); + auto ell = getViewFromPointer(elldat, num_m, num_d); + +#if defined (RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -50,7 +56,6 @@ void LTIMES::runKokkosVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { auto ltimes_base_lam = [=](Index_type d, 
Index_type z, @@ -76,7 +81,7 @@ void LTIMES::runKokkosVariant(VariantID vid) break; } - +/* case RAJA_Seq : { LTIMES_VIEWS_RANGES_RAJA; @@ -114,7 +119,39 @@ void LTIMES::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ +*/ + + case Kokkos_Lambda : { + + //LTIMES_VIEWS_RANGES_RAJA; + + + + // Kokkos uses MDRange to model tightly-nested loops + + + Kokkos::fence(); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + Kokkos::parallel_for("LTIMES", + Kokkos::MDRangePolicy>({0,0,0,0},{num_z, num_g, num_m, num_d}), + KOKKOS_LAMBDA(int64_t z, int64_t g, int64_t m, int64_t d) { + // #define LTIMES_BODY_RAJA \ + // phi(z, g, m) += ell(m, d) * psi(z, g, d); + // make view named phi from phi dat + phi(z, g, m) += ell(m, d) * psi(z, g, d); + + +}); + + + + } + Kokkos::fence(); + stopTimer(); + + break; + } default : { std::cout << "\n LTIMES : Unknown variant id = " << vid << std::endl; @@ -122,6 +159,14 @@ void LTIMES::runKokkosVariant(VariantID vid) } +#endif // RUN_KOKKOS + +// moveDataToHostFromKokkosView(a, a_view, iend); + moveDataToHostFromKokkosView(phidat, phi, num_z, num_g, num_m); + moveDataToHostFromKokkosView(psidat, psi, num_z, num_g, num_d); + moveDataToHostFromKokkosView(elldat, ell, num_m, num_d); + + } } // end namespace apps diff --git a/src/apps-kokkos/PRESSURE-Kokkos.cpp b/src/apps-kokkos/PRESSURE-Kokkos.cpp index c3b7cad38..e181a8764 100644 --- a/src/apps-kokkos/PRESSURE-Kokkos.cpp +++ b/src/apps-kokkos/PRESSURE-Kokkos.cpp @@ -20,8 +20,6 @@ namespace apps void PRESSURE::runKokkosVariant(VariantID vid) { - // FIXME - return; const Index_type run_reps = getRunReps(); const Index_type ibegin = 0; @@ -29,6 +27,18 @@ void PRESSURE::runKokkosVariant(VariantID vid) PRESSURE_DATA_SETUP; + // Real_ptr compression = m_compression; \ + // Real_ptr bvc = m_bvc; \ + // Real_ptr p_new = m_p_new; \ + // Real_ptr e_old = m_e_old; \ + // Real_ptr vnewc = m_vnewc; \ + + auto compression_view = getViewFromPointer(compression, iend); + 
auto bvc_view = getViewFromPointer(bvc, iend); + auto p_new_view = getViewFromPointer(p_new, iend); + auto e_old_view = getViewFromPointer(e_old, iend); + auto vnewc_view = getViewFromPointer(vnewc, iend); + auto pressure_lam1 = [=](Index_type i) { PRESSURE_BODY1; }; @@ -36,6 +46,8 @@ void PRESSURE::runKokkosVariant(VariantID vid) PRESSURE_BODY2; }; +#if defined(RUN_KOKKOS) + switch ( vid ) { case Base_Seq : { @@ -57,7 +69,6 @@ void PRESSURE::runKokkosVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); @@ -76,7 +87,7 @@ void PRESSURE::runKokkosVariant(VariantID vid) break; } - +/* case RAJA_Seq : { startTimer(); @@ -97,7 +108,52 @@ void PRESSURE::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ + */ + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + // CRT : Look at Kokkos graphs as an implementation for kernel + // seq_region - create a sequential region + // Intent: two loop bodies will be executed consecutively + // https://raja.readthedocs.io/en/v0.9.0/feature/policies.html?highlight=seq_region#parallel-region-policies + // The sequential region specialization is essentially a pass through operation. + // It is provided so that if you want to turn off OpenMP in your code, + // you can simply replace the region policy type and you do not have to change your algorithm source code. 
+ + + Kokkos::parallel_for("PRESSURE_BODY1 - Kokkos_Lambda", + Kokkos::RangePolicy(ibegin,iend), + KOKKOS_LAMBDA(Index_type i) { + // #define PRESSURE_BODY1 + // bvc[i] = cls * (compression[i] + 1.0); + bvc_view[i] = cls * (compression_view[i] + 1.0); + + }); + + + + Kokkos::parallel_for("PRESSURE_BODY2 - Kokkos_Lambda", + Kokkos::RangePolicy(ibegin,iend), + KOKKOS_LAMBDA(Index_type i) { + // #define PRESSURE_BODY2 + p_new_view[i] = bvc_view[i] * e_old_view[i] ; + if ( fabs(p_new_view[i]) < p_cut ) p_new_view[i] = 0.0 ; + if ( vnewc_view[i] >= eosvmax ) p_new_view[i] = 0.0 ; + if ( p_new_view[i] < pmin ) p_new_view[i] = pmin ; + }); + + + } + Kokkos::fence(); + stopTimer(); + + break; + } + default : { std::cout << "\n PRESSURE : Unknown variant id = " << vid << std::endl; @@ -105,6 +161,14 @@ void PRESSURE::runKokkosVariant(VariantID vid) } +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(compression, compression_view, iend); + moveDataToHostFromKokkosView(bvc, bvc_view, iend); + moveDataToHostFromKokkosView(p_new, p_new_view, iend); + moveDataToHostFromKokkosView(e_old, e_old_view, iend); + moveDataToHostFromKokkosView(vnewc, vnewc_view, iend); + } } // end namespace apps diff --git a/src/apps-kokkos/VOL3D-Kokkos.cpp b/src/apps-kokkos/VOL3D-Kokkos.cpp index 7194cce22..546fc38fa 100644 --- a/src/apps-kokkos/VOL3D-Kokkos.cpp +++ b/src/apps-kokkos/VOL3D-Kokkos.cpp @@ -20,10 +20,36 @@ namespace apps { +struct arrayOffSetStruct3D { + using ViewType = Kokkos::View; + + // v's are offsets of indices + ViewType v, v0, v1, v2, v3, v4, v5, v6, v7; + + // constructor + arrayOffSetStruct3D(const std::string& name, + Index_type num_elements, + Index_type jp, + Index_type kp, + Real_ptr head + ): + // ":" = list of things to initialize + // Initialize v + v (getViewFromPointer(head, num_elements)), + v0(v), + v1(Kokkos::subview(v0, std::make_pair(static_cast(1), v0.extent(0)))), + v2(Kokkos::subview(v0, std::make_pair(static_cast(jp), v0.extent(0)))), + 
v3(Kokkos::subview(v1, std::make_pair(static_cast(jp), v1.extent(0)))), + v4(Kokkos::subview(v0, std::make_pair(static_cast(kp), v0.extent(0)))), + v5(Kokkos::subview(v1, std::make_pair(static_cast(kp), v1.extent(0)))), + v6(Kokkos::subview(v2, std::make_pair(static_cast(kp), v2.extent(0)))), + v7(Kokkos::subview(v3, std::make_pair(static_cast(kp), v3.extent(0)))) { + } +}; + + void VOL3D::runKokkosVariant(VariantID vid) { - // FIXME - return; const Index_type run_reps = getRunReps(); const Index_type ibegin = m_domain->fpz; @@ -35,6 +61,44 @@ void VOL3D::runKokkosVariant(VariantID vid) NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; + // not sure about the 'ibegin, iend' here: + auto vol_view = getViewFromPointer(vol, m_domain->nnalls); + + arrayOffSetStruct3D x_offsets("x_offsets", m_domain->nnalls, m_domain->jp, m_domain->kp, x); + arrayOffSetStruct3D y_offsets("y_offsets", m_domain->nnalls, m_domain->jp, m_domain->kp, y); + arrayOffSetStruct3D z_offsets("z_offsets", m_domain->nnalls, m_domain->jp, m_domain->kp, z); + + auto& x_view = x_offsets.v; + auto& x0_view = x_offsets.v0; + auto& x1_view = x_offsets.v1; + auto& x2_view = x_offsets.v2; + auto& x3_view = x_offsets.v3; + auto& x4_view = x_offsets.v4; + auto& x5_view = x_offsets.v5; + auto& x6_view = x_offsets.v6; + auto& x7_view = x_offsets.v7; + + auto& y_view = y_offsets.v; + auto& y0_view = y_offsets.v0; + auto& y1_view = y_offsets.v1; + auto& y2_view = y_offsets.v2; + auto& y3_view = y_offsets.v3; + auto& y4_view = y_offsets.v4; + auto& y5_view = y_offsets.v5; + auto& y6_view = y_offsets.v6; + auto& y7_view = y_offsets.v7; + + auto& z_view = z_offsets.v; + auto& z0_view = z_offsets.v0; + auto& z1_view = z_offsets.v1; + auto& z2_view = z_offsets.v2; + auto& z3_view = z_offsets.v3; + auto& z4_view = z_offsets.v4; + auto& z5_view = z_offsets.v5; + auto& z6_view = z_offsets.v6; + auto& z7_view = z_offsets.v7; + + auto 
vol3d_lam = [=](Index_type i) { VOL3D_BODY; }; @@ -71,7 +135,7 @@ void VOL3D::runKokkosVariant(VariantID vid) break; } - +/* case RAJA_Seq : { startTimer(); @@ -85,7 +149,81 @@ void VOL3D::runKokkosVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ +*/ + case Kokkos_Lambda : { + + startTimer(); + + //auto index_list = getViewFromPointer(m_domain->real_zones, m_domain->n_real_zones); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Kokkos::parallel_for( + "VOL3D Kokkos_Lambda", + Kokkos::RangePolicy(ibegin,iend), + KOKKOS_LAMBDA(Index_type i ) { + + // #define VOL3D_BODY + //int i = index_list[ii]; + + Real_type x71 = x7_view[i] - x1_view[i] ; \ + Real_type x72 = x7_view[i] - x2_view[i] ; \ + Real_type x74 = x7_view[i] - x4_view[i] ; \ + Real_type x30 = x3_view[i] - x0_view[i] ; \ + Real_type x50 = x5_view[i] - x0_view[i] ; \ + Real_type x60 = x6_view[i] - x0_view[i] ; \ + + Real_type y71 = y7_view[i] - y1_view[i] ; \ + Real_type y72 = y7_view[i] - y2_view[i] ; \ + Real_type y74 = y7_view[i] - y4_view[i] ; \ + Real_type y30 = y3_view[i] - y0_view[i] ; \ + Real_type y50 = y5_view[i] - y0_view[i] ; \ + Real_type y60 = y6_view[i] - y0_view[i] ; \ + + Real_type z71 = z7_view[i] - z1_view[i] ; \ + Real_type z72 = z7_view[i] - z2_view[i] ; \ + Real_type z74 = z7_view[i] - z4_view[i] ; \ + Real_type z30 = z3_view[i] - z0_view[i] ; \ + Real_type z50 = z5_view[i] - z0_view[i] ; \ + Real_type z60 = z6_view[i] - z0_view[i] ; \ + + Real_type xps = x71 + x60 ; \ + Real_type yps = y71 + y60 ; \ + Real_type zps = z71 + z60 ; \ + + Real_type cyz = y72 * z30 - z72 * y30 ; \ + Real_type czx = z72 * x30 - x72 * z30 ; \ + Real_type cxy = x72 * y30 - y72 * x30 ; \ + vol_view[i] = xps * cyz + yps * czx + zps * cxy ; \ + + xps = x72 + x50 ; \ + yps = y72 + y50 ; \ + zps = z72 + z50 ; \ + + cyz = y74 * z60 - z74 * y60 ; \ + czx = z74 * x60 - x74 * z60 ; \ + cxy = x74 * y60 - y74 * x60 ; \ + vol_view[i] += xps * cyz + yps * czx + zps * cxy ; \ + + xps = x74 + x30 ; \ +
yps = y74 + y30 ; \ + zps = z74 + z30 ; \ + + cyz = y71 * z50 - z71 * y50 ; \ + czx = z71 * x50 - x71 * z50 ; \ + cxy = x71 * y50 - y71 * x50 ; \ + vol_view[i] += xps * cyz + yps * czx + zps * cxy ; \ + + vol_view[i] *= vnormq ; + } + ); + + } + stopTimer(); + + break; + } default : { std::cout << "\n VOL3D : Unknown variant id = " << vid << std::endl; @@ -93,6 +231,14 @@ void VOL3D::runKokkosVariant(VariantID vid) } +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(x, x_view, m_domain->nnalls); + moveDataToHostFromKokkosView(y, y_view, m_domain->nnalls); + moveDataToHostFromKokkosView(z, z_view, m_domain->nnalls); + moveDataToHostFromKokkosView(vol, vol_view, m_domain->nnalls); + + } } // end namespace apps diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp index cb8526bf8..2a7e6ab65 100644 --- a/src/apps/AppsData.hpp +++ b/src/apps/AppsData.hpp @@ -18,7 +18,8 @@ namespace apps // // Some macros used in kernels to mimic real app code style.
-// + +// For VOL-3D #define NDPTRSET(jp, kp,v,v0,v1,v2,v3,v4,v5,v6,v7) \ v0 = v ; \ v1 = v0 + 1 ; \ @@ -29,6 +30,7 @@ namespace apps v6 = v2 + kp ; \ v7 = v3 + kp ; +// For DEL_DOT_VEC_2D #define NDSET2D(jp,v,v1,v2,v3,v4) \ v4 = v ; \ v1 = v4 + 1 ; \ diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index e814f91e5..8c8b97131 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -62,6 +62,8 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D() diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index 29a8d4be0..153722e1d 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -62,6 +62,7 @@ ENERGY::ENERGY(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Kokkos_Lambda ); } ENERGY::~ENERGY() diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index 1b85fc0b6..8884f964a 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -24,6 +24,9 @@ FIR::FIR(const RunParams& params) setDefaultProblemSize(1000000); setDefaultReps(160); + //setDefaultProblemSize(10); + //setDefaultReps(1); + m_coefflen = FIR_COEFFLEN; setActualProblemSize( getTargetProblemSize() ); @@ -52,6 +55,8 @@ FIR::FIR(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } FIR::~FIR() diff --git a/src/apps/HALOEXCHANGE.cpp b/src/apps/HALOEXCHANGE.cpp index d826f2fbc..55a99fc6c 100644 --- a/src/apps/HALOEXCHANGE.cpp +++ b/src/apps/HALOEXCHANGE.cpp @@ -98,6 +98,7 @@ HALOEXCHANGE::HALOEXCHANGE(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + setVariantDefined( Kokkos_Lambda ); } HALOEXCHANGE::~HALOEXCHANGE() diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp index fa7542fc0..b5c1de32f 100644 --- 
a/src/apps/LTIMES.cpp +++ b/src/apps/LTIMES.cpp @@ -73,6 +73,8 @@ LTIMES::LTIMES(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } LTIMES::~LTIMES() diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index efa921a94..7e77a9633 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -52,6 +52,8 @@ PRESSURE::PRESSURE(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } PRESSURE::~PRESSURE() diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 47635cb0b..7ddf2e42b 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -60,6 +60,8 @@ VOL3D::VOL3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); } VOL3D::~VOL3D() diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 7ca22c2b5..cf4707000 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -159,12 +159,10 @@ namespace rajaperf { free_register_kernel(exec, "Apps", new apps::LTIMES_NOVIEW(run_params)); free_register_kernel(exec, "Apps", new apps::PRESSURE(run_params)); free_register_kernel(exec, "Apps", new apps::VOL3D(run_params)); - -/** // Algorithm free_register_kernel(exec, "Algorithm", new algorithm::SORT(run_params)); free_register_kernel(exec, "Algorithm", new algorithm::SORTPAIRS(run_params)); - */ + } /*! @@ -276,11 +274,10 @@ namespace rajaperf { std::string("Apps_PRESSURE"), std::string("Apps_VOL3D"), -// // Algorithm kernels... // -// std::string("Algorithm_SORT"), -// std::string("Algorithm_SORTPAIRS"), + std::string("Algorithm_SORT"), + std::string("Algorithm_SORTPAIRS"), std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... 
diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 7e0ef956f..c5d9a8d24 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -387,8 +387,8 @@ enum KernelID { // // Algorithm kernels... // -// Algorithm_SORT, -// Algorithm_SORTPAIRS, + Algorithm_SORT, + Algorithm_SORTPAIRS, NumKernels // Keep this one last and NEVER comment out (!!) diff --git a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp index 8cfae031b..66dcfb0a4 100644 --- a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp +++ b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp @@ -18,6 +18,30 @@ namespace lcals { +template +void diff_predict_helper(Index_type run_reps, + Index_type ibegin, + Index_type iend, + Index_type offset, + // a Kokkos View + px_type& px, + // a Kokkos View + cx_type& cx){ + + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Kokkos::parallel_for("DIFF_PREDICT_Kokkos Kokkos_Lambda", + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + DIFF_PREDICT_BODY + }); + + } +} + + + void DIFF_PREDICT::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -27,11 +51,12 @@ void DIFF_PREDICT::runKokkosVariant(VariantID vid) DIFF_PREDICT_DATA_SETUP; + // Instiating KokkosViews using getViewFromPointer; // Wrapping pointers in KokkosViews // You need to know the actual array size here to catch errors; - // + auto px_view = getViewFromPointer(px, iend*14); auto cx_view = getViewFromPointer(cx, iend*14); @@ -91,81 +116,26 @@ void DIFF_PREDICT::runKokkosVariant(VariantID vid) } */ + + // Kokkos-ifying here: // case Kokkos_Lambda : { - - // Define ar, br cr because you are not using the DIFF_PREDICT_BODY - Kokkos::fence(); startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Kokkos::parallel_for("DIFF_PREDICT_Kokkos Kokkos_Lambda", -/* -(gdb) p offset -$1 = 100000 -(gdb) -$2 = 100000 -(gdb) p iend -$3 = 100000 -*/ + diff_predict_helper( run_reps, + 
ibegin, + iend, + offset, + px_view, + cx_view); - Kokkos::RangePolicy(ibegin, iend), - KOKKOS_LAMBDA(Index_type i) { - // DIFF_PREDICT_BODY definition in - // DIFF_PREDICT.hpp: - /* - ar = cx[i + offset * 4]; \ - br = ar - px[i + offset * 4]; \ - px[i + offset * 4] = ar; \ - cr = br - px[i + offset * 5]; \ - px[i + offset * 5] = br; \ - ar = cr - px[i + offset * 6]; \ - px[i + offset * 6] = cr; \ - br = ar - px[i + offset * 7]; \ - px[i + offset * 7] = ar; \ - cr = br - px[i + offset * 8]; \ - px[i + offset * 8] = br; \ - ar = cr - px[i + offset * 9]; \ - px[i + offset * 9] = cr; \ - br = ar - px[i + offset * 10]; \ - px[i + offset * 10] = ar; \ - cr = br - px[i + offset * 11]; \ - px[i + offset * 11] = br; \ - px[i + offset * 13] = cr - px[i + offset * 12]; \ - px[i + offset * 12] = cr; - - */ - - Real_type ar, br, cr; - ar = cx_view[i + offset * 4]; \ - br = ar - px_view[i + offset * 4]; \ - px_view[i + offset * 4] = ar; \ - cr = br - px_view[i + offset * 5]; \ - px_view[i + offset * 5] = br; \ - ar = cr - px_view[i + offset * 6]; \ - px_view[i + offset * 6] = cr; \ - br = ar - px_view[i + offset * 7]; \ - px_view[i + offset * 7] = ar; \ - cr = br - px_view[i + offset * 8]; \ - px_view[i + offset * 8] = br; \ - ar = cr - px_view[i + offset * 9]; \ - px_view[i + offset * 9] = cr; \ - br = ar - px_view[i + offset * 10]; \ - px_view[i + offset * 10] = ar; \ - cr = br - px_view[i + offset * 11]; \ - px_view[i + offset * 11] = br; \ - px_view[i + offset * 13] = cr - px_view[i + offset * 12]; \ - px_view[i + offset * 12] = cr; - }); - } Kokkos::fence(); stopTimer(); - break; + } diff --git a/src/stream-kokkos/ADD-Kokkos.cpp b/src/stream-kokkos/ADD-Kokkos.cpp index 10171b1dc..c89f990f1 100644 --- a/src/stream-kokkos/ADD-Kokkos.cpp +++ b/src/stream-kokkos/ADD-Kokkos.cpp @@ -19,13 +19,6 @@ namespace stream // Start Kokkos-ifying here: // Nota bene: the original RAJAPerf Suite code left for reference - /* -void ADD::runSeqVariant(VariantID vid) -{ - const Index_type 
run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); -*/ void ADD::runKokkosVariant(VariantID vid) { diff --git a/src/stream-kokkos/DOT-Kokkos.cpp b/src/stream-kokkos/DOT-Kokkos.cpp index 7b6519768..0a7f1e461 100644 --- a/src/stream-kokkos/DOT-Kokkos.cpp +++ b/src/stream-kokkos/DOT-Kokkos.cpp @@ -17,14 +17,6 @@ namespace rajaperf namespace stream { -/* -void DOT::runSeqVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); -*/ - void DOT::runKokkosVariant(VariantID vid) { @@ -39,10 +31,6 @@ void DOT::runKokkosVariant(VariantID vid) { // Instantiation of pointer - wrapped views: auto a_view = getViewFromPointer(a, iend); auto b_view = getViewFromPointer(b, iend); - // - // From basic-kokkos - REDUCE3 - // Instantiation of a view from a pointer to a vector - // auto vec_view = getViewFromPointer(vec, iend); @@ -70,7 +58,6 @@ void DOT::runKokkosVariant(VariantID vid) { break; } -// #if defined(RUN_RAJA_SEQ) case Lambda_Seq : { auto dot_base_lam = [=](Index_type i) -> Real_type { @@ -152,9 +139,6 @@ void DOT::runKokkosVariant(VariantID vid) { } - -// #endif // RUN_RAJA_SEQ - default : { std::cout << "\n DOT : Unknown variant id = " << vid << std::endl; } @@ -165,11 +149,8 @@ void DOT::runKokkosVariant(VariantID vid) { #endif // RUN_KOKKOS std::cout << " FIX ME STREAM DOT -- GET DATA FROM VIEWS " << std::endl; - //moveDataToHostFromKokkosView(a, a_view, iend); - //moveDataToHostFromKokkosView(b, b_view, iend); - - // From REDUCE3-INT - // moveDataToHostFromKokkosView(vec, vec_view, iend); + moveDataToHostFromKokkosView(a, a_view, iend); + moveDataToHostFromKokkosView(b, b_view, iend); } From 1fd7f0fe0003d8b7de9ee7ddb66f8a0d0fc4c12c Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 4 Oct 2021 10:52:18 -0600 Subject: [PATCH 110/124] kokkos-algorithms: Kokkos translation --- src/algorithm-kokkos/CMakeLists.txt | 17 ++++ 
src/algorithm-kokkos/SORT-Kokkos.cpp | 97 ++++++++++++++++++ src/algorithm-kokkos/SORTPAIRS-Kokkos.cpp | 117 ++++++++++++++++++++++ 3 files changed, 231 insertions(+) create mode 100644 src/algorithm-kokkos/CMakeLists.txt create mode 100644 src/algorithm-kokkos/SORT-Kokkos.cpp create mode 100644 src/algorithm-kokkos/SORTPAIRS-Kokkos.cpp diff --git a/src/algorithm-kokkos/CMakeLists.txt b/src/algorithm-kokkos/CMakeLists.txt new file mode 100644 index 000000000..78b10b113 --- /dev/null +++ b/src/algorithm-kokkos/CMakeLists.txt @@ -0,0 +1,17 @@ +############################################################################### +# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/COPYRIGHT file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../algorithm) + + +blt_add_library( + NAME algorithm-kokkos + SOURCES SORT-Kokkos.cpp + SORTPAIRS-Kokkos.cpp + DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} + ) diff --git a/src/algorithm-kokkos/SORT-Kokkos.cpp b/src/algorithm-kokkos/SORT-Kokkos.cpp new file mode 100644 index 000000000..e13f4d0c9 --- /dev/null +++ b/src/algorithm-kokkos/SORT-Kokkos.cpp @@ -0,0 +1,97 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/COPYRIGHT file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SORT.hpp" +#include + + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void SORT::runKokkosVariant(VariantID vid) +{ + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + SORT_DATA_SETUP; + + // Instantiate Kokkos Views + + auto x_view = getViewFromPointer(x, iend*run_reps); + +#if defined (RUN_KOKKOS) + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + std::sort(STD_SORT_ARGS); + + } + stopTimer(); + + break; + } +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::sort(RAJA_SORT_ARGS); + + } + stopTimer(); + + break; + } +*/ + + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + //#define STD_SORT_ARGS vs. using RAJAPerf Suite expression + //x + iend*irep + ibegin, x + iend*irep + iend + + Kokkos::sort(x_view, iend*irep + ibegin, iend*irep + iend); + } + Kokkos::fence(); + stopTimer(); + + break; + } + + default : { + std::cout << "\n SORT : Unknown variant id = " << vid << std::endl; + } + + } + +#endif // RUN_KOKKOS + + moveDataToHostFromKokkosView(x, x_view, iend*run_reps); + + +} + +} // end namespace algorithm +} // end namespace rajaperf diff --git a/src/algorithm-kokkos/SORTPAIRS-Kokkos.cpp b/src/algorithm-kokkos/SORTPAIRS-Kokkos.cpp new file mode 100644 index 000000000..85015ff7b --- /dev/null +++ b/src/algorithm-kokkos/SORTPAIRS-Kokkos.cpp @@ -0,0 +1,117 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. 
+// See the RAJAPerf/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SORTPAIRS.hpp" + +#include "RAJA/RAJA.hpp" + +#include +#include +#include +#include + +namespace rajaperf +{ +namespace algorithm +{ + + +void SORTPAIRS::runKokkosVariant(VariantID vid) +{ + // FIXME + return; + + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + SORTPAIRS_DATA_SETUP; + + // instatiate Kokkos Views + // auto x_view = getViewFromPointer(x, iend); + //auto i_view = getViewFromPointer(i, iend); + +#if defined (RUN_KOKKOS) + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + using pair_type = std::pair; + + std::vector vector_of_pairs; + vector_of_pairs.reserve(iend-ibegin); + + for (Index_type iemp = ibegin; iemp < iend; ++iemp) { + vector_of_pairs.emplace_back(x[iend*irep + iemp], i[iend*irep + iemp]); + } + + std::sort(vector_of_pairs.begin(), vector_of_pairs.end(), + [](pair_type const& lhs, pair_type const& rhs) { + return lhs.first < rhs.first; + }); + + for (Index_type iemp = ibegin; iemp < iend; ++iemp) { + pair_type& pair = vector_of_pairs[iemp - ibegin]; + x[iend*irep + iemp] = pair.first; + i[iend*irep + iemp] = pair.second; + } + + } + stopTimer(); + + break; + } +/* + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::sort_pairs(RAJA_SORTPAIRS_ARGS); + + } + stopTimer(); + + break; + } + */ +/* + case Kokkos_Lambda : { + + Kokkos::fence(); + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + //RAJA::sort_pairs(RAJA_SORTPAIRS_ARGS); + + + }); + } + Kokkos::fence(); + stopTimer(); + + break; + } +*/ + default : { + std::cout << "\n SORTPAIRS : Unknown variant id = " << vid << std::endl; + } + + } +#endif // RUN_KOKKOS + + 
//moveDataToHostFromKokkosView(x, x_view, iend); + //moveDataToHostFromKokkosView(i, i_view, iend); + +} + +} // end namespace algorithm +} // end namespace rajaperf From 8e56aa4f99a826990a6329e1ceea60cd69971906 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 6 Oct 2021 10:46:13 -0600 Subject: [PATCH 111/124] VOL3D-Kokkos.cpp: Fix #ifdefs, #endifs --- src/apps-kokkos/VOL3D-Kokkos.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps-kokkos/VOL3D-Kokkos.cpp b/src/apps-kokkos/VOL3D-Kokkos.cpp index 546fc38fa..70f08819b 100644 --- a/src/apps-kokkos/VOL3D-Kokkos.cpp +++ b/src/apps-kokkos/VOL3D-Kokkos.cpp @@ -34,7 +34,7 @@ struct arrayOffSetStruct3D { Real_ptr head ): // ":" = list of things to initialize - // Initialize v + // Initialize v v (getViewFromPointer(head, num_elements)), v0(v), v1(Kokkos::subview(v0, std::make_pair(static_cast(1), v0.extent(0)))), @@ -103,6 +103,7 @@ void VOL3D::runKokkosVariant(VariantID vid) VOL3D_BODY; }; +#if defined(RUN_KOKKOS) switch ( vid ) { case Base_Seq : { @@ -120,7 +121,6 @@ void VOL3D::runKokkosVariant(VariantID vid) break; } -#if defined(RUN_RAJA_SEQ) case Lambda_Seq : { startTimer(); From ff48c3a3a96b0d6c8aeb67a539bbf49a1a996415 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Fri, 8 Oct 2021 14:20:07 -0600 Subject: [PATCH 112/124] HALOEXCHANGE-Kokkos.cpp: Kokkos translation --- src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp | 180 ++++++++++++------------ 1 file changed, 92 insertions(+), 88 deletions(-) diff --git a/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp b/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp index 097599425..0aa022dfb 100644 --- a/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp +++ b/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp @@ -24,92 +24,54 @@ void HALOEXCHANGE::runKokkosVariant(VariantID vid) //return; const Index_type run_reps = getRunReps(); + // Nota bene: ibegin, iend not defined for this kernel + // Instead: + // Index_type num_neighbors = s_num_neighbors; + // Index_type num_vars = m_num_vars; + 
// How these variables are set:: + // apps/HALOEXCHANGE.cpp: m_num_vars_default = 3; + // apps/HALOEXCHANGE.hpp: static const int s_num_neighbors = 26; + + //HALOEXCHANGE_DATA_SETUP; + +// Declare and define Kokkos Views +// Preserving the names of the pointer variables to avoid typo errors in the +// Kokkos_Lambda expressions +// +//auto vars_view = getViewFromPointer(vars, num_neighbors); +//auto buffers_view = getViewFromPointer(buffers, num_neighbors); - HALOEXCHANGE_DATA_SETUP; - -#if defined(RUN_KOKKOS) - - switch ( vid ) { - - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { +std::vector> vars; +std::vector> buffers; +std::vector> pack_index_lists; +std::vector> unpack_index_lists; - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_PACK_BODY; - } - buffer += len; - } - } +for (auto var: m_vars) { + vars.push_back(getViewFromPointer(var, m_var_size)); +} - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - for (Index_type i = 0; i < len; i++) { - HALOEXCHANGE_UNPACK_BODY; - } - buffer += len; - } - } +for ( int x = 0; x < m_buffers.size(); ++x ) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[x]; + buffers.push_back(getViewFromPointer(m_buffers[x], buffer_len)); +} - } - stopTimer(); - break; - } +for ( int x = 0; x < m_pack_index_lists.size(); ++x ) { - case Lambda_Seq : { + pack_index_lists.push_back(getViewFromPointer(m_pack_index_lists[x], m_pack_index_list_lengths[x])); +} - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for 
(Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; - }; - for (Index_type i = 0; i < len; i++) { - haloexchange_pack_base_lam(i); - } - buffer += len; - } - } +for ( int x = 0; x < m_unpack_index_lists.size(); ++x ) { - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; - }; - for (Index_type i = 0; i < len; i++) { - haloexchange_unpack_base_lam(i); - } - buffer += len; - } - } + unpack_index_lists.push_back(getViewFromPointer(m_unpack_index_lists[x], m_unpack_index_list_lengths[x])); +} +auto num_neighbors = s_num_neighbors; +auto num_vars = m_num_vars; - } - stopTimer(); +#if defined(RUN_KOKKOS) - break; - } + switch ( vid ) { /* case RAJA_Seq : { @@ -163,38 +125,54 @@ void HALOEXCHANGE::runKokkosVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - + // FYI: num_neigbors defined in HALOEXCHANGE.hpp + // num_neighbors is set in HALOEXCHANGE.cpp for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; + auto buffer = buffers[l]; + auto list = pack_index_lists[l]; + Index_type len = m_pack_index_list_lengths[l]; + // FYI: num_vars defined in HALOEXCHANGE.hpp + // num_vars is set in HALOEXCHANGE.cpp for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; + auto var = vars[v]; auto haloexchange_pack_base_lam = KOKKOS_LAMBDA(Index_type i) { - HALOEXCHANGE_PACK_BODY; + // 
HALOEXCHANGE_PACK_BODY + // #define HALOEXCHANGE_PACK_BODY \ + // buffer[i] = var[list[i]]; + buffer[i] = var[list[i]]; }; Kokkos::parallel_for("HALOEXCHANGE - Pack Body - Kokkos Lambda", Kokkos::RangePolicy(0, len), haloexchange_pack_base_lam); - buffer += len; + //buffer += len + + auto end = buffer.extent(0); + decltype(end) begin = len; + buffer = Kokkos::subview(buffer, std::make_pair(begin,end)); } } for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; + auto buffer = buffers[l]; + auto list = unpack_index_lists[l]; + Index_type len = m_unpack_index_list_lengths[l]; for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; + auto var = vars[v]; auto haloexchange_unpack_base_lam = KOKKOS_LAMBDA(Index_type i) { - HALOEXCHANGE_UNPACK_BODY; + //#define HALOEXCHANGE_UNPACK_BODY \ + //var[list[i]] = buffer[i]; + var[list[i]] = buffer[i]; + }; Kokkos::parallel_for("HALOEXCHANGE - Unpack Body - Kokkos Lambda", Kokkos::RangePolicy(0, len), haloexchange_unpack_base_lam); - buffer += len; + //buffer += len; + auto end = buffer.extent(0); + decltype(end) begin = len; + buffer = Kokkos::subview(buffer, std::make_pair(begin,end)); } } @@ -211,8 +189,34 @@ Kokkos::parallel_for("HALOEXCHANGE - Pack Body - Kokkos Lambda", } #endif // RUN_KOKKOS + //Usage: moveDataToHostFromKokkosView(pointer, pointer_wrapped_view, iend); + // moveDataToHostFromKokkosView(vars, vars_view, num_neighbors); + // moveDataToHostFromKokkosView(buffers, buffers_view, num_neighbors); +for ( int x = 0; x < m_vars.size(); ++x ) { + //vars.push_back(getViewFromPointer(var, m_var_size)); + moveDataToHostFromKokkosView(m_vars[x], vars[x], m_var_size); +} + +for ( int x = 0; x < m_buffers.size(); ++x ) { + Index_type buffer_len = m_num_vars * m_pack_index_list_lengths[x]; + moveDataToHostFromKokkosView(m_buffers[x], buffers[x], buffer_len); +} + + +for ( int x = 0; x < 
m_pack_index_lists.size(); ++x ) { + + //pack_index_lists.push_back(getViewFromPointer(m_pack_index_lists[x], m_pack_index_list_lengths[x])); + moveDataToHostFromKokkosView(m_pack_index_lists[x], pack_index_lists[x], m_pack_index_list_lengths[x]); +} + + +for ( int x = 0; x < m_unpack_index_lists.size(); ++x ) { + + //unpack_index_lists.push_back(getViewFromPointer(m_unpack_index_lists[x], m_unpack_index_list_lengths[x])); + moveDataToHostFromKokkosView(m_unpack_index_lists[x], unpack_index_lists[x], m_unpack_index_list_lengths[x]); +} } From 621e3829e6cb081952e8554635fdbfbb55440c3e Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 12 Oct 2021 09:24:49 -0600 Subject: [PATCH 113/124] FIR-Kokkos.cpp: Wrapping coeff array in View --- src/apps-kokkos/FIR-Kokkos.cpp | 38 ++++++++-------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/src/apps-kokkos/FIR-Kokkos.cpp b/src/apps-kokkos/FIR-Kokkos.cpp index 82cd49cb1..4094f2916 100644 --- a/src/apps-kokkos/FIR-Kokkos.cpp +++ b/src/apps-kokkos/FIR-Kokkos.cpp @@ -26,20 +26,17 @@ void FIR::runKokkosVariant(VariantID vid) const Index_type ibegin = 0; const Index_type iend = getActualProblemSize() - m_coefflen; - // Macro for 1D Array of defined length of coefficients - FIR_COEFF; - - // Declare & initialize pointers, coefflen FIR_DATA_SETUP; - // Declare coeff array - Real_type coeff[FIR_COEFFLEN]; - + // Wrap 4x4 array, "coeff" in a Kokkos::View; + // "coeff" is used in the FIR_BODY + // Real_type coeff[FIR_COEFFLEN]; + // Macro for 4x4 input array + FIR_COEFF; + // "coeff" is assined the memory location containing the value of the 0th element of coeff_array; + Real_ptr coeff = &coeff_array[0]; - // std::copy(iterator source_first, iterator source_end, iterator target_start); - // Copy the "coeff_array" (in FIR.hpp) into the "coeff" array; both are - // "Real_type" - std::copy(std::begin(coeff_array), std::end(coeff_array), std::begin(coeff)); + auto coeff_view = getViewFromPointer(coeff, 
FIR_COEFFLEN); auto in_view = getViewFromPointer(in, iend + m_coefflen); auto out_view = getViewFromPointer(out, iend + m_coefflen); @@ -81,23 +78,6 @@ void FIR::runKokkosVariant(VariantID vid) break; } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), fir_lam); - - } - stopTimer(); - - break; - } - - */ - case Kokkos_Lambda : { @@ -113,7 +93,7 @@ void FIR::runKokkosVariant(VariantID vid) Real_type sum = 0.0; for (Index_type j = 0; j < coefflen; ++j ) { - sum += coeff[j]*in_view[i+j]; + sum += coeff_view[j]*in_view[i+j]; } out_view[i] = sum; }); From 038c34ee962bade7f3e90f1017280036464b24cd Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 21 Oct 2021 09:11:45 -0600 Subject: [PATCH 114/124] commenting features not integrated into Kokkos testing --- src/common/Executor.cpp | 6 +++--- src/common/RunParams.cpp | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index bf5d93011..afcff9d0a 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -166,7 +166,7 @@ void Executor::setupSuite() for (size_t i = 0; i < exclude_feature_input.size(); ++i) { const string& feature = exclude_feature_input[i]; - +/* bool found_it = false; for (size_t fid = 0; fid < NumFeatures && !found_it; ++fid) { FeatureID tfid = static_cast(fid); @@ -175,7 +175,7 @@ void Executor::setupSuite() for (int kid = 0; kid < NumKernels; ++kid) { KernelID tkid = static_cast(kid); - KernelBase* kern = getKernelObject(tkid, run_params); + //KernelBase* kern = getKernelObject(tkid, run_params); if ( kern->usesFeature(tfid) ) { exclude_kern.insert( tkid ); } @@ -184,7 +184,7 @@ void Executor::setupSuite() } // if input feature name matches feature id } // loop over feature ids until name match is found - +*/ } // loop over feature name input } // if feature name input is valid diff --git a/src/common/RunParams.cpp 
b/src/common/RunParams.cpp index 3638e8a44..8314f5f38 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -671,9 +671,11 @@ void RunParams::printFeatureKernels(std::ostream& str) const } // loop over features str.flush(); } - +// AJP, DZP: Commenting function body, because we have not yet integrated +// with Kokkos testing infrastructure void RunParams::printKernelFeatures(std::ostream& str) const { +/* str << "\nAvailable kernels and features each uses:"; str << "\n-----------------------------------------\n"; for (int kid = 0; kid < NumKernels; ++kid) { @@ -692,6 +694,8 @@ void RunParams::printKernelFeatures(std::ostream& str) const } } // loop over kernels str.flush(); + +*/ } } // closing brace for rajaperf namespace From f9d829dcce14f329b70384269f4b3d2b666dcbfb Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Tue, 26 Oct 2021 14:51:11 -0600 Subject: [PATCH 115/124] Addressing David B.'s first batch of PR Comments --- CMakeLists.txt | 26 ++++++++++++++++---------- src/basic/INIT3-OMPTarget.cpp | 8 ++++---- src/basic/INIT_VIEW1D-OMPTarget.cpp | 8 ++++---- src/basic/PI_ATOMIC.cpp | 2 +- src/stream-kokkos/DOT-Kokkos.cpp | 1 - 5 files changed, 25 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 193650b9d..7ce447b82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,13 +25,14 @@ set(ENABLE_TESTS Off CACHE BOOL "Enable BLT and RAJA tests") set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples") set(RAJA_ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises") -set(CMAKE_CXX_STANDARD 14) -set(BLT_CXX_STANDARD 14) +if(ENABLE_KOKKOS) + set(CMAKE_CXX_STANDARD 14) + set(BLT_CXX_STANDARD 14) +endif() include(blt/SetupBLT.cmake) # # Define RAJA settings... 
-# set(ENABLE_TESTS Off CACHE BOOL "") set(ENABLE_EXAMPLES Off CACHE BOOL "") @@ -79,8 +80,9 @@ endif() if (ENABLE_CUDA) list(APPEND RAJA_PERFSUITE_DEPENDS cuda) endif() -# HIP is used with AMD / VEGA GPU -# Neatly separate RAJAPerf Suite and Kokkos handling of HIP +# Kokkos requires hipcc as the CMAKE_CXX_COMPILER for HIP AMD/VEGA GPU +# platforms, whereas RAJAPerf Suite uses blt/CMake FindHIP to set HIP compiler +# Separate RAJAPerf Suite and Kokkos handling of HIP compilers if ((ENABLE_HIP) AND (NOT ENABLE_KOKKOS)) #if (ENABLE_HIP) list(APPEND RAJA_PERFSUITE_DEPENDS hip) @@ -90,9 +92,10 @@ set(RAJAPERF_BUILD_SYSTYPE $ENV{SYS_TYPE}) set(RAJAPERF_BUILD_HOST $ENV{HOSTNAME}) if (ENABLE_CUDA) - set(CMAKE_CUDA_STANDARD 14) + if (ENABLE_CUDA AND ENABLE_KOKKOS) + set(CMAKE_CUDA_STANDARD 14) + endif() set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict -arch ${CUDA_ARCH} --expt-extended-lambda --expt-relaxed-constexpr") - set(RAJAPERF_COMPILER "${CUDA_NVCC_EXECUTABLE}") list(APPEND RAJAPERF_COMPILER ${CMAKE_CXX_COMPILER}) set(RAJAPERF_COMPILER_OPTIONS "${CUDA_NVCC_FLAGS}") @@ -114,8 +117,10 @@ configure_file(${CMAKE_SOURCE_DIR}/src/rajaperf_config.hpp.in # remove project-specific CMake variables that are no longer needed) set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) -# HACKS TO FIX COMPILATION ISSUES -include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/tpl/RAJA/include/) +# The statement below is required for Kokkos compilation. +if(ENABLE_KOKKOS) + include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/tpl/RAJA/include/) +endif() @@ -123,7 +128,8 @@ include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/tpl/RAJA/include/) if(ENABLE_KOKKOS) add_definitions(-DRUN_KOKKOS) if(ENABLE_HIP) - set(Kokkos_ENABLE_HIP ON CACHE BOOL "Kokkos builds with AMD HIP require a ... 
build...AJP FINISH") + set(Kokkos_ENABLE_HIP ON CACHE BOOL "Kokkos builds for AMD HIP set the +Kokkos_ENABLE_HIP variable to ON") #set(Kokkos_ARCH_VEGA900 ON CACHE BOOL "Docstring") #TODO: better #set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE /ascldap/users/ajpowel/RAJAPerf/amd_build/compiler_unscrewer) endif() diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp index 4abe4bb9d..7d3f9ce05 100644 --- a/src/basic/INIT3-OMPTarget.cpp +++ b/src/basic/INIT3-OMPTarget.cpp @@ -80,10 +80,10 @@ void INIT3::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - //RAJA::forall>( - // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - // INIT3_BODY; - //}); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT3_BODY; + }); } stopTimer(); diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp index b14b4add4..705d2fb6e 100644 --- a/src/basic/INIT_VIEW1D-OMPTarget.cpp +++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp @@ -72,10 +72,10 @@ void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - //RAJA::forall>( - // RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - // INIT_VIEW1D_BODY_RAJA; - //}); + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT_VIEW1D_BODY_RAJA; + }); } stopTimer(); diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index ef5bae171..f6375f3cf 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -72,7 +72,7 @@ void PI_ATOMIC::setUp(VariantID vid) void PI_ATOMIC::updateChecksum(VariantID vid) { - std::cout << "Value is "<<*m_pi< Date: Tue, 2 Nov 2021 12:18:03 -0600 Subject: [PATCH 116/124] Fix some compiler warnings --- src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp | 1 - src/common/RunParams.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp 
b/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp index ed66ca3cb..b594f7f0d 100644 --- a/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp +++ b/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp @@ -189,7 +189,6 @@ void DEL_DOT_VEC_2D::runKokkosVariant(VariantID vid) { m_domain->n_real_zones, working_res); */ - auto deldotvec2d_lam = [=](Index_type i) { DEL_DOT_VEC_2D_BODY; }; auto index_list = getViewFromPointer(m_domain->real_zones, m_domain->n_real_zones); diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 8314f5f38..335e017e4 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -52,7 +52,6 @@ RunParams::RunParams(int argc, char** argv) outfile_prefix("RAJAPerf") { parseCommandLineOptions(argc, argv); - auto foo =0; } From 520357dea5689e5233380f49b77bd0853f465557 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 3 Nov 2021 14:04:43 -0600 Subject: [PATCH 117/124] Merge 1: RPS + RPS AS INFRASTRUCTURE --- src/CMakeLists.txt | 20 +- src/common/CMakeLists.txt | 23 +- src/common/Executor.cpp | 6 + src/common/KernelBase.cpp | 5 +- src/common/KernelBase.hpp | 19 +- src/common/PerfsuiteKernelDefinitions.cpp | 167 ++++++ src/common/PerfsuiteKernelDefinitions.hpp | 16 + src/common/QuickKernelBase.hpp | 3 +- src/common/RAJAPerfSuite.cpp | 678 +--------------------- src/common/RAJAPerfSuite.hpp | 7 + src/common/RPTypes.hpp | 10 +- 11 files changed, 258 insertions(+), 696 deletions(-) create mode 100644 src/common/PerfsuiteKernelDefinitions.cpp create mode 100644 src/common/PerfsuiteKernelDefinitions.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1e22203db..4aee5ffa7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,7 +8,10 @@ include_directories(.) 
+# Added as part of the merge with build only infrastructure: add_subdirectory(common) + +if(NOT INFRASTRUCTURE_ONLY) add_subdirectory(apps) add_subdirectory(apps-kokkos) add_subdirectory(basic) @@ -21,13 +24,15 @@ add_subdirectory(stream) add_subdirectory(stream-kokkos) add_subdirectory(algorithm) add_subdirectory(algorithm-kokkos) - -set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS - common - apps - apps-kokkos +endif() +# Ask David about necessary changes here (wrt to file in Kokkos Kernels) +set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS common) +if(NOT INFRASTRUCTURE_ONLY) +list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS basic basic-kokkos + apps + apps-kokkos #kokkos-mechanics lcals lcals-kokkos @@ -37,6 +42,9 @@ set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS algorithm algorithm-kokkos ) +endif() + +# This line must be kept list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) if(ENABLE_TARGET_OPENMP) @@ -268,9 +276,11 @@ blt_add_executable( ) else() + if(NOT INFRASTRUCTURE_ONLY) blt_add_executable( NAME raja-perf.exe SOURCES RAJAPerfSuiteDriver.cpp DEPENDS_ON ${RAJA_PERFSUITE_EXECUTABLE_DEPENDS} ) + endif() endif() diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index a673d2e43..2c4be2fbc 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -5,14 +5,25 @@ # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### +set(RAJAPERF_COMMON_SRC + Executor.cpp + KernelBase.cpp + OutputUtils.cpp + RunParams.cpp + RAJAPerfSuite.cpp + ) +if(NOT INFRASTRUCTURE_ONLY) + #MESSAGE(FATAL_ERROR "TODO ERROR ${INFRASTRUCTURE_ONLY}") + LIST(APPEND RAJAPERF_COMMON_SRC + DataUtils.cpp + PerfsuiteKernelDefinitions.cpp + ) + endif() blt_add_library( NAME common - SOURCES DataUtils.cpp - Executor.cpp - KernelBase.cpp - OutputUtils.cpp - RAJAPerfSuite.cpp - RunParams.cpp + SOURCES ${RAJAPERF_COMMON_SRC} DEPENDS_ON ${RAJA_PERFSUITE_DEPENDS} ) + + diff --git a/src/common/Executor.cpp 
b/src/common/Executor.cpp index afcff9d0a..b7b96ab30 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -13,9 +13,12 @@ #include "common/OutputUtils.hpp" // Warmup kernels to run first to help reduce startup overheads in timings +#ifndef RAJAPERF_INFRASTRUCTURE_ONLY #include "basic/DAXPY.hpp" #include "basic/REDUCE3_INT.hpp" #include "algorithm/SORT.hpp" +#endif + #include #include @@ -718,6 +721,8 @@ void Executor::runSuite() return; } +#ifndef RAJAPERF_INFRASTRUCTURE_ONLY + cout << "\n\nRun warmup kernels...\n"; vector warmup_kernels; @@ -746,6 +751,7 @@ void Executor::runSuite() delete warmup_kernels[ik]; } +#endif cout << "\n\nRunning specified kernels and variants...\n"; diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index fd1ec8fb2..ecc048846 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -109,8 +109,9 @@ void KernelBase::execute(VariantID vid) running_variant = vid; resetTimer(); - +#ifndef RAJAPERF_INFRASTRUCTURE_ONLY resetDataInitCount(); +#endif this->setUp(vid); #ifdef RUN_KOKKOS Kokkos::Tools::pushRegion(this->getName() + ":"+getVariantName(vid)); @@ -199,7 +200,7 @@ void KernelBase::runKernel(VariantID vid) break; } -#if defined(RUN_KOKKOS) +#if defined(RUN_KOKKOS) or defined (RAJAPERF_INFRASTRUCTURE_ONLY) case Kokkos_Lambda : case Kokkos_Functor : { diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index e8653d733..6a5793d26 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -10,11 +10,16 @@ #define RAJAPerf_KernelBase_HPP #include "common/RAJAPerfSuite.hpp" -#include "common/RPTypes.hpp" -#include "common/DataUtils.hpp" +//#include "common/RPTypes.hpp" #include "common/RunParams.hpp" +#ifndef RAJAPERF_INFRASTRUCTURE_ONLY #include "RAJA/util/Timer.hpp" +#include "common/DataUtils.hpp" +#else +#include "common/BuiltinTimer.hpp" +#endif + #if defined(RAJA_ENABLE_CUDA) #include "RAJA/policy/cuda/raja_cudaerrchk.hpp" #endif @@ -41,6 +46,12 @@ class 
KernelBase KernelBase(KernelID kid, const RunParams& params); KernelBase(std::string name, const RunParams& params); +#ifndef RAJAPERF_INFRASTRUCTURE_ONLY + using TimerType = RAJA::Timer; +#else + using TimerType = rajaperf::ChronoTimer; +#endif + virtual ~KernelBase(); KernelID getKernelID() const { return kernel_id; } @@ -158,12 +169,12 @@ class KernelBase virtual void runOpenMPTargetVariant(VariantID vid) = 0; #endif -#if defined(RUN_KOKKOS) +#if defined(RUN_KOKKOS) or defined(RAJAPERF_INFRASTRUCTURE_ONLY) virtual void runKokkosVariant(VariantID vid) = 0; #endif // RUN_KOKKOS protected: - const RunParams& run_params; + const RunParams run_params; Checksum_type checksum[NumVariants]; Checksum_type checksum_scale_factor; diff --git a/src/common/PerfsuiteKernelDefinitions.cpp b/src/common/PerfsuiteKernelDefinitions.cpp new file mode 100644 index 000000000..d958415b7 --- /dev/null +++ b/src/common/PerfsuiteKernelDefinitions.cpp @@ -0,0 +1,167 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Created by Poliakoff, David Zoeller on 4/26/21. +// + +// Basic kernels... +// +#include "basic/DAXPY.hpp" +#include "basic/IF_QUAD.hpp" +#include "basic/INIT3.hpp" +#include "basic/INIT_VIEW1D.hpp" +#include "basic/INIT_VIEW1D_OFFSET.hpp" +#include "basic/MAT_MAT_SHARED.hpp" +#include "basic/MULADDSUB.hpp" +#include "basic/NESTED_INIT.hpp" +#include "basic/PI_ATOMIC.hpp" +#include "basic/PI_REDUCE.hpp" +#include "basic/REDUCE3_INT.hpp" +#include "basic/TRAP_INT.hpp" + +// +// Lcals kernels... 
+// +#include "lcals/DIFF_PREDICT.hpp" +#include "lcals/EOS.hpp" +#include "lcals/FIRST_DIFF.hpp" +#include "lcals/FIRST_MIN.hpp" +#include "lcals/FIRST_SUM.hpp" +#include "lcals/GEN_LIN_RECUR.hpp" +#include "lcals/HYDRO_1D.hpp" +#include "lcals/HYDRO_2D.hpp" +#include "lcals/INT_PREDICT.hpp" +#include "lcals/PLANCKIAN.hpp" +#include "lcals/TRIDIAG_ELIM.hpp" + +// +// Polybench kernels... +// +#include "polybench/POLYBENCH_2MM.hpp" +#include "polybench/POLYBENCH_3MM.hpp" +#include "polybench/POLYBENCH_ADI.hpp" +#include "polybench/POLYBENCH_ATAX.hpp" +#include "polybench/POLYBENCH_FDTD_2D.hpp" +#include "polybench/POLYBENCH_FLOYD_WARSHALL.hpp" +#include "polybench/POLYBENCH_GEMM.hpp" +#include "polybench/POLYBENCH_GEMVER.hpp" +#include "polybench/POLYBENCH_GESUMMV.hpp" +#include "polybench/POLYBENCH_HEAT_3D.hpp" +#include "polybench/POLYBENCH_JACOBI_1D.hpp" +#include "polybench/POLYBENCH_JACOBI_2D.hpp" +#include "polybench/POLYBENCH_MVT.hpp" + +// +// Stream kernels... +// +#include "stream/COPY.hpp" +#include "stream/MUL.hpp" +#include "stream/ADD.hpp" +#include "stream/TRIAD.hpp" +#include "stream/DOT.hpp" + +// +// Apps kernels... +// +#include "apps/WIP-COUPLE.hpp" +#include "apps/DEL_DOT_VEC_2D.hpp" +#include "apps/DIFFUSION3DPA.hpp" +#include "apps/ENERGY.hpp" +#include "apps/FIR.hpp" +#include "apps/HALOEXCHANGE.hpp" +#include "apps/HALOEXCHANGE_FUSED.hpp" +#include "apps/LTIMES.hpp" +#include "apps/LTIMES_NOVIEW.hpp" +#include "apps/MASS3DPA.hpp" +#include "apps/PRESSURE.hpp" +#include "apps/VOL3D.hpp" + +// +// Algorithm kernels... 
+// +#include "algorithm/SORT.hpp" +#include "algorithm/SORTPAIRS.hpp" + + +#include +namespace rajaperf { + +void make_perfsuite_executor(rajaperf::Executor *exec, int argc, char *argv[]) { + RunParams run_params(argc, argv); + free_register_group(exec, std::string("Basic")); + free_register_group(exec, std::string("Lcals")); + free_register_group(exec, std::string("Polybench")); + free_register_group(exec, std::string("Stream")); + free_register_group(exec, std::string("Apps")); + free_register_group(exec, std::string("Algorithm")); + + // Basic + + free_register_kernel(exec, "Basic", new basic::PI_ATOMIC(run_params)); + free_register_kernel(exec, "Basic", new basic::DAXPY(run_params)); + free_register_kernel(exec, "Basic", new basic::IF_QUAD(run_params)); + free_register_kernel(exec, "Basic", new basic::INIT3(run_params)); + free_register_kernel(exec, "Basic", new basic::INIT_VIEW1D(run_params)); + free_register_kernel(exec, "Basic", new basic::INIT_VIEW1D_OFFSET(run_params)); + free_register_kernel(exec, "Basic", new basic::MULADDSUB(run_params)); + free_register_kernel(exec, "Basic", new basic::NESTED_INIT(run_params)); + free_register_kernel(exec, "Basic", new basic::REDUCE3_INT(run_params)); + free_register_kernel(exec, "Basic", new basic::TRAP_INT(run_params)); + + // Lcals + free_register_kernel(exec, "Lcals", new lcals::DIFF_PREDICT(run_params)); + free_register_kernel(exec, "Lcals", new lcals::EOS(run_params)); + free_register_kernel(exec, "Lcals", new lcals::FIRST_DIFF(run_params)); + free_register_kernel(exec, "Lcals", new lcals::FIRST_MIN(run_params)); + free_register_kernel(exec, "Lcals", new lcals::FIRST_SUM(run_params)); + free_register_kernel(exec, "Lcals", new lcals::GEN_LIN_RECUR(run_params)); + free_register_kernel(exec, "Lcals", new lcals::HYDRO_1D(run_params)); + free_register_kernel(exec, "Lcals", new lcals::HYDRO_2D(run_params)); + free_register_kernel(exec, "Lcals", new lcals::INT_PREDICT(run_params)); + free_register_kernel(exec, "Lcals", 
new lcals::PLANCKIAN(run_params)); + free_register_kernel(exec, "Lcals", new lcals::TRIDIAG_ELIM(run_params)); +/* + // Polybench + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_2MM(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_3MM(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_ADI(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_ATAX(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_FDTD_2D(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_FLOYD_WARSHALL(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_GEMM(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_GEMVER(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_GESUMMV(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_HEAT_3D(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_JACOBI_1D(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_JACOBI_2D(run_params)); + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_MVT(run_params)); +*/ + // Stream + free_register_kernel(exec, "Stream", new stream::ADD(run_params)); + free_register_kernel(exec, "Stream", new stream::COPY(run_params)); + free_register_kernel(exec, "Stream", new stream::DOT(run_params)); + free_register_kernel(exec, "Stream", new stream::MUL(run_params)); + free_register_kernel(exec, "Stream", new stream::TRIAD(run_params)); + + // Apps + //free_register_kernel(exec, "Apps", new apps::COUPLE(run_params)); + free_register_kernel(exec, "Apps", new apps::DEL_DOT_VEC_2D(run_params)); + free_register_kernel(exec, "Apps", new apps::ENERGY(run_params)); + free_register_kernel(exec, "Apps", new apps::FIR(run_params)); + free_register_kernel(exec, "Apps", new 
apps::HALOEXCHANGE(run_params)); + free_register_kernel(exec, "Apps", new apps::LTIMES(run_params)); + free_register_kernel(exec, "Apps", new apps::LTIMES_NOVIEW(run_params)); + free_register_kernel(exec, "Apps", new apps::PRESSURE(run_params)); + free_register_kernel(exec, "Apps", new apps::VOL3D(run_params)); + + // Algorithm + free_register_kernel(exec, "Algorithm", new algorithm::SORT(run_params)); + free_register_kernel(exec, "Algorithm", new algorithm::SORTPAIRS(run_params)); + +} +} // Closing namespace rajaperf diff --git a/src/common/PerfsuiteKernelDefinitions.hpp b/src/common/PerfsuiteKernelDefinitions.hpp new file mode 100644 index 000000000..cbbd50c1a --- /dev/null +++ b/src/common/PerfsuiteKernelDefinitions.hpp @@ -0,0 +1,16 @@ +// +// Created by Poliakoff, David Zoeller on 4/26/21. +// + +#ifndef RAJAPERFSUITE_PERFSUITEKERNELDEFINITIONS_HPP +#define RAJAPERFSUITE_PERFSUITEKERNELDEFINITIONS_HPP + +namespace rajaperf{ + + class Executor; +} +void make_perfsuite_executor(rajaperf::Executor *exec, int argc, char *argv[]); + + + +#endif //RAJAPERFSUITE_PERFSUITEKERNELDEFINITIONS_HPP diff --git a/src/common/QuickKernelBase.hpp b/src/common/QuickKernelBase.hpp index 70289de13..4980a3761 100644 --- a/src/common/QuickKernelBase.hpp +++ b/src/common/QuickKernelBase.hpp @@ -78,7 +78,8 @@ namespace rajaperf { void runOpenMPTargetVariant(VariantID vid) override {} #endif -#if defined(RUN_KOKKOS) +#if defined(RUN_KOKKOS) or defined(RAJAPERF_INFRASTRUCTURE_ONLY) + template void rkv_helper(std::index_sequence) { diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 3cb9fc9e2..d1dd39dca 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -10,162 +10,13 @@ #include "RunParams.hpp" -// -// Basic kernels... 
-// -#include "basic/DAXPY.hpp" -#include "basic/IF_QUAD.hpp" -#include "basic/INIT3.hpp" -#include "basic/INIT_VIEW1D.hpp" -#include "basic/INIT_VIEW1D_OFFSET.hpp" -#include "basic/MAT_MAT_SHARED.hpp" -#include "basic/MULADDSUB.hpp" -#include "basic/NESTED_INIT.hpp" -#include "basic/PI_ATOMIC.hpp" -#include "basic/PI_REDUCE.hpp" -#include "basic/REDUCE3_INT.hpp" -#include "basic/TRAP_INT.hpp" - -// -// Lcals kernels... -// -#include "lcals/DIFF_PREDICT.hpp" -#include "lcals/EOS.hpp" -#include "lcals/FIRST_DIFF.hpp" -#include "lcals/FIRST_MIN.hpp" -#include "lcals/FIRST_SUM.hpp" -#include "lcals/GEN_LIN_RECUR.hpp" -#include "lcals/HYDRO_1D.hpp" -#include "lcals/HYDRO_2D.hpp" -#include "lcals/INT_PREDICT.hpp" -#include "lcals/PLANCKIAN.hpp" -#include "lcals/TRIDIAG_ELIM.hpp" - -// -// Polybench kernels... -// -#include "polybench/POLYBENCH_2MM.hpp" -#include "polybench/POLYBENCH_3MM.hpp" -#include "polybench/POLYBENCH_ADI.hpp" -#include "polybench/POLYBENCH_ATAX.hpp" -#include "polybench/POLYBENCH_FDTD_2D.hpp" -#include "polybench/POLYBENCH_FLOYD_WARSHALL.hpp" -#include "polybench/POLYBENCH_GEMM.hpp" -#include "polybench/POLYBENCH_GEMVER.hpp" -#include "polybench/POLYBENCH_GESUMMV.hpp" -#include "polybench/POLYBENCH_HEAT_3D.hpp" -#include "polybench/POLYBENCH_JACOBI_1D.hpp" -#include "polybench/POLYBENCH_JACOBI_2D.hpp" -#include "polybench/POLYBENCH_MVT.hpp" - -// -// Stream kernels... -// -#include "stream/COPY.hpp" -#include "stream/MUL.hpp" -#include "stream/ADD.hpp" -#include "stream/TRIAD.hpp" -#include "stream/DOT.hpp" - -// -// Apps kernels... -// -#include "apps/WIP-COUPLE.hpp" -#include "apps/DEL_DOT_VEC_2D.hpp" -#include "apps/DIFFUSION3DPA.hpp" -#include "apps/ENERGY.hpp" -#include "apps/FIR.hpp" -#include "apps/HALOEXCHANGE.hpp" -#include "apps/HALOEXCHANGE_FUSED.hpp" -#include "apps/LTIMES.hpp" -#include "apps/LTIMES_NOVIEW.hpp" -#include "apps/MASS3DPA.hpp" -#include "apps/PRESSURE.hpp" -#include "apps/VOL3D.hpp" - -// -// Algorithm kernels... 
-// -#include "algorithm/SORT.hpp" -#include "algorithm/SORTPAIRS.hpp" - +#ifndef RAJAPERF_INFRASTRUCTURE_ONLY +#include "PerfsuiteKernelDefinitions.hpp" +#endif #include namespace rajaperf { - void make_perfsuite_executor(rajaperf::Executor *exec, int argc, char *argv[]) { - //RunParams run_params(argc, argv); - const RunParams& run_params = getRunParams(exec); - free_register_group(exec, std::string("Basic")); - free_register_group(exec, std::string("Lcals")); - free_register_group(exec, std::string("Polybench")); - free_register_group(exec, std::string("Stream")); - free_register_group(exec, std::string("Apps")); - free_register_group(exec, std::string("Algorithm")); - - // Basic - - free_register_kernel(exec, "Basic", new basic::PI_ATOMIC(run_params)); - free_register_kernel(exec, "Basic", new basic::DAXPY(run_params)); - free_register_kernel(exec, "Basic", new basic::IF_QUAD(run_params)); - free_register_kernel(exec, "Basic", new basic::INIT3(run_params)); - free_register_kernel(exec, "Basic", new basic::INIT_VIEW1D(run_params)); - free_register_kernel(exec, "Basic", new basic::INIT_VIEW1D_OFFSET(run_params)); - free_register_kernel(exec, "Basic", new basic::MULADDSUB(run_params)); - free_register_kernel(exec, "Basic", new basic::NESTED_INIT(run_params)); - free_register_kernel(exec, "Basic", new basic::REDUCE3_INT(run_params)); - free_register_kernel(exec, "Basic", new basic::TRAP_INT(run_params)); - // Lcals - free_register_kernel(exec, "Lcals", new lcals::DIFF_PREDICT(run_params)); - free_register_kernel(exec, "Lcals", new lcals::EOS(run_params)); - free_register_kernel(exec, "Lcals", new lcals::FIRST_DIFF(run_params)); - free_register_kernel(exec, "Lcals", new lcals::FIRST_MIN(run_params)); - free_register_kernel(exec, "Lcals", new lcals::FIRST_SUM(run_params)); - free_register_kernel(exec, "Lcals", new lcals::GEN_LIN_RECUR(run_params)); - free_register_kernel(exec, "Lcals", new lcals::HYDRO_1D(run_params)); - free_register_kernel(exec, "Lcals", new 
lcals::HYDRO_2D(run_params)); - free_register_kernel(exec, "Lcals", new lcals::INT_PREDICT(run_params)); - free_register_kernel(exec, "Lcals", new lcals::PLANCKIAN(run_params)); - free_register_kernel(exec, "Lcals", new lcals::TRIDIAG_ELIM(run_params)); - - /** - // Polybench - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_2MM(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_3MM(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_ADI(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_ATAX(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_FDTD_2D(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_FLOYD_WARSHALL(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_GEMM(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_GEMVER(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_GESUMMV(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_HEAT_3D(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_JACOBI_1D(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_JACOBI_2D(run_params)); - free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_MVT(run_params)); -**/ - // Stream - free_register_kernel(exec, "Stream", new stream::ADD(run_params)); - free_register_kernel(exec, "Stream", new stream::COPY(run_params)); - free_register_kernel(exec, "Stream", new stream::DOT(run_params)); - free_register_kernel(exec, "Stream", new stream::MUL(run_params)); - free_register_kernel(exec, "Stream", new stream::TRIAD(run_params)); - // Apps - // Item below is a WIP from the RPS side - // free_register_kernel(exec, "Apps", new apps::COUPLE(run_params)); - free_register_kernel(exec, "Apps", new apps::DEL_DOT_VEC_2D(run_params)); - 
free_register_kernel(exec, "Apps", new apps::ENERGY(run_params)); - free_register_kernel(exec, "Apps", new apps::FIR(run_params)); - free_register_kernel(exec, "Apps", new apps::HALOEXCHANGE(run_params)); - free_register_kernel(exec, "Apps", new apps::LTIMES(run_params)); - free_register_kernel(exec, "Apps", new apps::LTIMES_NOVIEW(run_params)); - free_register_kernel(exec, "Apps", new apps::PRESSURE(run_params)); - free_register_kernel(exec, "Apps", new apps::VOL3D(run_params)); - // Algorithm - free_register_kernel(exec, "Algorithm", new algorithm::SORT(run_params)); - free_register_kernel(exec, "Algorithm", new algorithm::SORTPAIRS(run_params)); - - } /*! ******************************************************************************* @@ -434,7 +285,7 @@ const std::string& getGroupName(GroupID gid) } #endif -#if defined(RUN_KOKKOS) +#if defined(RUN_KOKKOS) or defined(RAJAPERF_INFRASTRUCTURE_ONLY) if (vid == Kokkos_Lambda || vid == Kokkos_Functor) { ret_val = true; @@ -494,525 +345,4 @@ const std::string& getFeatureName(FeatureID fid) * ******************************************************************************* */ -//<<<<<<< HEAD -//KernelBase* getKernelObject(KernelID kid, -// const RunParams& run_params) -//{ -// -// KernelBase* kernel = 0; -// -// switch ( kid ) { -// -// // -// // Basic kernels... 
-// // -// case Basic_DAXPY : { -// kernel = new basic::DAXPY(run_params); -// break; -// } -// case Basic_IF_QUAD : { -// kernel = new basic::IF_QUAD(run_params); -// break; -// } -// case Basic_INIT3 : { -// kernel = new basic::INIT3(run_params); -// break; -// } -// case Basic_INIT_VIEW1D : { -// kernel = new basic::INIT_VIEW1D(run_params); -// break; -// } -// case Basic_INIT_VIEW1D_OFFSET : { -// kernel = new basic::INIT_VIEW1D_OFFSET(run_params); -// break; -// } -// case Basic_MULADDSUB : { -// kernel = new basic::MULADDSUB(run_params); -// break; -// } -// case Basic_NESTED_INIT : { -// kernel = new basic::NESTED_INIT(run_params); -// break; -// } -// case Basic_PI_ATOMIC : { -// kernel = new basic::PI_ATOMIC(run_params); -// break; -// } -// case Basic_PI_REDUCE : { -// kernel = new basic::PI_REDUCE(run_params); -// break; -// } -// case Basic_REDUCE3_INT : { -// kernel = new basic::REDUCE3_INT(run_params); -// break; -// } -// case Basic_TRAP_INT : { -// kernel = new basic::TRAP_INT(run_params); -// break; -// } -// -//// -//// Lcals kernels... 
-// -// case Lcals_DIFF_PREDICT : { -// kernel = new lcals::DIFF_PREDICT(run_params); -// break; -// } -// case Lcals_EOS : { -// kernel = new lcals::EOS(run_params); -// break; -// } -// case Lcals_FIRST_DIFF : { -// kernel = new lcals::FIRST_DIFF(run_params); -// break; -// } -// -// case Lcals_FIRST_MIN : { -// kernel = new lcals::FIRST_MIN(run_params); -// break; -// } -// case Lcals_FIRST_SUM : { -// kernel = new lcals::FIRST_SUM(run_params); -// break; -// } -// case Lcals_GEN_LIN_RECUR : { -// kernel = new lcals::GEN_LIN_RECUR(run_params); -// break; -// } -// case Lcals_HYDRO_1D : { -// kernel = new lcals::HYDRO_1D(run_params); -// break; -// } -// case Lcals_HYDRO_2D : { -// kernel = new lcals::HYDRO_2D(run_params); -// break; -// } -// case Lcals_INT_PREDICT : { -// kernel = new lcals::INT_PREDICT(run_params); -// break; -// } -// case Lcals_PLANCKIAN : { -// kernel = new lcals::PLANCKIAN(run_params); -// break; -// } -// case Lcals_TRIDIAG_ELIM : { -// kernel = new lcals::TRIDIAG_ELIM(run_params); -// break; -// } -// -// -//// Stream kernels... -//// -// case Stream_ADD : { -// kernel = new stream::ADD(run_params); -// break; -// } -// case Stream_COPY : { -// kernel = new stream::COPY(run_params); -// break; -// } -// case Stream_DOT : { -// kernel = new stream::DOT(run_params); -// break; -// } -// case Stream_MUL : { -// kernel = new stream::MUL(run_params); -// break; -// } -// case Stream_TRIAD : { -// kernel = new stream::TRIAD(run_params); -// break; -// } -//// -//// -///** DZP: big comment block for unimplemented -//// Polybench kernels... 
-//// -// case Polybench_2MM : { -// kernel = new polybench::POLYBENCH_2MM(run_params); -// break; -// } -// case Polybench_3MM : { -// kernel = new polybench::POLYBENCH_3MM(run_params); -// break; -// } -// case Polybench_ADI : { -// kernel = new polybench::POLYBENCH_ADI(run_params); -// break; -// } -// case Polybench_ATAX : { -// kernel = new polybench::POLYBENCH_ATAX(run_params); -// break; -// } -// case Polybench_FDTD_2D : { -// kernel = new polybench::POLYBENCH_FDTD_2D(run_params); -// break; -// } -// case Polybench_FLOYD_WARSHALL : { -// kernel = new polybench::POLYBENCH_FLOYD_WARSHALL(run_params); -// break; -// } -// case Polybench_GEMM : { -// kernel = new polybench::POLYBENCH_GEMM(run_params); -// break; -// } -// case Polybench_GEMVER : { -// kernel = new polybench::POLYBENCH_GEMVER(run_params); -// break; -// } -// case Polybench_GESUMMV : { -// kernel = new polybench::POLYBENCH_GESUMMV(run_params); -// break; -// } -// case Polybench_HEAT_3D : { -// kernel = new polybench::POLYBENCH_HEAT_3D(run_params); -// break; -// } -// case Polybench_JACOBI_1D : { -// kernel = new polybench::POLYBENCH_JACOBI_1D(run_params); -// break; -// } -// case Polybench_JACOBI_2D : { -// kernel = new polybench::POLYBENCH_JACOBI_2D(run_params); -// break; -// } -// case Polybench_MVT : { -// kernel = new polybench::POLYBENCH_MVT(run_params); -// break; -// } -// -////////////////////////////////////////////////////////////////// -//// Apps kernels... 
-///* -// case Apps_COUPLE : { -// kernel = new apps::COUPLE(run_params); -// break; -// } -// -// */ -// -// case Apps_DEL_DOT_VEC_2D : { -// kernel = new apps::DEL_DOT_VEC_2D(run_params); -// break; -// } -// case Apps_ENERGY : { -// kernel = new apps::ENERGY(run_params); -// break; -// } -// case Apps_FIR : { -// kernel = new apps::FIR(run_params); -// break; -// } -// case Apps_HALOEXCHANGE : { -// kernel = new apps::HALOEXCHANGE(run_params); -// break; -// } -// case Apps_HALOEXCHANGE_FUSED : { -// kernel = new apps::HALOEXCHANGE_FUSED(run_params); -// break; -// } -// case Apps_LTIMES : { -// kernel = new apps::LTIMES(run_params); -// break; -// } -// case Apps_LTIMES_NOVIEW : { -// kernel = new apps::LTIMES_NOVIEW(run_params); -// break; -// } -// case Apps_MASS3DPA : { -// kernel = new apps::MASS3DPA(run_params); -// break; -// } -// case Apps_PRESSURE : { -// kernel = new apps::PRESSURE(run_params); -// break; -// } -// case Apps_VOL3D : { -// kernel = new apps::VOL3D(run_params); -// break; -// } -// -//// -//// Algorithm kernels... -///* -// case Algorithm_SORT: { -// kernel = new algorithm::SORT(run_params); -// break; -// } -// case Algorithm_SORTPAIRS: { -// kernel = new algorithm::SORTPAIRS(run_params); -// break; -// } -//*/ -// default: { -// std::cout << "\n Unknown Kernel ID = " << kid << std::endl; -// } -// -// } // end switch on kernel id -// -// return kernel; -// } -//======= -/* -KernelBase* getKernelObject(KernelID kid, - const RunParams& run_params) -{ - KernelBase* kernel = 0; - - switch ( kid ) { - - // - // Basic kernels... 
- // - case Basic_DAXPY : { - kernel = new basic::DAXPY(run_params); - break; - } - case Basic_IF_QUAD : { - kernel = new basic::IF_QUAD(run_params); - break; - } - case Basic_INIT3 : { - kernel = new basic::INIT3(run_params); - break; - } - case Basic_INIT_VIEW1D : { - kernel = new basic::INIT_VIEW1D(run_params); - break; - } - case Basic_INIT_VIEW1D_OFFSET : { - kernel = new basic::INIT_VIEW1D_OFFSET(run_params); - break; - } - case Basic_MAT_MAT_SHARED : { - kernel = new basic::MAT_MAT_SHARED(run_params); - break; - } - case Basic_MULADDSUB : { - kernel = new basic::MULADDSUB(run_params); - break; - } - case Basic_NESTED_INIT : { - kernel = new basic::NESTED_INIT(run_params); - break; - } - case Basic_PI_ATOMIC : { - kernel = new basic::PI_ATOMIC(run_params); - break; - } - case Basic_PI_REDUCE : { - kernel = new basic::PI_REDUCE(run_params); - break; - } - case Basic_REDUCE3_INT : { - kernel = new basic::REDUCE3_INT(run_params); - break; - } - case Basic_TRAP_INT : { - kernel = new basic::TRAP_INT(run_params); - break; - } - -// -// Lcals kernels... 
-// - case Lcals_DIFF_PREDICT : { - kernel = new lcals::DIFF_PREDICT(run_params); - break; - } - case Lcals_EOS : { - kernel = new lcals::EOS(run_params); - break; - } - case Lcals_FIRST_DIFF : { - kernel = new lcals::FIRST_DIFF(run_params); - break; - } - case Lcals_FIRST_MIN : { - kernel = new lcals::FIRST_MIN(run_params); - break; - } - case Lcals_FIRST_SUM : { - kernel = new lcals::FIRST_SUM(run_params); - break; - } - case Lcals_GEN_LIN_RECUR : { - kernel = new lcals::GEN_LIN_RECUR(run_params); - break; - } - case Lcals_HYDRO_1D : { - kernel = new lcals::HYDRO_1D(run_params); - break; - } - case Lcals_HYDRO_2D : { - kernel = new lcals::HYDRO_2D(run_params); - break; - } - case Lcals_INT_PREDICT : { - kernel = new lcals::INT_PREDICT(run_params); - break; - } - case Lcals_PLANCKIAN : { - kernel = new lcals::PLANCKIAN(run_params); - break; - } - case Lcals_TRIDIAG_ELIM : { - kernel = new lcals::TRIDIAG_ELIM(run_params); - break; - } - -// -// Polybench kernels... -// - case Polybench_2MM : { - kernel = new polybench::POLYBENCH_2MM(run_params); - break; - } - case Polybench_3MM : { - kernel = new polybench::POLYBENCH_3MM(run_params); - break; - } - case Polybench_ADI : { - kernel = new polybench::POLYBENCH_ADI(run_params); - break; - } - case Polybench_ATAX : { - kernel = new polybench::POLYBENCH_ATAX(run_params); - break; - } - case Polybench_FDTD_2D : { - kernel = new polybench::POLYBENCH_FDTD_2D(run_params); - break; - } - case Polybench_FLOYD_WARSHALL : { - kernel = new polybench::POLYBENCH_FLOYD_WARSHALL(run_params); - break; - } - case Polybench_GEMM : { - kernel = new polybench::POLYBENCH_GEMM(run_params); - break; - } - case Polybench_GEMVER : { - kernel = new polybench::POLYBENCH_GEMVER(run_params); - break; - } - case Polybench_GESUMMV : { - kernel = new polybench::POLYBENCH_GESUMMV(run_params); - break; - } - case Polybench_HEAT_3D : { - kernel = new polybench::POLYBENCH_HEAT_3D(run_params); - break; - } - case Polybench_JACOBI_1D : { - kernel = new 
polybench::POLYBENCH_JACOBI_1D(run_params); - break; - } - case Polybench_JACOBI_2D : { - kernel = new polybench::POLYBENCH_JACOBI_2D(run_params); - break; - } - case Polybench_MVT : { - kernel = new polybench::POLYBENCH_MVT(run_params); - break; - } - -// -// Stream kernels... -// - case Stream_ADD : { - kernel = new stream::ADD(run_params); - break; - } - case Stream_COPY : { - kernel = new stream::COPY(run_params); - break; - } - case Stream_DOT : { - kernel = new stream::DOT(run_params); - break; - } - case Stream_MUL : { - kernel = new stream::MUL(run_params); - break; - } - case Stream_TRIAD : { - kernel = new stream::TRIAD(run_params); - break; - } - -// -// Apps kernels... -// - case Apps_COUPLE : { - kernel = new apps::COUPLE(run_params); - break; - } - case Apps_DEL_DOT_VEC_2D : { - kernel = new apps::DEL_DOT_VEC_2D(run_params); - break; - } - case Apps_DIFFUSION3DPA : { - kernel = new apps::DIFFUSION3DPA(run_params); - break; - } - case Apps_ENERGY : { - kernel = new apps::ENERGY(run_params); - break; - } - case Apps_FIR : { - kernel = new apps::FIR(run_params); - break; - } - case Apps_HALOEXCHANGE : { - kernel = new apps::HALOEXCHANGE(run_params); - break; - } - case Apps_HALOEXCHANGE_FUSED : { - kernel = new apps::HALOEXCHANGE_FUSED(run_params); - break; - } - case Apps_LTIMES : { - kernel = new apps::LTIMES(run_params); - break; - } - case Apps_LTIMES_NOVIEW : { - kernel = new apps::LTIMES_NOVIEW(run_params); - break; - } - case Apps_MASS3DPA : { - kernel = new apps::MASS3DPA(run_params); - break; - } - case Apps_PRESSURE : { - kernel = new apps::PRESSURE(run_params); - break; - } - case Apps_VOL3D : { - kernel = new apps::VOL3D(run_params); - break; - } - -// -// Algorithm kernels... 
-// - case Algorithm_SORT: { - kernel = new algorithm::SORT(run_params); - break; - } - case Algorithm_SORTPAIRS: { - kernel = new algorithm::SORTPAIRS(run_params); - break; - } - - default: { - std::cout << "\n Unknown Kernel ID = " << kid << std::endl; - } - - } // end switch on kernel id - - return kernel; -} ->>>>>>> upstream/develop -*/ } // closing brace for rajaperf namespace diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 6f1026c26..a654ba6e9 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -13,11 +13,18 @@ #ifndef RAJAPerfSuite_HPP #define RAJAPerfSuite_HPP +//#include "common/RPTypes.hpp" +#ifndef RAJAPERF_INFRASTRUCTURE_ONLY #include "RAJA/config.hpp" +//#include "common/PerfsuiteKernelDefinitions.hpp" + + + #if defined(RUN_KOKKOS) #include "Kokkos_Core.hpp" #endif +#endif #include namespace rajaperf diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index e48ba53c1..f0e784344 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -12,9 +12,9 @@ #ifndef RAJAPerf_RPTypes_HPP #define RAJAPerf_RPTypes_HPP - +#ifndef RAJAPERF_INFRASTRUCTURE_ONLY #include "RAJA/util/types.hpp" - +#endif // // Only one of the following (double or float) should be defined. // @@ -56,11 +56,13 @@ using RepIndex_type = volatile int; * ****************************************************************************** */ +#ifndef RAJAPERF_INFRASTRUCTURE_ONLY using Index_type = RAJA::Index_type; -/// +#else +using Index_type = int64_t; +#endif using Index_ptr = Index_type*; - /*! 
****************************************************************************** * From c064d50230e6a8fae33fa0d5d26a67d6e9aef997 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 4 Nov 2021 12:09:34 -0600 Subject: [PATCH 118/124] Merge 2: RPS + INFRASTRUCTURE --- CMakeLists.txt | 2 +- src/common/BuiltinTimer.hpp | 40 ++++++++++++++++++++++++++++++++++ src/common/KernelBase.cpp | 2 +- src/common/KernelBase.hpp | 14 +++++++----- src/common/QuickKernelBase.hpp | 36 +++++++++++++++++++++++++----- 5 files changed, 81 insertions(+), 13 deletions(-) create mode 100644 src/common/BuiltinTimer.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ce447b82..1aeea9cfb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,7 +110,7 @@ else() list(APPEND RAJAPERF_COMPILER_OPTIONS ${CMAKE_CXX_FLAGS}) endif() -configure_file(${CMAKE_SOURCE_DIR}/src/rajaperf_config.hpp.in +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/rajaperf_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/bin/rajaperf_config.hpp) # Make sure RAJA flag propagate (we need to do some house cleaning to diff --git a/src/common/BuiltinTimer.hpp b/src/common/BuiltinTimer.hpp new file mode 100644 index 000000000..f20a7f543 --- /dev/null +++ b/src/common/BuiltinTimer.hpp @@ -0,0 +1,40 @@ +// +// Created by Poliakoff, David Zoeller on 4/26/21. 
+// +#include +#include +#ifndef RAJAPERFSUITE_BUILTINTIMER_HPP +#define RAJAPERFSUITE_BUILTINTIMER_HPP +namespace rajaperf { + class ChronoTimer { + public: + using ElapsedType = double; + + private: + using ClockType = std::chrono::steady_clock; + using TimeType = ClockType::time_point; + using DurationType = std::chrono::duration; + + public: + ChronoTimer() : tstart(ClockType::now()), tstop(ClockType::now()), telapsed(0) { + } + + void start() { tstart = ClockType::now(); } + + void stop() { + tstop = ClockType::now(); + telapsed += + std::chrono::duration_cast(tstop - tstart).count(); + } + + ElapsedType elapsed() const { return telapsed; } + + void reset() { telapsed = 0; } + + private: + TimeType tstart; + TimeType tstop; + ElapsedType telapsed; + }; +} +#endif //RAJAPERFSUITE_BUILTINTIMER_HPP diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index ecc048846..9a2bccd13 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -132,7 +132,7 @@ void KernelBase::recordExecTime() { num_exec[running_variant]++; - RAJA::Timer::ElapsedType exec_time = timer.elapsed(); + TimerType::ElapsedType exec_time = timer.elapsed(); min_time[running_variant] = std::min(min_time[running_variant], exec_time); max_time[running_variant] = std::max(max_time[running_variant], exec_time); tot_time[running_variant] += exec_time; diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 6a5793d26..65dbe1efc 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -10,7 +10,7 @@ #define RAJAPerf_KernelBase_HPP #include "common/RAJAPerfSuite.hpp" -//#include "common/RPTypes.hpp" +#include "common/RPTypes.hpp" #include "common/RunParams.hpp" #ifndef RAJAPERF_INFRASTRUCTURE_ONLY @@ -211,11 +211,15 @@ class KernelBase int num_exec[NumVariants]; - RAJA::Timer timer; - RAJA::Timer::ElapsedType min_time[NumVariants]; - RAJA::Timer::ElapsedType max_time[NumVariants]; - RAJA::Timer::ElapsedType tot_time[NumVariants]; + TimerType 
timer; + + TimerType::ElapsedType min_time[NumVariants]; + TimerType::ElapsedType max_time[NumVariants]; + TimerType::ElapsedType tot_time[NumVariants]; + + + }; } // closing brace for rajaperf namespace diff --git a/src/common/QuickKernelBase.hpp b/src/common/QuickKernelBase.hpp index 4980a3761..49b51639c 100644 --- a/src/common/QuickKernelBase.hpp +++ b/src/common/QuickKernelBase.hpp @@ -22,7 +22,7 @@ namespace rajaperf { using runData_helper = decltype(m_setup(0, 0)); using runData = typename std::conditional::value, empty, runData_helper>::type; using is_empty = std::is_same; - runData rd; + runData *rd; public: QuickKernelBase(std::string &name, const RunParams ¶ms, SetUp se, Execute ex, Checksum ch) : KernelBase( name, @@ -30,22 +30,41 @@ namespace rajaperf { m_setup(se), m_execute(ex), m_checksum( - ch) {} + ch) { + + + +setVariantDefined(Kokkos_Lambda); + setDefaultProblemSize(100000); + setActualProblemSize(100000); + setDefaultReps(5000); + +} QuickKernelBase(std::string &name, const RunParams ¶ms, SetUp se, Execute ex) : KernelBase(name, params), m_setup(se), m_execute(ex), m_checksum( - SureBuddyOkay()) {} - + SureBuddyOkay() +) { + +setVariantDefined(Kokkos_Lambda); + setDefaultProblemSize(100000); + setActualProblemSize(100); + setDefaultReps(5); + +} + ~QuickKernelBase(){ + free(rd); +} Real_type m_y; void setUpHelper(std::true_type) { } void setUpHelper(std::false_type) { - rd = m_setup(0, 0); + rd = new runData(m_setup(getItsPerRep(), getActualProblemSize())); } void setUp(VariantID vid) override { @@ -85,7 +104,7 @@ namespace rajaperf { void rkv_helper(std::index_sequence) { auto size = getActualProblemSize(); for (int x = 0; x < getRunReps(); ++x) { - m_execute(x, size, std::get(rd)...); + m_execute(x, size, std::get(*rd)...); } } @@ -108,7 +127,12 @@ namespace rajaperf { } void runKokkosVariant(VariantID vid) override { + Kokkos::fence(); + startTimer(); rkv_switch_on_empty(is_empty()); + Kokkos::fence(); + stopTimer(); + } #endif // RUN_KOKKOS 
From 29dfbfd1b56582cf602f7b0478241b7c07b32a1e Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 4 Nov 2021 13:32:12 -0600 Subject: [PATCH 119/124] Fixing test reps and problem size numbers --- src/common/QuickKernelBase.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/QuickKernelBase.hpp b/src/common/QuickKernelBase.hpp index 49b51639c..2cd7a60c6 100644 --- a/src/common/QuickKernelBase.hpp +++ b/src/common/QuickKernelBase.hpp @@ -51,8 +51,8 @@ setVariantDefined(Kokkos_Lambda); setVariantDefined(Kokkos_Lambda); setDefaultProblemSize(100000); - setActualProblemSize(100); - setDefaultReps(5); + setActualProblemSize(100000); + setDefaultReps(5000); } ~QuickKernelBase(){ From 393504bd516011536af373943c3b487f58649808 Mon Sep 17 00:00:00 2001 From: David Poliakoff Date: Mon, 8 Nov 2021 10:36:01 -0700 Subject: [PATCH 120/124] Fixed timing code --- src/common/KernelBase.cpp | 6 ------ src/common/KernelBase.hpp | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index ed0ec9d87..3d921fd34 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -110,13 +110,7 @@ void KernelBase::execute(VariantID vid) resetDataInitCount(); this->setUp(vid); -#ifdef RUN_KOKKOS - Kokkos::Tools::pushRegion(this->getName() + ":"+getVariantName(vid)); -#endif this->runKernel(vid); -#ifdef RUN_KOKKOS - Kokkos::Tools::popRegion(); -#endif this->updateChecksum(vid); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 88035c9e5..9afc5087a 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -122,12 +122,18 @@ class KernelBase void startTimer() { synchronize(); + #ifdef RUN_KOKKOS + Kokkos::Tools::pushRegion(this->getName()); + #endif timer.start(); } void stopTimer() { synchronize(); + #ifdef RUN_KOKKOS + Kokkos::Tools::popRegion(this->getName()); + #endif timer.stop(); recordExecTime(); } From 
b7de26a0d57be123acd78fd9de06c5a2ea833603 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 11 Nov 2021 10:50:22 -0700 Subject: [PATCH 121/124] rm name passed to pop.region() --- src/common/KernelBase.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 5beb10d8e..48f16f3de 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -141,7 +141,7 @@ class KernelBase { synchronize(); #ifdef RUN_KOKKOS - Kokkos::Tools::popRegion(this->getName()); + Kokkos::Tools::popRegion(); #endif timer.stop(); recordExecTime(); } From 3dc1751c06bbbac354a752c5a205ce533727faa1 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Mon, 29 Nov 2021 13:05:06 -0700 Subject: [PATCH 122/124] First major clean up for David B.: does not include commented Executor.cpp, Executor.hpp --- CMakeLists.txt | 27 +-- blt | 2 +- src/algorithm-kokkos/SORT-Kokkos.cpp | 29 +-- src/algorithm-kokkos/SORTPAIRS-Kokkos.cpp | 50 +---- src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp | 114 +----------- src/apps-kokkos/ENERGY-Kokkos.cpp | 78 +------- src/apps-kokkos/FIR-Kokkos.cpp | 30 --- src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp | 62 +------ src/apps-kokkos/LTIMES-Kokkos.cpp | 96 ---------- src/apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp | 61 ++---- src/apps-kokkos/PRESSURE-Kokkos.cpp | 65 +------ src/apps-kokkos/VOL3D-Kokkos.cpp | 50 +---- src/basic-kokkos/CMakeLists.txt | 7 - src/basic-kokkos/DAXPY-Kokkos.cpp | 21 +-- src/basic-kokkos/IF_QUAD-Kokkos.cpp | 11 -- src/basic-kokkos/INIT3-Kokkos.cpp | 40 +--- src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp | 51 ------ .../INIT_VIEW1D_OFFSET-Kokkos.cpp | 53 +----- src/basic-kokkos/MULADDSUB-Kokkos.cpp | 41 +---- src/basic-kokkos/NESTED_INIT-Kokkos.cpp | 90 +++------ src/basic-kokkos/PI_ATOMIC-Kokkos.cpp | 63 +------ src/common/Executor.hpp | 3 - src/common/PerfsuiteKernelDefinitions.cpp | 2 + src/common/RAJAPerfSuite.cpp | 2 +- src/common/RAJAPerfSuite.hpp | 173 ++++-------------- 
src/common/RPTypes.hpp | 9 +- src/common/RunParams.cpp | 33 +--- src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp | 54 +----- src/lcals-kokkos/EOS-Kokkos.cpp | 46 ----- src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp | 52 +----- src/lcals-kokkos/FIRST_MIN-Kokkos.cpp | 77 +------- src/lcals-kokkos/FIRST_SUM-Kokkos.cpp | 50 ----- src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp | 67 +------ src/lcals-kokkos/HYDRO_1D-Kokkos.cpp | 50 +---- src/lcals-kokkos/HYDRO_2D-Kokkos.cpp | 127 +------------ src/lcals-kokkos/INT_PREDICT-Kokkos.cpp | 47 +---- src/lcals-kokkos/PLANCKIAN-Kokkos.cpp | 46 ----- src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp | 45 ----- src/stream-kokkos/ADD-Kokkos.cpp | 58 +----- src/stream-kokkos/COPY-Kokkos.cpp | 60 +----- src/stream-kokkos/DOT-Kokkos.cpp | 83 +-------- src/stream-kokkos/MUL-Kokkos.cpp | 67 ------- src/stream-kokkos/TRIAD-Kokkos.cpp | 78 ++------ 43 files changed, 144 insertions(+), 2126 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1aeea9cfb..deab22a9f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,8 +55,6 @@ include_directories(${RAJA_INCLUDE_DIRS}) # # Setup variables to pass to Perf suite -# - # # These (hopefully temporary) macro constants are needed to work-around # performance issues in the xl compiler. 
@@ -80,9 +78,11 @@ endif() if (ENABLE_CUDA) list(APPEND RAJA_PERFSUITE_DEPENDS cuda) endif() + # Kokkos requires hipcc as the CMAKE_CXX_COMPILER for HIP AMD/VEGA GPU # platforms, whereas RAJAPerf Suite uses blt/CMake FindHIP to set HIP compiler # Separate RAJAPerf Suite and Kokkos handling of HIP compilers + if ((ENABLE_HIP) AND (NOT ENABLE_KOKKOS)) #if (ENABLE_HIP) list(APPEND RAJA_PERFSUITE_DEPENDS hip) @@ -113,7 +113,7 @@ endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/rajaperf_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/bin/rajaperf_config.hpp) -# Make sure RAJA flag propagate (we need to do some house cleaning to +# Make sure RAJA flag propagate (we need to do some tidying to # remove project-specific CMake variables that are no longer needed) set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) @@ -123,7 +123,6 @@ if(ENABLE_KOKKOS) endif() - # ENABLE KOKKOS IS A RAJA PERFSUITE OPTION if(ENABLE_KOKKOS) add_definitions(-DRUN_KOKKOS) @@ -145,12 +144,6 @@ Kokkos_ENABLE_HIP variable to ON") endif() endif() - - - - - - #add_definitions(-DRAJA_ENABLE_TARGET_OPENMP) endif() @@ -168,18 +161,6 @@ Kokkos_ENABLE_HIP variable to ON") add_subdirectory(tpl/kokkos) if(ENABLE_CUDA) - #get_target_property(kokkos_core_files kokkoscore SOURCES) - #get_target_property(kokkos_container_files kokkoscontainers SOURCES) - ##message(STATUS "KOKKOS FILES: ${kokkos_core_files}") - #foreach(kokkos_core_file IN LISTS kokkos_core_files) - # set_source_files_properties(${kokkos_core_file} PROPERTIES COMPILE_LANGUAGE CUDA) - # get_source_file_property(local ${kokkos_core_file} LANGUAGE) - # set(remote "") - # #get_source_file_property(remote ${kokkos_core_file} DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/tpl/kokkos/core/src LANGUAGE) - # message(STATUS "Kokkos file: ${kokkos_core_file} ${local} ${remote}") - # - #endforeach() - #set_target_properties(kokkoscore kokkoscontainers PROPERTIES LANGUAGE CUDA) endif() get_property(KOKKOS_INCLUDE_DIRS DIRECTORY tpl/kokkos PROPERTY INCLUDE_DIRECTORIES) 
include_directories(${KOKKOS_INCLUDE_DIRS}) @@ -187,8 +168,6 @@ Kokkos_ENABLE_HIP variable to ON") list(APPEND RAJA_PERFSUITE_DEPENDS kokkos) endif() - -# # Each directory in the perf suite has its own CMakeLists.txt file. # DZP, AJP, DB, DA fixes add_subdirectory(src) diff --git a/blt b/blt index ddd5a0ca7..d14490144 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit ddd5a0ca7c566d0ae14270b66625c8a363630ddb +Subproject commit d144901443362ff153291121717a28778a703c60 diff --git a/src/algorithm-kokkos/SORT-Kokkos.cpp b/src/algorithm-kokkos/SORT-Kokkos.cpp index e13f4d0c9..dfc0292fa 100644 --- a/src/algorithm-kokkos/SORT-Kokkos.cpp +++ b/src/algorithm-kokkos/SORT-Kokkos.cpp @@ -29,7 +29,7 @@ void SORT::runKokkosVariant(VariantID vid) SORT_DATA_SETUP; - // Instantiate Kokkos Views + // Wrap pointers in Kokkos view auto x_view = getViewFromPointer(x, iend*run_reps); @@ -37,33 +37,6 @@ void SORT::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - std::sort(STD_SORT_ARGS); - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::sort(RAJA_SORT_ARGS); - - } - stopTimer(); - - break; - } -*/ - case Kokkos_Lambda : { Kokkos::fence(); diff --git a/src/algorithm-kokkos/SORTPAIRS-Kokkos.cpp b/src/algorithm-kokkos/SORTPAIRS-Kokkos.cpp index 85015ff7b..68504e3b6 100644 --- a/src/algorithm-kokkos/SORTPAIRS-Kokkos.cpp +++ b/src/algorithm-kokkos/SORTPAIRS-Kokkos.cpp @@ -23,7 +23,8 @@ namespace algorithm void SORTPAIRS::runKokkosVariant(VariantID vid) { - // FIXME + // Here, we are returning for configure, build and running purposes, + // because Kokkos does not yet have a "sort pairs" capability return; const Index_type run_reps = getRunReps(); @@ -32,57 +33,10 @@ void SORTPAIRS::runKokkosVariant(VariantID vid) SORTPAIRS_DATA_SETUP; - // instatiate Kokkos Views - // auto x_view = 
getViewFromPointer(x, iend); - //auto i_view = getViewFromPointer(i, iend); #if defined (RUN_KOKKOS) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - using pair_type = std::pair; - - std::vector vector_of_pairs; - vector_of_pairs.reserve(iend-ibegin); - - for (Index_type iemp = ibegin; iemp < iend; ++iemp) { - vector_of_pairs.emplace_back(x[iend*irep + iemp], i[iend*irep + iemp]); - } - - std::sort(vector_of_pairs.begin(), vector_of_pairs.end(), - [](pair_type const& lhs, pair_type const& rhs) { - return lhs.first < rhs.first; - }); - - for (Index_type iemp = ibegin; iemp < iend; ++iemp) { - pair_type& pair = vector_of_pairs[iemp - ibegin]; - x[iend*irep + iemp] = pair.first; - i[iend*irep + iemp] = pair.second; - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::sort_pairs(RAJA_SORTPAIRS_ARGS); - - } - stopTimer(); - - break; - } - */ /* case Kokkos_Lambda : { diff --git a/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp b/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp index b594f7f0d..9ccfcefed 100644 --- a/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp +++ b/src/apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp @@ -22,7 +22,7 @@ namespace apps { struct arrayOffSetStruct { using ViewType = Kokkos::View; // Real_ptr is equivalent to float* - // v's are offsets; + // v's represent different offsets in different Kokkos views; ViewType v, v4, v1, v2, v3; // constructor @@ -54,11 +54,7 @@ void DEL_DOT_VEC_2D::runKokkosVariant(VariantID vid) { NDSET2D(m_domain->jp, xdot, fx1, fx2, fx3, fx4); NDSET2D(m_domain->jp, ydot, fy1, fy2, fy3, fy4); - // Instantiating Kokkos Views with getViewFromPointer - //auto x_view = getViewFromPointer(x, m_domain->nnalls); - //auto y_view = getViewFromPointer(y, iend); - //auto xdot_view = getViewFromPointer(xdot, iend); - //auto ydot_view = getViewFromPointer(ydot, iend); + // Instantiating Kokkos view 
auto div_view = getViewFromPointer(div, m_domain->nnalls); arrayOffSetStruct x_offsets("x_offsets", m_domain->nnalls, m_domain->jp, x ); @@ -93,93 +89,13 @@ void DEL_DOT_VEC_2D::runKokkosVariant(VariantID vid) { auto& fy3_view = ydot_offsets.v3; auto& fy4_view = ydot_offsets.v4; - // Use Kokkos::Subviews - /* - auto x1_view = getViewFromPointer(x1, iend); - auto x2_view = getViewFromPointer(x2, iend); - auto x3_view = getViewFromPointer(x3, iend); - auto x4_view = getViewFromPointer(x4, iend); - - auto y1_view = getViewFromPointer(y1, iend); - auto y2_view = getViewFromPointer(y2, iend); - auto y3_view = getViewFromPointer(y3, iend); - auto y4_view = getViewFromPointer(y4, iend); - - auto fx1_view = getViewFromPointer(fx1, iend); - auto fx2_view = getViewFromPointer(fx2, iend); - auto fx3_view = getViewFromPointer(fx3, iend); - auto fx4_view = getViewFromPointer(fx4, iend); - - auto fy1_view = getViewFromPointer(fy1, iend); - auto fy2_view = getViewFromPointer(fy2, iend); - auto fy3_view = getViewFromPointer(fy3, iend); - auto fy4_view = getViewFromPointer(fy4, iend); - -*/ #if defined(RUN_KOKKOS) switch (vid) { - case Base_Seq: { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type ii = ibegin; ii < iend; ++ii) { - DEL_DOT_VEC_2D_BODY_INDEX; - DEL_DOT_VEC_2D_BODY; - } - } - stopTimer(); - - break; - } - - // #if defined(RUN_RAJA_SEQ) - case Lambda_Seq: { - - auto deldotvec2d_base_lam = [=](Index_type ii) { - DEL_DOT_VEC_2D_BODY_INDEX; - DEL_DOT_VEC_2D_BODY; - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type ii = ibegin; ii < iend; ++ii) { - deldotvec2d_base_lam(ii); - } - } - stopTimer(); - - break; - } - /* - case RAJA_Seq : { - - camp::resources::Resource working_res{camp::resources::Host()}; - RAJA::TypedListSegment zones(m_domain->real_zones, - m_domain->n_real_zones, - working_res); - - auto deldotvec2d_lam = [=](Index_type i) { - DEL_DOT_VEC_2D_BODY; - }; - - 
startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall(zones, deldotvec2d_lam); - - } - stopTimer(); - - break; - } - */ - case Kokkos_Lambda: { + // Translation from RAJAPerf Suite to Kokkos notes: // Host resource will be used for loop execution // camp::resources::Resource working_res{camp::resources::Host()}; @@ -262,7 +178,6 @@ void DEL_DOT_VEC_2D::runKokkosVariant(VariantID vid) { break; } - //#endif // RUN_RAJA_SEQ default: { std::cout << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid @@ -273,35 +188,12 @@ void DEL_DOT_VEC_2D::runKokkosVariant(VariantID vid) { #endif // RUN_KOKKOS - // moveDataToHostFromKokkosView(a, a_view, iend); moveDataToHostFromKokkosView(x, x_view, m_domain->nnalls); moveDataToHostFromKokkosView(y, y_view, m_domain->nnalls); moveDataToHostFromKokkosView(xdot, xdot_view, m_domain->nnalls); moveDataToHostFromKokkosView(ydot, ydot_view, m_domain->nnalls); moveDataToHostFromKokkosView(div, div_view, m_domain->nnalls); -/* - moveDataToHostFromKokkosView(x1, x1_view, iend); - moveDataToHostFromKokkosView(x2, x2_view, iend); - moveDataToHostFromKokkosView(x3, x3_view, iend); - moveDataToHostFromKokkosView(x4, x4_view, iend); - - moveDataToHostFromKokkosView(y1, y1_view, iend); - moveDataToHostFromKokkosView(y2, y2_view, iend); - moveDataToHostFromKokkosView(y3, y3_view, iend); - moveDataToHostFromKokkosView(y4, y4_view, iend); - - moveDataToHostFromKokkosView(fx1, fx1_view, iend); - moveDataToHostFromKokkosView(fx2, fx2_view, iend); - moveDataToHostFromKokkosView(fx3, fx3_view, iend); - moveDataToHostFromKokkosView(fx4, fx4_view, iend); - - moveDataToHostFromKokkosView(fy1, fy1_view, iend); - moveDataToHostFromKokkosView(fy2, fy2_view, iend); - moveDataToHostFromKokkosView(fy3, fy3_view, iend); - moveDataToHostFromKokkosView(fy4, fy4_view, iend); -*/ - } diff --git a/src/apps-kokkos/ENERGY-Kokkos.cpp b/src/apps-kokkos/ENERGY-Kokkos.cpp index 2dd43ccbd..a19bdfbc2 100644 --- 
a/src/apps-kokkos/ENERGY-Kokkos.cpp +++ b/src/apps-kokkos/ENERGY-Kokkos.cpp @@ -27,8 +27,7 @@ void ENERGY::runKokkosVariant(VariantID vid) ENERGY_DATA_SETUP; - // Instantiate Kokkos::Views - //auto a_view = getViewFromPointer(a, iend); + // Wrap pointers in Kokkos views auto e_new_view = getViewFromPointer(e_new, iend); auto e_old_view = getViewFromPointer(e_old, iend); @@ -69,76 +68,6 @@ void ENERGY::runKokkosVariant(VariantID vid) #if defined(RUN_KOKKOS) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY1; - } - - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY2; - } - - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY3; - } - - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY4; - } - - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY5; - } - - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY6; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - energy_lam1(i); - } - - for (Index_type i = ibegin; i < iend; ++i ) { - energy_lam2(i); - } - - for (Index_type i = ibegin; i < iend; ++i ) { - energy_lam3(i); - } - - for (Index_type i = ibegin; i < iend; ++i ) { - energy_lam4(i); - } - - for (Index_type i = ibegin; i < iend; ++i ) { - energy_lam5(i); - } - - for (Index_type i = ibegin; i < iend; ++i ) { - energy_lam6(i); - } - - } - stopTimer(); - - break; - } - case Kokkos_Lambda : { startTimer(); @@ -147,7 +76,7 @@ void ENERGY::runKokkosVariant(VariantID vid) Kokkos::parallel_for("ENERGY - lambda 1", Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(const int64_t i){ - // Lamda Body 1 + //#define ENERGY_BODY1 e_new_view[i] = e_old_view[i] - 0.5 * delvc_view[i] * \ (p_old_view[i] + ql_old_view[i]) + 0.5 * work_view[i]; @@ -187,7 +116,6 @@ void 
ENERGY::runKokkosVariant(VariantID vid) }); - Kokkos::parallel_for("ENERGY - lambda 4", Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(const int64_t i){ @@ -269,7 +197,7 @@ void ENERGY::runKokkosVariant(VariantID vid) #endif // RUN_KOKKOS - //moveDataToHostFromKokkosView(a, a_view, iend); + // Move data from Kokkos view on device back to the host moveDataToHostFromKokkosView(e_new, e_new_view, iend); moveDataToHostFromKokkosView(e_old, e_old_view, iend); moveDataToHostFromKokkosView(delvc, delvc_view, iend); diff --git a/src/apps-kokkos/FIR-Kokkos.cpp b/src/apps-kokkos/FIR-Kokkos.cpp index 4094f2916..696af71eb 100644 --- a/src/apps-kokkos/FIR-Kokkos.cpp +++ b/src/apps-kokkos/FIR-Kokkos.cpp @@ -49,36 +49,6 @@ void FIR::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - FIR_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - fir_lam(i); - } - - } - stopTimer(); - - break; - } - case Kokkos_Lambda : { Kokkos::fence(); diff --git a/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp b/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp index 0aa022dfb..ba1c400ff 100644 --- a/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp +++ b/src/apps-kokkos/HALOEXCHANGE-Kokkos.cpp @@ -20,10 +20,9 @@ namespace apps void HALOEXCHANGE::runKokkosVariant(VariantID vid) { - //FIXME - //return; const Index_type run_reps = getRunReps(); + // Nota bene: ibegin, iend not defined for this kernel // Instead: // Index_type num_neighbors = s_num_neighbors; @@ -32,14 +31,11 @@ void HALOEXCHANGE::runKokkosVariant(VariantID vid) // apps/HALOEXCHANGE.cpp: m_num_vars_default = 3; // apps/HALOEXCHANGE.hpp: static const int s_num_neighbors = 26; - //HALOEXCHANGE_DATA_SETUP; + // HALOEXCHANGE_DATA_SETUP; // Declare and define Kokkos Views // 
Preserving the names of the pointer variables to avoid typo errors in the // Kokkos_Lambda expressions -// -//auto vars_view = getViewFromPointer(vars, num_neighbors); -//auto buffers_view = getViewFromPointer(buffers, num_neighbors); std::vector> vars; std::vector> buffers; @@ -72,53 +68,7 @@ auto num_vars = m_num_vars; #if defined(RUN_KOKKOS) switch ( vid ) { -/* - case RAJA_Seq : { - using EXEC_POL = RAJA::loop_exec; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = pack_index_lists[l]; - Index_type len = pack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_pack_base_lam = [=](Index_type i) { - HALOEXCHANGE_PACK_BODY; - }; - RAJA::forall( - RAJA::TypedRangeSegment(0, len), - haloexchange_pack_base_lam ); - buffer += len; - } - } - - for (Index_type l = 0; l < num_neighbors; ++l) { - Real_ptr buffer = buffers[l]; - Int_ptr list = unpack_index_lists[l]; - Index_type len = unpack_index_list_lengths[l]; - for (Index_type v = 0; v < num_vars; ++v) { - Real_ptr var = vars[v]; - auto haloexchange_unpack_base_lam = [=](Index_type i) { - HALOEXCHANGE_UNPACK_BODY; - }; - RAJA::forall( - RAJA::TypedRangeSegment(0, len), - haloexchange_unpack_base_lam ); - buffer += len; - } - } - - } - stopTimer(); - - break; - } - -*/ case Kokkos_Lambda : { Kokkos::fence(); @@ -189,13 +139,9 @@ Kokkos::parallel_for("HALOEXCHANGE - Pack Body - Kokkos Lambda", } #endif // RUN_KOKKOS - //Usage: moveDataToHostFromKokkosView(pointer, pointer_wrapped_view, iend); - // moveDataToHostFromKokkosView(vars, vars_view, num_neighbors); - // moveDataToHostFromKokkosView(buffers, buffers_view, num_neighbors); - for ( int x = 0; x < m_vars.size(); ++x ) { - //vars.push_back(getViewFromPointer(var, m_var_size)); + //RAJAPerf Suite operation: vars.push_back(getViewFromPointer(var, m_var_size)); 
moveDataToHostFromKokkosView(m_vars[x], vars[x], m_var_size); } @@ -207,7 +153,7 @@ for ( int x = 0; x < m_buffers.size(); ++x ) { for ( int x = 0; x < m_pack_index_lists.size(); ++x ) { - //pack_index_lists.push_back(getViewFromPointer(m_pack_index_lists[x], m_pack_index_list_lengths[x])); + //RAJAPerf Suite operation: pack_index_lists.push_back(getViewFromPointer(m_pack_index_lists[x], m_pack_index_list_lengths[x])); moveDataToHostFromKokkosView(m_pack_index_lists[x], pack_index_lists[x], m_pack_index_list_lengths[x]); } diff --git a/src/apps-kokkos/LTIMES-Kokkos.cpp b/src/apps-kokkos/LTIMES-Kokkos.cpp index f2a8cd65c..880b690b0 100644 --- a/src/apps-kokkos/LTIMES-Kokkos.cpp +++ b/src/apps-kokkos/LTIMES-Kokkos.cpp @@ -20,8 +20,6 @@ namespace apps void LTIMES::runKokkosVariant(VariantID vid) { - // FIXME - //return; const Index_type run_reps = getRunReps(); @@ -35,101 +33,9 @@ void LTIMES::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type z = 0; z < num_z; ++z ) { - for (Index_type g = 0; g < num_g; ++g ) { - for (Index_type m = 0; m < num_m; ++m ) { - for (Index_type d = 0; d < num_d; ++d ) { - LTIMES_BODY; - } - } - } - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - auto ltimes_base_lam = [=](Index_type d, Index_type z, - Index_type g, Index_type m) { - LTIMES_BODY; - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type z = 0; z < num_z; ++z ) { - for (Index_type g = 0; g < num_g; ++g ) { - for (Index_type m = 0; m < num_m; ++m ) { - for (Index_type d = 0; d < num_d; ++d ) { - ltimes_base_lam(d, z, g, m); - } - } - } - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - LTIMES_VIEWS_RANGES_RAJA; - - auto ltimes_lam = [=](ID d, IZ z, IG g, IM m) { - LTIMES_BODY_RAJA; - }; - - - using EXEC_POL = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::loop_exec, // z - 
RAJA::statement::For<2, RAJA::loop_exec, // g - RAJA::statement::For<3, RAJA::loop_exec, // m - RAJA::statement::For<0, RAJA::loop_exec, // d - RAJA::statement::Lambda<0> - > - > - > - > - >; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::kernel( RAJA::make_tuple(IDRange(0, num_d), - IZRange(0, num_z), - IGRange(0, num_g), - IMRange(0, num_m)), - ltimes_lam - ); - - } - stopTimer(); - - break; - } -*/ - case Kokkos_Lambda : { - //LTIMES_VIEWS_RANGES_RAJA; - - - // Kokkos uses MDRange to model tightly-nested loops - - Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -161,12 +67,10 @@ void LTIMES::runKokkosVariant(VariantID vid) #endif // RUN_KOKKOS -// moveDataToHostFromKokkosView(a, a_view, iend); moveDataToHostFromKokkosView(phidat, phi, num_z, num_g, num_m); moveDataToHostFromKokkosView(psidat, psi, num_z, num_g, num_d); moveDataToHostFromKokkosView(elldat, ell, num_m, num_d); - } } // end namespace apps diff --git a/src/apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp b/src/apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp index daa9cf46c..483926287 100644 --- a/src/apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp +++ b/src/apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp @@ -20,7 +20,9 @@ namespace apps void LTIMES_NOVIEW::runKokkosVariant(VariantID vid) { - // FIXME + // Nota bene: we put a return statement for build and running purposes; + // A kernel without a Kokkos view is not informative for Kokkos + // performance return; const Index_type run_reps = getRunReps(); @@ -31,52 +33,13 @@ void LTIMES_NOVIEW::runKokkosVariant(VariantID vid) LTIMES_NOVIEW_BODY; }; - switch ( vid ) { - - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type z = 0; z < num_z; ++z ) { - for (Index_type g = 0; g < num_g; ++g ) { - for (Index_type m = 0; m < num_m; ++m ) { - for (Index_type d = 0; d < num_d; ++d ) { - LTIMES_NOVIEW_BODY; - } - } - } - } - - } - stopTimer(); - - break; - } - 
-#if defined(RUN_RAJA_SEQ) - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type z = 0; z < num_z; ++z ) { - for (Index_type g = 0; g < num_g; ++g ) { - for (Index_type m = 0; m < num_m; ++m ) { - for (Index_type d = 0; d < num_d; ++d ) { - ltimesnoview_lam(d, z, g, m); - } - } - } - } +#if defined (RUN_KOKKOS) - } - stopTimer(); - - break; - } + switch ( vid ) { - case RAJA_Seq : { +/* + Future Kokkos Translation here: + case Kokkos_Lambda : { using EXEC_POL = RAJA::KernelPolicy< @@ -105,15 +68,21 @@ void LTIMES_NOVIEW::runKokkosVariant(VariantID vid) stopTimer(); break; + } -#endif // RUN_RAJA_SEQ +*/ default : { std::cout << "\n LTIMES_NOVIEW : Unknown variant id = " << vid << std::endl; } } +#endif // RUN_KOKKOS + +// Move data from Kokkos View on device back to the host + + } } // end namespace apps diff --git a/src/apps-kokkos/PRESSURE-Kokkos.cpp b/src/apps-kokkos/PRESSURE-Kokkos.cpp index e181a8764..7e2fc32a4 100644 --- a/src/apps-kokkos/PRESSURE-Kokkos.cpp +++ b/src/apps-kokkos/PRESSURE-Kokkos.cpp @@ -50,80 +50,19 @@ void PRESSURE::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - PRESSURE_BODY1; - } - - for (Index_type i = ibegin; i < iend; ++i ) { - PRESSURE_BODY2; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - pressure_lam1(i); - } - - for (Index_type i = ibegin; i < iend; ++i ) { - pressure_lam2(i); - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::region( [=]() { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), pressure_lam1); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), pressure_lam2); - - 
}); // end sequential region (for single-source code) - - } - stopTimer(); - - break; - } - */ - case Kokkos_Lambda : { Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - // CRT : Look at Kokkos graphs as an implementation for kernel + // Christian Trott : Look at Kokkos graphs as an implementation for kernel // seq_region - create a sequential region // Intent: two loop bodies will be executed consecutively // https://raja.readthedocs.io/en/v0.9.0/feature/policies.html?highlight=seq_region#parallel-region-policies // The sequential region specialization is essentially a pass through operation. // It is provided so that if you want to turn off OpenMP in your code, - // you can simply replace the region policy type and you do not have to change your algorithm source code. - + // you can simply replace the region policy type, and you do not have to change your algorithm source code. Kokkos::parallel_for("PRESSURE_BODY1 - Kokkos_Lambda", Kokkos::RangePolicy(ibegin,iend), diff --git a/src/apps-kokkos/VOL3D-Kokkos.cpp b/src/apps-kokkos/VOL3D-Kokkos.cpp index 70f08819b..bb510bba8 100644 --- a/src/apps-kokkos/VOL3D-Kokkos.cpp +++ b/src/apps-kokkos/VOL3D-Kokkos.cpp @@ -23,10 +23,10 @@ namespace apps struct arrayOffSetStruct3D { using ViewType = Kokkos::View; - // v's are offsets of indices + // The different v's are offsets of indices in different Kokkos views ViewType v, v0, v1, v2, v3, v4, v5, v6, v7; - // constructor + // Constructor arrayOffSetStruct3D(const std::string& name, Index_type num_elements, Index_type jp, @@ -61,7 +61,7 @@ void VOL3D::runKokkosVariant(VariantID vid) NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; - // not sure about the 'ibegin, iend' here: + // The 'ibegin, iend' are unclear here: auto vol_view = getViewFromPointer(vol, m_domain->nnalls); arrayOffSetStruct3D x_offsets("x_offsets", m_domain->nnalls, m_domain->jp, 
m_domain->kp, x); @@ -106,50 +106,6 @@ void VOL3D::runKokkosVariant(VariantID vid) #if defined(RUN_KOKKOS) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin ; i < iend ; ++i ) { - VOL3D_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin ; i < iend ; ++i ) { - vol3d_lam(i); - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), vol3d_lam); - - } - stopTimer(); - - break; - } -*/ case Kokkos_Lambda : { startTimer(); diff --git a/src/basic-kokkos/CMakeLists.txt b/src/basic-kokkos/CMakeLists.txt index 6744c3662..c859747c2 100644 --- a/src/basic-kokkos/CMakeLists.txt +++ b/src/basic-kokkos/CMakeLists.txt @@ -26,13 +26,6 @@ blt_add_library( # Diagnostics message (STATUS "${RAJA_PERFSUITE_DEPENDS}") -#blt_print_target_properties(TARGET basic-kokkos) -#blt_print_target_properties(TARGET kokkos) - -#kokkoscore;kokkoscontainers;kokkosalgorithms -#blt_print_target_properties(TARGET kokkoscore) -#blt_print_target_properties(TARGET kokkoscontainers) -#blt_print_target_properties(TARGET kokkosalgorithms) blt_print_target_properties(TARGET RAJA) diff --git a/src/basic-kokkos/DAXPY-Kokkos.cpp b/src/basic-kokkos/DAXPY-Kokkos.cpp index b21e0ce16..8883d0da6 100644 --- a/src/basic-kokkos/DAXPY-Kokkos.cpp +++ b/src/basic-kokkos/DAXPY-Kokkos.cpp @@ -79,37 +79,18 @@ void DAXPY::runKokkosVariant(VariantID vid) break; } -/* case Kokkos_Functor: { - DaxpyFunctor daxpy_functor_instance(y,x,a); - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - Kokkos::parallel_for("DAXPY-Kokkos Kokkos_Functor", Kokkos::RangePolicy(ibegin, iend), - daxpy_functor_instance); - } - - stopTimer(); - - - - break; - } -*/ default : { 
std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; } } - // Moving all data (i.e., pointer, KokkosView-wrapped ponter) back to the host from the device + // Move data (i.e., pointer, KokkosView-wrapped ponter) back to the host from the device moveDataToHostFromKokkosView(x, x_view, iend); moveDataToHostFromKokkosView(y, y_view, iend); - - - - #endif // RUN_KOKKOS } diff --git a/src/basic-kokkos/IF_QUAD-Kokkos.cpp b/src/basic-kokkos/IF_QUAD-Kokkos.cpp index a1d25d3fe..39f2f6dc2 100644 --- a/src/basic-kokkos/IF_QUAD-Kokkos.cpp +++ b/src/basic-kokkos/IF_QUAD-Kokkos.cpp @@ -17,10 +17,6 @@ namespace rajaperf namespace basic { - - -// Kokkos-ify here - void IF_QUAD::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -44,14 +40,11 @@ void IF_QUAD::runKokkosVariant(VariantID vid) - #if defined(RUN_KOKKOS) switch ( vid ) { - - case Kokkos_Lambda : { Kokkos::fence(); @@ -59,10 +52,6 @@ void IF_QUAD::runKokkosVariant(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -/* RAJA::forall( - RAJA::RangeSegment(ibegin, iend), ifquad_lam); -*/ - // Translation Kokkos::parallel_for("IF_QUAD_Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA (Index_type i) { diff --git a/src/basic-kokkos/INIT3-Kokkos.cpp b/src/basic-kokkos/INIT3-Kokkos.cpp index 1ceb5174c..1ebaf83cc 100644 --- a/src/basic-kokkos/INIT3-Kokkos.cpp +++ b/src/basic-kokkos/INIT3-Kokkos.cpp @@ -48,54 +48,19 @@ void INIT3::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for(RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for(Index_type i = ibegin; i < iend; ++i) { - INIT3_BODY; - } - - } - stopTimer(); - - break; -} - - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i) { - init3_lam(i); - } - - - } - stopTimer(); - - break; -} - // Nota bene -- Conversion of Raja code begins here case Kokkos_Lambda 
: { Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - -// RAJA::forall( -// RAJA::RangeSegment(ibegin, iend), init3_lam); - // Kokkos translation making INIT3_BODY explicit + // Kokkos translation of INIT3_BODY Kokkos::parallel_for("INIT3-Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(Index_type i) { //INIT3_BODY definition: - // out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ; + // out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ; out1_view[i] = out2_view[i] = out3_view[i] = - in1_view[i] - in2_view[i]; }); } @@ -119,7 +84,6 @@ void INIT3::runKokkosVariant(VariantID vid) moveDataToHostFromKokkosView(in1, in1_view, iend); moveDataToHostFromKokkosView(in2, in2_view, iend); - } } // end namespace basic diff --git a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp index 7afb010ee..95702570e 100644 --- a/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D-Kokkos.cpp @@ -29,66 +29,16 @@ void INIT_VIEW1D::runKokkosVariant(VariantID vid) // Declare a Kokkos View that will be used to wrap a pointer auto a_view = getViewFromPointer(a, iend); - - - #if defined(RUN_KOKKOS) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - INIT_VIEW1D_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - auto initview1d_base_lam = [=](Index_type i) { - INIT_VIEW1D_BODY; - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - initview1d_base_lam(i); - } - - } - stopTimer(); - - break; - } - - // AJP began modificaiton here case Kokkos_Lambda : { - //INIT_VIEW1D_VIEW_RAJA; - - /* auto initview1d_lam = [=](Index_type i) { - INIT_VIEW1D_BODY_RAJA; - - }; -*/ - // fence needed to ensure upstream operations are complete before timer - // start Kokkos::fence(); 
startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -// RAJA::forall( -// RAJA::RangeSegment(ibegin, iend), initview1d_lam); - //Kokkos translation Kokkos::parallel_for("INIT_VIEW1D_Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin,iend), KOKKOS_LAMBDA (Index_type i) { @@ -118,7 +68,6 @@ void INIT_VIEW1D::runKokkosVariant(VariantID vid) moveDataToHostFromKokkosView(a, a_view, iend); - } } // end namespace basic diff --git a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp index d47ed1462..bc2d9d955 100644 --- a/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp +++ b/src/basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp @@ -18,7 +18,6 @@ namespace basic { - void INIT_VIEW1D_OFFSET::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -32,62 +31,14 @@ void INIT_VIEW1D_OFFSET::runKokkosVariant(VariantID vid) #if defined(RUN_KOKKOS) - switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - INIT_VIEW1D_OFFSET_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - auto initview1doffset_base_lam = [=](Index_type i) { - INIT_VIEW1D_OFFSET_BODY; - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - initview1doffset_base_lam(i); - } - - } - stopTimer(); - - break; - } - - // Conversion of Raja code to Kokkos starts here - // case Kokkos_Lambda : { - //INIT_VIEW1D_OFFSET_VIEW_RAJA; - - /*auto initview1doffset_lam = [=](Index_type i) { - INIT_VIEW1D_OFFSET_BODY_RAJA; - }; - -*/ - - // Set a fence Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -// RAJA::forall( -// RAJA::RangeSegment(ibegin, iend), initview1doffset_lam); Kokkos::parallel_for("INIT_VIEW1D_OFFSET_Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA (Index_type i) { @@ 
-114,11 +65,9 @@ void INIT_VIEW1D_OFFSET::runKokkosVariant(VariantID vid) #endif // RUN_KOKKOS - // Move data from Kokkos View back to Host + // Move data from Kokkos View (on Device) back to Host moveDataToHostFromKokkosView(a, a_view, iend); - - } } // end namespace basic diff --git a/src/basic-kokkos/MULADDSUB-Kokkos.cpp b/src/basic-kokkos/MULADDSUB-Kokkos.cpp index edb54ae3b..0caad2748 100644 --- a/src/basic-kokkos/MULADDSUB-Kokkos.cpp +++ b/src/basic-kokkos/MULADDSUB-Kokkos.cpp @@ -34,9 +34,6 @@ void MULADDSUB::runKokkosVariant(VariantID vid) auto in1_view = getViewFromPointer(in1, iend); auto in2_view = getViewFromPointer(in2, iend); - - - auto mas_lam = [=](Index_type i) { MULADDSUB_BODY; }; @@ -46,48 +43,14 @@ void MULADDSUB::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - MULADDSUB_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - mas_lam(i); - } - - } - stopTimer(); - - break; - } case Kokkos_Lambda : { - // Set fence to ensure upstream calculations have completed Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -// RAJA::forall( -// RAJA::RangeSegment(ibegin, iend), mas_lam); -// -// Kokkos translation -// If SIMD really matters , consider using Kokkos SIMD + // If SIMD really matters , consider using Kokkos SIMD Kokkos::parallel_for("MULTISUB-KokkosSeq Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(Index_type i) { @@ -121,8 +84,6 @@ void MULADDSUB::runKokkosVariant(VariantID vid) moveDataToHostFromKokkosView(in1, in1_view, iend); moveDataToHostFromKokkosView(in2, in2_view, iend); - - } } // end namespace basic diff --git a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp 
index 4dee1a560..6f58e34c4 100644 --- a/src/basic-kokkos/NESTED_INIT-Kokkos.cpp +++ b/src/basic-kokkos/NESTED_INIT-Kokkos.cpp @@ -14,12 +14,20 @@ namespace rajaperf { namespace basic { -//////////////////////////////////////////////////////////// + + void NESTED_INIT::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); NESTED_INIT_DATA_SETUP; + // Wrap the nested init array pointer in a Kokkos View + // In a Kokkos View, array arguments for array boundaries go from outmost + // to innermost dimension sizes + // See the basic NESTED_INIT.hpp file for defnition of NESTED_INIT + + auto array_kokkos_view = getViewFromPointer(array, nk, nj, ni); + auto nestedinit_lam = [=](Index_type i, Index_type j, Index_type k) { NESTED_INIT_BODY; }; @@ -28,87 +36,31 @@ void NESTED_INIT::runKokkosVariant(VariantID vid) { switch (vid) { - case Base_Seq: { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type k = 0; k < nk; ++k) { - for (Index_type j = 0; j < nj; ++j) { - for (Index_type i = 0; i < ni; ++i) { - NESTED_INIT_BODY; - } - } - } - } - stopTimer(); - - break; - } - - case Lambda_Seq: { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type k = 0; k < nk; ++k) { - for (Index_type j = 0; j < nj; ++j) { - for (Index_type i = 0; i < ni; ++i) { - nestedinit_lam(i, j, k); - } - } - } - } - stopTimer(); - - break; - } - - // Kokkos_Lambda variant - case Kokkos_Lambda: { - // Wrap the nested init array pointer in a Kokkos View - // In a Kokkos View, array arguments for array boundaries go from outmost - // to innermost dimension sizes - // See the basic NESTED_INIT.hpp file for defnition of NESTED_INIT - - auto array_kokkos_view = getViewFromPointer(array, nk, nj, ni); - Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { // MDRange can be optimized - - Kokkos::parallel_for( - "NESTED_INIT KokkosSeq", - // Range policy - Kokkos::MDRangePolicy, 
- // Execution space - Kokkos::DefaultExecutionSpace>({0, 0, 0}, - {nk, nj, ni}), - // Loop body - KOKKOS_LAMBDA(Index_type k, Index_type j, Index_type i) { - // NESTED_INIT_BODY no longer useful, because we're not - // operating on the array, but on the Kokkos::View - // array_kokkos_view created to hold value for - // getViewFromPointer(array, nk, nj, ni) - // MD Views are index'ed via "()" - // - // KOKKOS-FIED translation of NESTED_INIT_BODY: - // #define NESTED_INIT_BODY - // array[i+ni*(j+nj*k)] = 0.00000001 * i * j * k ; - // - array_kokkos_view(k, j, i) = 0.00000001 * i * j * k; - }); + Kokkos::parallel_for("NESTED_INIT KokkosSeq", + // Range policy to define amount of work to be done + Kokkos::MDRangePolicy, + // Execution space + Kokkos::DefaultExecutionSpace>({0, 0, 0}, {nk, nj, ni}), + // Loop body + KOKKOS_LAMBDA(Index_type k, Index_type j, Index_type i) { + // #define NESTED_INIT_BODY + // array[i+ni*(j+nj*k)] = 0.00000001 * i * j * k ; + array_kokkos_view(k, j, i) = 0.00000001 * i * j * k; + }); } Kokkos::fence(); stopTimer(); - // "Moves" mirror data from GPU to CPU (void, i.e., no retrun type). In + // Moves mirror data from GPU to CPU (void, i.e., no return type). In // this moving of data back to Host, the layout is changed back to Layout // Right, vs. 
the LayoutLeft of the GPU moveDataToHostFromKokkosView(array, array_kokkos_view, nk, nj, ni); diff --git a/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp b/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp index 8bb5b4bad..51a819951 100644 --- a/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp +++ b/src/basic-kokkos/PI_ATOMIC-Kokkos.cpp @@ -29,76 +29,21 @@ void PI_ATOMIC::runKokkosVariant(VariantID vid) { switch (vid) { - case Base_Seq: { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - *pi = m_pi_init; - for (Index_type i = ibegin; i < iend; ++i) { - double x = (double(i) + 0.5) * dx; - *pi += dx / (1.0 + x * x); - } - *pi *= 4.0; - } - stopTimer(); - - break; - } - - case Lambda_Seq: { - - auto atomicpi_base_lam = [=](Index_type i) { - double x = (double(i) + 0.5) * dx; - *pi += dx / (1.0 + x * x); - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - *pi = m_pi_init; - for (Index_type i = ibegin; i < iend; ++i) { - atomicpi_base_lam(i); - } - *pi *= 4.0; - } - stopTimer(); - - break; - } - case Kokkos_Lambda: { - // Ensure all upstream calculations have been completed before starting - // the timer + Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - // Here, making a pointer of pi defined in PI_ATOMIC.hpp; we will use a - // KokkosView instead - // *pi = m_pi_init; - // RAJA::forall( RAJA::RangeSegment(ibegin, iend), - // [=](Index_type i) { - // double x = (double(i) + 0.5) * dx; - // RAJA::atomicAdd(pi, dx / (1.0 + x * x)); - // }); - // // Initializing a value, pi, on the host *pi = m_pi_init; - // This is an assignment statement! Not a declaration. - // David made this assignment because of the structure of the - // computation. 
- // We're moving the data in the pointer to the device (GPU) - // IT IS IMPORTANT TO REALISE WHEN YOUR VARIABLE / DATA ARE BEING - // REINITIALIZED + pi_view = getViewFromPointer(pi, 1); Kokkos::parallel_for( "PI_ATOMIC-Kokkos Kokkos_Lambda", Kokkos::RangePolicy(ibegin, iend), KOKKOS_LAMBDA(Index_type i) { - // Original PI_ATOMIC kernel reference implementation - // defined in PI_ATOMIC.hpp double x = (double(i) + 0.5) * dx; // Make a reference to the 0th element of a 1D view with one // element @@ -112,9 +57,6 @@ void PI_ATOMIC::runKokkosVariant(VariantID vid) { // pointer, pi. moveDataToHostFromKokkosView(pi, pi_view, 1); *pi *= 4.0; - //*m_pi += *pi; - //*pi *= 4.0; - // pi_view *= 4.0; } Kokkos::fence(); @@ -129,7 +71,6 @@ void PI_ATOMIC::runKokkosVariant(VariantID vid) { } #endif // RUN_KOKKOS - // moveDataToHostFromKokkosView(pi, pi_view, 1); } } // end namespace basic diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 2d0514ef2..b6fa1629a 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -69,9 +69,6 @@ class Executor const RunParams& getRunParams(); - - - private: Executor() = delete; diff --git a/src/common/PerfsuiteKernelDefinitions.cpp b/src/common/PerfsuiteKernelDefinitions.cpp index d958415b7..2180294ed 100644 --- a/src/common/PerfsuiteKernelDefinitions.cpp +++ b/src/common/PerfsuiteKernelDefinitions.cpp @@ -126,6 +126,8 @@ void make_perfsuite_executor(rajaperf::Executor *exec, int argc, char *argv[]) { free_register_kernel(exec, "Lcals", new lcals::PLANCKIAN(run_params)); free_register_kernel(exec, "Lcals", new lcals::TRIDIAG_ELIM(run_params)); /* + // Uncomment these lines once Kokkos translations for the polybench kernel + // group have been made // Polybench free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_2MM(run_params)); free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_3MM(run_params)); diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 
d1dd39dca..deaf07f7e 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -90,7 +90,7 @@ namespace rajaperf { std::string("Lcals_TRIDIAG_ELIM"), // //// Polybench kernels... -//// +//// Uncomment once Kokkos variants have been created // std::string("Polybench_2MM"), // std::string("Polybench_3MM"), // std::string("Polybench_ADI"), diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index a654ba6e9..1ab4d83f1 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -6,158 +6,94 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// Tyoes and methods for managing Suite kernels, variants, features, etc.. -/// +/// Declare types, methods and namespaces to enable RAJAPerf Suite to handle Kokkos kernels, variants, features, etc. #ifndef RAJAPerfSuite_HPP #define RAJAPerfSuite_HPP -//#include "common/RPTypes.hpp" #ifndef RAJAPERF_INFRASTRUCTURE_ONLY #include "RAJA/config.hpp" -//#include "common/PerfsuiteKernelDefinitions.hpp" - - - #if defined(RUN_KOKKOS) #include "Kokkos_Core.hpp" -#endif -#endif +#endif // RUN_KOKKOS +#endif // RAJAPERF_INFRASTRUCTURE_ONLY #include namespace rajaperf { class RunParams; -class Executor; // forward declaration +class Executor; class KernelBase; const RunParams& getRunParams(Executor*); -void free_register_group(Executor*, std::string); // forward declaration -void free_register_kernel(Executor*, std::string, KernelBase*); // forward declaration +void free_register_group(Executor*, std::string); +void free_register_kernel(Executor*, std::string, KernelBase*); void make_perfsuite_executor(Executor* exec, int argc, char* argv[]); + #if defined(RUN_KOKKOS) -// Kokkos Design Spirit: -// WE NEED: -// 1) Use KokkosViews --> a wrapper around pointers for host and device memory -// management -// 2) Use default execution space -// -// -// NEW FUNCTION WILL: -// 1) Take in a raw pointer (e.g., 
float*, int*, etc.) -// 2) From this pointer, return a Kokkos::View -// -// Return type : Kokkos::View -// Kokkos::View takes tempalted arguments -// To write "generically" implies templated arguments -// https://eli.thegreenplace.net/2014/variadic-templates-in-c/ -// template - -// This is a TEMPLATED STRUCT. This struct will contain the type of a pointer -// of n dimensions This struct is templated on the template that immediately precedes the struct declaration. struct PointerOfNdimensions; -// This template block declares a specialization, which means that you say the -// template arguments that you're NOT specializing template - -// Here, we are specialising a template according to the type of argument that -// is passed. In this example, we've specialized the PointedAt template -// argument for the case that the number of dimensions is 0. All we will do in -// this struct is to define a type. - -// This struct is a specialization of : -// template struct PointerOfNdimensions { - // "using" is a type alias - // if you derefernce a pointer, you're just left with an object, the value - // of that pointer using type = PointedAt; }; -// NO SPECIALIZATION, i.e., we fix no templated arguments template - struct PointerOfNdimensions { - // PointerOfNdimensions is a type - // My type is a pointer to the type of myself, decremented using type = typename PointerOfNdimensions::type *; }; +// This templated function is used to wrap pointers (declared and defined in RAJAPerf Suite kernels) in Kokkos Views template - -// FUNCTION THAT GETS A VIEW FROM A POINTER WITH RETURN TYPE KOKKOS::VIEW -// - - auto getViewFromPointer(PointedAt *kokkos_ptr, Boundaries... 
boundaries) - // Recall: PointerOfNdimensions is struct that exists solely to hold a - // type - // -> connotes "return type after the arrow" -> typename Kokkos::View< - typename PointerOfNdimensions::type, - // typename Kokkos::DefaultHostExecutionSpace::memory_space> - // This more generic expression allow moving the - // View-wrapped pointer b/w - // Host and GPU - typename Kokkos::DefaultExecutionSpace::memory_space> + typename PointerOfNdimensions::type, + typename Kokkos::DefaultExecutionSpace::memory_space> { - // This says construct the pointer_holder variable from arguments passed to - // the template block - // Declaration of a type alias, host_view_type using host_view_type = typename Kokkos::View< - // in the line below , you are using the type alias that is the memeber - // of a struct - typename PointerOfNdimensions::type, typename Kokkos::DefaultHostExecutionSpace::memory_space>; - // FYI - Device can be GPU, OpenMPTarget, HIP (for targeting an AMD GPU), SYCL - // (library in Intel Compiler) - // using device_view_type = typename Kokkos::View< typename PointerOfNdimensions::type, typename Kokkos::DefaultExecutionSpace::memory_space>; - // When copying data, we can either change the Layout or the memory_space + // Nota bene: When copying data, we can either change the Layout or the memory_space // (host or device), but we cannot change both! - // Here, we are mirroring data on the host to the device, i.e., Layout is - // as if on the device, but the data is actually on the host. The host - // mirror will be Layout Left (optimal for the device), but data are - // actually on the HOST! + // Here, we are mirroring data on the (CPU) host TO the (GPU) device, i.e., Layout is + // as if on the device, but the data actually reside on the host. The host + // mirror will be Layout Left (optimal for the device, but not the host). 
- // Here, "using" is type alias; in this example,its our gpu Layout on cpu using mirror_view_type = typename device_view_type::HostMirror; - // Assignment statement; we are constructing a host_view_type with the name - // pointer_holder. The value of kokkos_ptr is the pointer we're wrapping on - // the Host, and the Boundaries parameter pack values, boundaries, will also + // Assignment statement: we are constructing a host_view_type called + // pointer_holder. The value of kokkos_ptr is the Kokkos View-wrapped pointer + // on the Host (CPU), and the Boundaries parameter pack values, boundaries (i.e., array boundaries) will also // be part of this this host_view_type object. host_view_type pointer_holder(kokkos_ptr, boundaries...); - // boundaries will contain the array dimenions; an allocation is implicitly - // made here + // The boundaries parameter pack contains the array dimenions; + // an allocation is implicitly made here device_view_type device_data_copy("StringName", boundaries...); mirror_view_type cpu_to_gpu_mirror = Kokkos::create_mirror_view(device_data_copy); - // We need to deep_copy our existing data, the contents of + // deep_copy our existing data, the contents of // pointer_holder, into the mirror_view; - // Copying from Host to Device has two steps: 1) Change the layout, 2) - // change the memory_space (host or device). Step 1 is to change the - // layout to enable sending data from CPU to GPU. Step 2 is actually - // sending the optimal data layout to the GPU - - // This step changes the Layout to be optimal for the gpu + // Copying from Host to Device has two steps: + // 1) Change the layout to enable sending data from CPU to GPU + // 2) Change the memory_space (host or device) to send the optimal data + // layout to the GPU. + + // This step changes the array layout to be optimal for the gpu, i.e., + // LayoutLeft. Kokkos::deep_copy(cpu_to_gpu_mirror, pointer_holder); // The mirror view data layout on the HOST is like the layout for the GPU. 
@@ -172,77 +108,41 @@ auto getViewFromPointer(PointedAt *kokkos_ptr, Boundaries... boundaries) return device_data_copy; } -/////////////////////////////////////////////////////////////////////////////// -// THIS FUNCTION WILL MOVE DATA IN A KOKKOS::VIEW BACK TO HOST FROM DEVICE, AND -// STORE IN AN EXISTING POINTER -/////////////////////////////////////////////////////////////////////////////// - +// This function will move data in a Kokkos::View back to host from device, +// and will store in the existing pointer(s) template - -// DEFINING FUNCTION THAT GETS A VIEW FROM A POINTER WITH RETURN TYPE -// KOKKOS::VIEW -//"my_view" parameter is equivalent to device_data_copy -// void moveDataToHostFromKokkosView(PointedAt *kokkos_ptr, ExistingView my_view, Boundaries... boundaries) - { - // This says construct the pointer_holder variable from arguments passed to - // the template block - // + using host_view_type = typename Kokkos::View< typename PointerOfNdimensions::type, typename Kokkos::DefaultHostExecutionSpace::memory_space>; - // FYI - Device can be GPU, OpenMPTarget, HIP (for targeting an AMD GPU), SYCL - // (library in Intel Compiler) - // using device_view_type = typename Kokkos::View< typename PointerOfNdimensions::type, typename Kokkos::DefaultExecutionSpace::memory_space>; - // When copying data, we can either change the Layout or the memory_space - // (host or device), but we cannot change both! - // Here, we are mirroring data on the host to the device, i.e., Layout is - // as if on the device, but the data is actually on the host. The host - // mirror will be Layout Left (optimal for the device), but data are - // actually on the HOST! - - // Here, "using" is type alias; in this example,its our gpu Layout on cpu using mirror_view_type = typename device_view_type::HostMirror; - // Assignment statement; we are constructing a host_view_type with the name - // pointer_holder. 
The value of kokkos_ptr is the pointer we're wrapping on + // Constructing a host_view_type with the name + // pointer_holder. The contents/value of kokkos_ptr is the pointer we're wrapping on // the Host, and the Boundaries parameter pack values, boundaries, will also // be part of this this host_view_type object. host_view_type pointer_holder(kokkos_ptr, boundaries...); - // Layout is optimal for gpu, but located on CPU + // Layout is optimal for gpu, but data are actually located on CPU mirror_view_type cpu_to_gpu_mirror = Kokkos::create_mirror_view(my_view); - //auto mirror_view_type cpu_to_gpu_mirror = Kokkos::create_mirror_view(my_view); - // We need to deep_copy our existing data, the contents of - // pointer_holder, into the mirror_view; - // Copying from Host to Device has two steps: 1) Change the layout, 2) - // change the memory_space (host or device). Step 1 is to change the - // layout to enable sending data from CPU to GPU. Step 2 is actually - // sending the optimal data layout to the GPU - - // This step changes the Layout to be optimal for the gpu - - // The mirror view data layout on the HOST is like the layout for the GPU. - // GPU-optimized layouts are LayoutLeft, i.e., column-major This deep_copy - // copy GPU-layout data on the HOST to the Device - - // Actual copying of the data from the gpu to the cpu + // Actual copying of the data from the gpu (my_view) back to the cpu Kokkos::deep_copy(cpu_to_gpu_mirror, my_view); - // This copies from the mirror on the cpu + // This copies from the mirror on the host cpu back to the existing + // pointer(s) Kokkos::deep_copy(pointer_holder, cpu_to_gpu_mirror); } - #endif // RUN_KOKKOS class KernelBase; @@ -323,6 +223,7 @@ enum KernelID { // // Polybench kernels... 
+// These will be uncommented once Kokkos translations for these kernels exist // // Polybench_2MM, // Polybench_3MM, diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index f0e784344..c12b069c1 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -6,16 +6,17 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// Basic data types used in the Suite. +/// Basic data types used in RAJAPerf Suite. /// #ifndef RAJAPerf_RPTypes_HPP #define RAJAPerf_RPTypes_HPP +// This macro, RAJAPERF_INFRASTRUCTURE_ONLY, is for Kokkos and Kokkos Kernels +// -based performance testing #ifndef RAJAPERF_INFRASTRUCTURE_ONLY #include "RAJA/util/types.hpp" #endif -// + // Only one of the following (double or float) should be defined. // #define RP_USE_DOUBLE @@ -115,8 +116,6 @@ using Complex_ptr = Complex_type*; #endif - - } // closing brace for rajaperf namespace #endif // closing endif for header file include guard diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index 335e017e4..fe41ffdc9 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -590,13 +590,7 @@ void RunParams::printKernelNames(std::ostream& str) const { str << "\nAvailable kernels:"; str << "\n------------------\n"; -// TODO DZP reimplement -// for (int kid = 0; kid < NumKernels; ++kid) { -///// RDH DISABLE COUPLE KERNEL -// if (static_cast(kid) != Apps_COUPLE) { -// str << getKernelName(static_cast(kid)) << std::endl; -// } -// } + str.flush(); } @@ -605,13 +599,7 @@ void RunParams::printFullKernelNames(std::ostream& str) const { str << "\nAvailable kernels (_):"; str << "\n-----------------------------------------\n"; -// TODO DZP: reimplement -// for (int kid = 0; kid < NumKernels; ++kid) { -///// RDH DISABLE COUPLE KERNEL -// if (static_cast(kid) != Apps_COUPLE) { -// str << getFullKernelName(static_cast(kid)) << std::endl; -// } -// } + str.flush(); } @@ -654,24 +642,13 @@ void 
RunParams::printFeatureKernels(std::ostream& str) const for (int fid = 0; fid < NumFeatures; ++fid) { FeatureID tfid = static_cast(fid); str << getFeatureName(tfid) << std::endl; -// TODO DZP: reimplement -// for (int kid = 0; kid < NumKernels; ++kid) { -// KernelID tkid = static_cast(kid); -///// RDH DISABLE COUPLE KERNEL -// if (tkid != Apps_COUPLE) { -// KernelBase* kern = getKernelObject(tkid, *this); -// if ( kern->usesFeature(tfid) ) { -// str << "\t" << getFullKernelName(tkid) << std::endl; -// } -// delete kern; -// } -// } // loop over kernels + str << std::endl; } // loop over features str.flush(); } -// AJP, DZP: Commenting function body, because we have not yet integrated -// with Kokkos testing infrastructure +// TODO for Kokkos Team: Commenting function body, because this infrastructure +// has not yet been integrated with Kokkos testing infrastructure void RunParams::printKernelFeatures(std::ostream& str) const { /* diff --git a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp index 66dcfb0a4..76753574e 100644 --- a/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp +++ b/src/lcals-kokkos/DIFF_PREDICT-Kokkos.cpp @@ -17,7 +17,6 @@ namespace rajaperf namespace lcals { - template void diff_predict_helper(Index_type run_reps, Index_type ibegin, @@ -60,8 +59,6 @@ void DIFF_PREDICT::runKokkosVariant(VariantID vid) auto px_view = getViewFromPointer(px, iend*14); auto cx_view = getViewFromPointer(cx, iend*14); - // NOTA BENE: in DIFF_PREDICT.hpp, this constant: - // const Index_type offset = m_offset; auto diffpredict_lam = [=](Index_type i) { DIFF_PREDICT_BODY; @@ -71,60 +68,11 @@ void DIFF_PREDICT::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - DIFF_PREDICT_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < 
run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - diffpredict_lam(i); - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), diffpredict_lam); - - } - stopTimer(); - - break; - } -*/ - - - -// Kokkos-ifying here: -// case Kokkos_Lambda : { Kokkos::fence(); startTimer(); - diff_predict_helper( run_reps, + diff_predict_helper(run_reps, ibegin, iend, offset, diff --git a/src/lcals-kokkos/EOS-Kokkos.cpp b/src/lcals-kokkos/EOS-Kokkos.cpp index b0b1f7403..0bc5cd1bc 100644 --- a/src/lcals-kokkos/EOS-Kokkos.cpp +++ b/src/lcals-kokkos/EOS-Kokkos.cpp @@ -41,52 +41,6 @@ void EOS::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - EOS_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - eos_lam(i); - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), eos_lam); - - } - stopTimer(); - - break; - } - -*/ - case Kokkos_Lambda : { Kokkos::fence(); diff --git a/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp index a1714a382..fcd641f16 100644 --- a/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_DIFF-Kokkos.cpp @@ -17,7 +17,6 @@ namespace rajaperf namespace lcals { -// Kokkos-ification starts here: void FIRST_DIFF::runKokkosVariant(VariantID vid) { @@ -34,7 +33,7 @@ void FIRST_DIFF::runKokkosVariant(VariantID vid) Real_ptr y = m_y; */ -// lcals = livermore compiler analysis loops suite + // lcals = livermore compiler analysis loops suite // Instiating 
KokkosViews using getViewFromPointer; // Wrapping pointers in KokkosViews @@ -50,54 +49,6 @@ void FIRST_DIFF::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - FIRST_DIFF_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - firstdiff_lam(i); - } - - } - stopTimer(); - - break; - } - -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), firstdiff_lam); - - } - - stopTimer(); - - break; - } -*/ - - // Kokkos-ifying here: case Kokkos_Lambda : { Kokkos::fence(); @@ -133,7 +84,6 @@ void FIRST_DIFF::runKokkosVariant(VariantID vid) moveDataToHostFromKokkosView(x, x_view, iend + 1); moveDataToHostFromKokkosView(y, y_view, iend + 1); - } } // end namespace lcals diff --git a/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp index 6d9502638..04d1f18b5 100644 --- a/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_MIN-Kokkos.cpp @@ -30,84 +30,15 @@ void FIRST_MIN::runKokkosVariant(VariantID vid) // #define FIRST_MIN_DATA_SETUP \ // Real_ptr x = m_x; - auto x_view = getViewFromPointer(x, iend); #if defined(RUN_KOKKOS) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - FIRST_MIN_MINLOC_INIT; - - for (Index_type i = ibegin; i < iend; ++i ) { - FIRST_MIN_BODY; - } - - m_minloc = RAJA_MAX(m_minloc, mymin.loc); - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - auto firstmin_base_lam = [=](Index_type i) -> Real_type { - return x[i]; - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - FIRST_MIN_MINLOC_INIT; - - for (Index_type i = ibegin; i < 
iend; ++i ) { - if ( firstmin_base_lam(i) < mymin.val ) { \ - mymin.val = x[i]; \ - mymin.loc = i; \ - } - } - - m_minloc = RAJA_MAX(m_minloc, mymin.loc); - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceMinLoc loc( - m_xmin_init, m_initloc); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - FIRST_MIN_BODY_RAJA; - }); - - m_minloc = RAJA_MAX(m_minloc, loc.getLoc()); - - } - stopTimer(); - - break; - } -*/ - case Kokkos_Lambda : { // https://github.com/kokkos/kokkos/wiki/Kokkos::MinLoc -// A templated class: // MinLoc::value_type result; // parallel_reduce(N,Functor,MinLoc(result)); @@ -125,8 +56,8 @@ void FIRST_MIN::runKokkosVariant(VariantID vid) reducer_type::value_type min_result_obj; Kokkos::parallel_reduce("FIRST_MIN_Kokkos Kokkos_Lambda", - Kokkos::RangePolicy(ibegin, iend), - KOKKOS_LAMBDA(Index_type i, reducer_type::value_type& mymin) { + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i, reducer_type::value_type& mymin) { // #define FIRST_MIN_BODY // if ( x[i] < mymin.val ) { @@ -139,11 +70,11 @@ void FIRST_MIN::runKokkosVariant(VariantID vid) mymin.loc = i; } - // Kokkos knows how to handle a MinLoc type + // Kokkos can handle a MinLoc type }, reducer_type(min_result_obj)); - // Kokkos translation of line below is needed + // Kokkos translation of line below // m_minloc = RAJA_MAX(m_minloc, loc.getLoc()); m_minloc = min_result_obj.loc; diff --git a/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp index f4a5d4b77..77ebeefcb 100644 --- a/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp +++ b/src/lcals-kokkos/FIRST_SUM-Kokkos.cpp @@ -39,55 +39,6 @@ void FIRST_SUM::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - FIRST_SUM_BODY; - } - - } - stopTimer(); - 
- break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - firstsum_lam(i); - } - - } - stopTimer(); - - break; - } - - -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), firstsum_lam); - - } - stopTimer(); - - break; - } - - */ - - case Kokkos_Lambda : { Kokkos::fence(); @@ -122,7 +73,6 @@ void FIRST_SUM::runKokkosVariant(VariantID vid) moveDataToHostFromKokkosView(x, x_view, iend); moveDataToHostFromKokkosView(y, y_view, iend); - } } // end namespace lcals diff --git a/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp index 8120687c9..ed4806972 100644 --- a/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp +++ b/src/lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp @@ -33,7 +33,7 @@ void GEN_LIN_RECUR::runKokkosVariant(VariantID vid) auto sb_view = getViewFromPointer(sb, iend); auto stb5_view = getViewFromPointer(stb5, iend); -// RPS Lambdas +// RAJAPerf Suite Lambdas auto genlinrecur_lam1 = [=](Index_type k) { GEN_LIN_RECUR_BODY1; @@ -46,65 +46,6 @@ void GEN_LIN_RECUR::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type k = 0; k < N; ++k ) { - GEN_LIN_RECUR_BODY1; - } - - for (Index_type i = 1; i < N+1; ++i ) { - GEN_LIN_RECUR_BODY2; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type k = 0; k < N; ++k ) { - genlinrecur_lam1(k); - } - - for (Index_type i = 1; i < N+1; ++i ) { - genlinrecur_lam2(i); - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(0, N), genlinrecur_lam1); - - - - - RAJA::forall( - 
RAJA::RangeSegment(1, N+1), genlinrecur_lam2); - - } - stopTimer(); - - break; - } -*/ - case Kokkos_Lambda : { Kokkos::fence(); @@ -116,9 +57,8 @@ void GEN_LIN_RECUR::runKokkosVariant(VariantID vid) // Index_type kb5i = m_kb5i; // Index_type N = m_N; - Kokkos::parallel_for("GEN_LIN_RECUR_Kokkos Kokkos Lambda -- BODY1", - // RPS indices are (0, N) for BODY1 + // Here, RAJAPerf Suite (RPS) indices are (0, N) for BODY1 Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(Index_type k) { /* @@ -134,7 +74,7 @@ void GEN_LIN_RECUR::runKokkosVariant(VariantID vid) Kokkos::parallel_for("GEN_LIN_RECUR_Kokkos Kokkos Lambda -- BODY2", // ATTN: you must adjust indices to align with - // RPS design intent here + // RPS design intent here; // RPS indices are (1, N+1) for BODY2 Kokkos::RangePolicy(1, N+1), KOKKOS_LAMBDA(Index_type i) { @@ -172,7 +112,6 @@ void GEN_LIN_RECUR::runKokkosVariant(VariantID vid) moveDataToHostFromKokkosView(sb, sb_view, iend); moveDataToHostFromKokkosView(stb5, stb5_view, iend); - } } // end namespace lcals diff --git a/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp index b8188a995..27a8d4f12 100644 --- a/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp +++ b/src/lcals-kokkos/HYDRO_1D-Kokkos.cpp @@ -47,54 +47,6 @@ void HYDRO_1D::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - HYDRO_1D_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - hydro1d_lam(i); - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), hydro1d_lam); - - } - stopTimer(); - - break; - } - - */ - - - case Kokkos_Lambda : { Kokkos::fence(); @@ -127,7 +79,7 
@@ void HYDRO_1D::runKokkosVariant(VariantID vid) #endif // RUN_KOKKOS - // ATTN: Adjust arr dimensions to be congruent with the setup + // ATTN: Adjust array dimensions to be congruent with the setup // in the .cpp file: // m_array_length = getActualProblemSize() + 12; diff --git a/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp b/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp index 82cdf60ef..7f7aada10 100644 --- a/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp +++ b/src/lcals-kokkos/HYDRO_2D-Kokkos.cpp @@ -47,7 +47,7 @@ void HYDRO_2D::runKokkosVariant(VariantID vid) \ */ -// ATTN: THESE ARE 2D Views: +// ATTN: THESE INPUTS ARE 2D Views: // auto zadat_view = getViewFromPointer(zadat, kn, jn ); auto zbdat_view = getViewFromPointer(zbdat, kn, jn ); @@ -64,129 +64,10 @@ void HYDRO_2D::runKokkosVariant(VariantID vid) auto zroutdat_view = getViewFromPointer(zroutdat, kn, jn ); auto zzoutdat_view = getViewFromPointer(zzoutdat, kn, jn ); -// Pre-processor directives -// #if defined(RUN_KOKKOS) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type k = kbeg; k < kend; ++k ) { - for (Index_type j = jbeg; j < jend; ++j ) { - HYDRO_2D_BODY1; - } - } - - for (Index_type k = kbeg; k < kend; ++k ) { - for (Index_type j = jbeg; j < jend; ++j ) { - HYDRO_2D_BODY2; - } - } - - for (Index_type k = kbeg; k < kend; ++k ) { - for (Index_type j = jbeg; j < jend; ++j ) { - HYDRO_2D_BODY3; - } - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - auto hydro2d_base_lam1 = [=] (Index_type k, Index_type j) { - HYDRO_2D_BODY1; - }; - auto hydro2d_base_lam2 = [=] (Index_type k, Index_type j) { - HYDRO_2D_BODY2; - }; - auto hydro2d_base_lam3 = [=] (Index_type k, Index_type j) { - HYDRO_2D_BODY3; - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type k = kbeg; k < kend; ++k ) { - for (Index_type j = jbeg; j < jend; ++j ) { - hydro2d_base_lam1(k, j); - } - } - - for (Index_type k = kbeg; k 
< kend; ++k ) { - for (Index_type j = jbeg; j < jend; ++j ) { - hydro2d_base_lam2(k, j); - } - } - - for (Index_type k = kbeg; k < kend; ++k ) { - for (Index_type j = jbeg; j < jend; ++j ) { - hydro2d_base_lam3(k, j); - } - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - HYDRO_2D_VIEWS_RAJA; - - auto hydro2d_lam1 = [=] (Index_type k, Index_type j) { - HYDRO_2D_BODY1_RAJA; - }; - auto hydro2d_lam2 = [=] (Index_type k, Index_type j) { - HYDRO_2D_BODY2_RAJA; - }; - auto hydro2d_lam3 = [=] (Index_type k, Index_type j) { - HYDRO_2D_BODY3_RAJA; - }; - - using EXECPOL = - RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::loop_exec, // k - RAJA::statement::For<1, RAJA::loop_exec, // j - RAJA::statement::Lambda<0> - > - > - >; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::kernel( - RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), - RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam1); - - RAJA::kernel( - RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), - RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam2); - - RAJA::kernel( - RAJA::make_tuple( RAJA::RangeSegment(kbeg, kend), - RAJA::RangeSegment(jbeg, jend)), - hydro2d_lam3); - - } - stopTimer(); - - break; - } -*/ - - case Kokkos_Lambda : { Kokkos::fence(); @@ -292,11 +173,9 @@ void HYDRO_2D::runKokkosVariant(VariantID vid) Real_ptr zzoutdat = m_zzout; \ \ - */ - - // There are 9 inputs: + // There are 9 input views: moveDataToHostFromKokkosView(zadat, zadat_view, kn, jn); moveDataToHostFromKokkosView(zbdat, zbdat_view, kn, jn); moveDataToHostFromKokkosView(zmdat, zmdat_view, kn, jn); @@ -307,7 +186,7 @@ void HYDRO_2D::runKokkosVariant(VariantID vid) moveDataToHostFromKokkosView(zvdat, zvdat_view, kn, jn); moveDataToHostFromKokkosView(zzdat, zzdat_view, kn, jn); - // There are 2 output views + // There are 2 output views: moveDataToHostFromKokkosView(zroutdat, zroutdat_view, kn, jn); moveDataToHostFromKokkosView(zzoutdat, zzoutdat_view, kn, jn); diff --git 
a/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp index 04c49ff5f..e21d0f1ad 100644 --- a/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp +++ b/src/lcals-kokkos/INT_PREDICT-Kokkos.cpp @@ -41,7 +41,7 @@ void INT_PREDICT::runKokkosVariant(VariantID vid) */ - // Wrap pointer in Kokkos View + // Wrap pointer in Kokkos View, and adjust indices auto px_view = getViewFromPointer(px, iend*13); @@ -53,51 +53,6 @@ void INT_PREDICT::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - INT_PREDICT_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - intpredict_lam(i); - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), intpredict_lam); - - } - stopTimer(); - - break; - } -*/ - case Kokkos_Lambda : { Kokkos::fence(); diff --git a/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp index 0c74ad017..e0081b3b8 100644 --- a/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp +++ b/src/lcals-kokkos/PLANCKIAN-Kokkos.cpp @@ -50,52 +50,6 @@ void PLANCKIAN::runKokkosVariant(VariantID vid) # if defined (RUN_KOKKOS) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - PLANCKIAN_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - planckian_lam(i); - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; 
++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), planckian_lam); - - } - stopTimer(); - - break; - } -*/ - - case Kokkos_Lambda : { Kokkos::fence(); diff --git a/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp b/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp index 92b6b46f4..fffcf40fb 100644 --- a/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp +++ b/src/lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp @@ -49,51 +49,6 @@ void TRIDIAG_ELIM::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - TRIDIAG_ELIM_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - tridiag_elim_lam(i); - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), tridiag_elim_lam); - - } - stopTimer(); - - break; - } -*/ - case Kokkos_Lambda : { Kokkos::fence(); diff --git a/src/stream-kokkos/ADD-Kokkos.cpp b/src/stream-kokkos/ADD-Kokkos.cpp index c89f990f1..58aa1ad16 100644 --- a/src/stream-kokkos/ADD-Kokkos.cpp +++ b/src/stream-kokkos/ADD-Kokkos.cpp @@ -17,8 +17,6 @@ namespace rajaperf namespace stream { -// Start Kokkos-ifying here: -// Nota bene: the original RAJAPerf Suite code left for reference void ADD::runKokkosVariant(VariantID vid) { @@ -45,60 +43,8 @@ namespace stream switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - ADD_BODY; - } - - } - stopTimer(); - - break; - } - - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - add_lam(i); - } - - } - stopTimer(); - - break; - } - 
-/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), add_lam); - - } - stopTimer(); - - break; - } -*/ - -////////////////////////////////////////////////////////////////////////////// -// Kokkos -fying here: -// - case Kokkos_Lambda : { - // open Kokkos fence Kokkos::fence(); startTimer(); @@ -112,7 +58,7 @@ namespace stream }); } - // close Kokkos fence + Kokkos::fence(); stopTimer(); @@ -133,8 +79,6 @@ namespace stream moveDataToHostFromKokkosView(b, b_view, iend); moveDataToHostFromKokkosView(c, c_view, iend); - - } } // end namespace stream diff --git a/src/stream-kokkos/COPY-Kokkos.cpp b/src/stream-kokkos/COPY-Kokkos.cpp index b896ea4e2..59dd499f6 100644 --- a/src/stream-kokkos/COPY-Kokkos.cpp +++ b/src/stream-kokkos/COPY-Kokkos.cpp @@ -17,14 +17,6 @@ namespace rajaperf namespace stream { -/* -void COPY::runSeqVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); -*/ - void COPY::runKokkosVariant(VariantID vid) { @@ -47,54 +39,7 @@ void COPY::runSeqVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - COPY_BODY; - } - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - copy_lam(i); - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), copy_lam); - - } - stopTimer(); - - break; - } - - */ - case Kokkos_Lambda : { - Kokkos::fence(); startTimer(); @@ -103,8 +48,8 @@ void COPY::runSeqVariant(VariantID vid) Kokkos::parallel_for("COPY_Kokkos Kokkos_Lambda", 
Kokkos::RangePolicy(ibegin,iend), KOKKOS_LAMBDA(Index_type i) { - // COPY BODY DEFINITION IN HEADER: - // c[i] = a[i] ; + // DEFINITION IN HEADER: + // c[i] = a[i] ; c_view[i] = a_view[i]; }); @@ -116,7 +61,6 @@ void COPY::runSeqVariant(VariantID vid) } - default : { std::cout << "\n COPY : Unknown variant id = " << vid << std::endl; } diff --git a/src/stream-kokkos/DOT-Kokkos.cpp b/src/stream-kokkos/DOT-Kokkos.cpp index 503fda649..19a1c00eb 100644 --- a/src/stream-kokkos/DOT-Kokkos.cpp +++ b/src/stream-kokkos/DOT-Kokkos.cpp @@ -24,97 +24,22 @@ void DOT::runKokkosVariant(VariantID vid) { const Index_type ibegin = 0; const Index_type iend = getActualProblemSize(); - - DOT_DATA_SETUP; - // Instantiation of pointer - wrapped views: + // Instantiation of pointer - wrapped Kokkos views: auto a_view = getViewFromPointer(a, iend); auto b_view = getViewFromPointer(b, iend); - - - // Pre-processor directive #if defined(RUN_KOKKOS) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Real_type dot = m_dot_init; - - for (Index_type i = ibegin; i < iend; ++i ) { - DOT_BODY; - } - - m_dot += dot; - - } - stopTimer(); - - break; - } - - case Lambda_Seq : { - - auto dot_base_lam = [=](Index_type i) -> Real_type { - return a[i] * b[i]; - }; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Real_type dot = m_dot_init; - - for (Index_type i = ibegin; i < iend; ++i ) { - dot += dot_base_lam(i); - } - - m_dot += dot; - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum dot(m_dot_init); - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { - DOT_BODY; - }); - - m_dot += static_cast(dot.get()); - - } - stopTimer(); - - break; - } - */ - case Kokkos_Lambda : { - - // open Kokkosfence Kokkos::fence(); startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - // 
Declare and initialize dot - // dot will contain the reduction value, - // i.e., the dot product - // - // Reductions combine contributions from - // loop iterations + Real_type dot = m_dot_init; parallel_reduce("DOT-Kokkos Kokkos_Lambda", @@ -123,11 +48,7 @@ void DOT::runKokkosVariant(VariantID vid) { // DOT BODY definition from header: // dot += a[i] * b[i] ; - //dot_res += a_view[i]*b_view[i]; - /////////////////////////////// - //Int_type vec_i = vec_view[i]; dot_res += a_view[i]*b_view[i]; - //dot_res = vec_i; }, dot); m_dot += static_cast(dot); } diff --git a/src/stream-kokkos/MUL-Kokkos.cpp b/src/stream-kokkos/MUL-Kokkos.cpp index 10809b74d..b68ba6291 100644 --- a/src/stream-kokkos/MUL-Kokkos.cpp +++ b/src/stream-kokkos/MUL-Kokkos.cpp @@ -17,14 +17,6 @@ namespace rajaperf namespace stream { -/* -void MUL::runSeqVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); -*/ - void MUL::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -44,10 +36,6 @@ void MUL::runSeqVariant(VariantID vid) auto b_view = getViewFromPointer(b, iend); auto c_view = getViewFromPointer(c, iend); - // Is this needed here? 
- // The declaration and initialization is from stream/MUL.hpp - //Real_type alpha = m_alpha; - auto mul_lam = [=](Index_type i) { MUL_BODY; @@ -58,55 +46,6 @@ void MUL::runSeqVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - MUL_BODY; - } - - } - stopTimer(); - - break; - } - -// #if defined(RUN_RAJA_SEQ) - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - mul_lam(i); - } - - } - stopTimer(); - - break; - } - -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), mul_lam); - - } - stopTimer(); - - break; - } - */ - -//#endif // RUN_RAJA_SEQ - case Kokkos_Lambda : { Kokkos::fence(); @@ -130,10 +69,6 @@ void MUL::runSeqVariant(VariantID vid) } - - //} - - default : { std::cout << "\n MUL : Unknown variant id = " << vid << std::endl; } @@ -142,8 +77,6 @@ void MUL::runSeqVariant(VariantID vid) #endif // RUN_KOKKOS - // move data to host from view - moveDataToHostFromKokkosView(b, b_view, iend); moveDataToHostFromKokkosView(c, c_view, iend); diff --git a/src/stream-kokkos/TRIAD-Kokkos.cpp b/src/stream-kokkos/TRIAD-Kokkos.cpp index 9fbef444f..b7b491181 100644 --- a/src/stream-kokkos/TRIAD-Kokkos.cpp +++ b/src/stream-kokkos/TRIAD-Kokkos.cpp @@ -17,14 +17,6 @@ namespace rajaperf namespace stream { -/* -void TRIAD::runSeqVariant(VariantID vid) -{ - const Index_type run_reps = getRunReps(); - const Index_type ibegin = 0; - const Index_type iend = getActualProblemSize(); -*/ - void TRIAD::runKokkosVariant(VariantID vid) { const Index_type run_reps = getRunReps(); @@ -54,75 +46,27 @@ void TRIAD::runKokkosVariant(VariantID vid) switch ( vid ) { - case Base_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for 
(Index_type i = ibegin; i < iend; ++i ) { - TRIAD_BODY; - } - - } - stopTimer(); - - break; - } - -// #if defined(RUN_RAJA_SEQ) - case Lambda_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - for (Index_type i = ibegin; i < iend; ++i ) { - triad_lam(i); - } - - } - stopTimer(); - - break; - } -/* - case RAJA_Seq : { - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall( - RAJA::RangeSegment(ibegin, iend), triad_lam); - - } - stopTimer(); - - break; - } - */ - case Kokkos_Lambda : { - Kokkos::fence(); - startTimer(); + Kokkos::fence(); + startTimer(); - for (RepIndex_type irep =0; irep < run_reps; ++irep) { - + for (RepIndex_type irep =0; irep < run_reps; ++irep) { Kokkos::parallel_for("TRIAD_Kokkos, Kokkos_Lambda", - Kokkos::RangePolicy(ibegin, iend), - KOKKOS_LAMBDA(Index_type i) { - // TRIAD_BODY definition in TRIAD.hpp - // a[i] = b[i] + alpha * c[i] ; - a_view[i] = b_view[i] + alpha * c_view[i]; - }); + Kokkos::RangePolicy(ibegin, iend), + KOKKOS_LAMBDA(Index_type i) { + // TRIAD_BODY definition in TRIAD.hpp + // a[i] = b[i] + alpha * c[i] ; + a_view[i] = b_view[i] + alpha * c_view[i]; + }); } - Kokkos::fence(); - stopTimer(); + Kokkos::fence(); + stopTimer(); break; } -//#endif // RUN_RAJA_SEQ default : { std::cout << "\n TRIAD : Unknown variant id = " << vid << std::endl; From 18cca87d047c385c21fd433e4c9a28bc9d37fb51 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Thu, 2 Dec 2021 07:55:04 -0700 Subject: [PATCH 123/124] First pass clean up of Executor files --- src/common/Executor.cpp | 255 +++++++++++++++------------------------- src/common/Executor.hpp | 39 +++--- 2 files changed, 113 insertions(+), 181 deletions(-) diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index b7b96ab30..d18bd4c10 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -12,7 +12,9 @@ #include "common/KernelBase.hpp" #include "common/OutputUtils.hpp" -// Warmup kernels to run 
first to help reduce startup overheads in timings +// Warmup kernels will be run if not in a RAJAPerf Suite infrastructure build +// Warm up runs reduce startup overheads +// This overhead should not be reflected in perf testing timing #ifndef RAJAPERF_INFRASTRUCTURE_ONLY #include "basic/DAXPY.hpp" #include "basic/REDUCE3_INT.hpp" @@ -29,7 +31,7 @@ #include #include #include - +// Defines miscellaneous symbolic constants and types, and declares miscellaneous functions. #include @@ -37,13 +39,15 @@ namespace rajaperf { using namespace std; +// Kokkos Design: +// Executor constructor Executor::Executor(int argc, char** argv) : run_params(argc, argv), reference_vid(NumVariants) { } - +// Executor destructor Executor::~Executor() { for (size_t ik = 0; ik < kernels.size(); ++ik) { @@ -61,15 +65,18 @@ void Executor::setupSuite() cout << "\nSetting up suite based on input..." << endl; + // + // Kokkoks Design: + // using Slist = list; using Svector = vector; using KIDset = set; using VIDset = set; - // - // Determine which kernels to exclude from input. - // exclude_kern will be non-duplicated ordered set of IDs of kernel to exclude. - // + + // Determine which kernels and features to exclude from input + // Store excluded inputs vector of strings. + const Svector& exclude_kernel_input = run_params.getExcludeKernelInput(); const Svector& exclude_feature_input = run_params.getExcludeFeatureInput(); @@ -77,15 +84,14 @@ void Executor::setupSuite() if ( !exclude_kernel_input.empty() ) { - // Make list copy of exclude kernel name input to manipulate for - // processing potential group names and/or kernel names, next + // Create list of excluded kernel names. + // In subsequent steps, this list will be used to form the group and/or kernel names to be run Slist exclude_kern_names(exclude_kernel_input.begin(), exclude_kernel_input.end()); - // - // Search exclude_kern_names for matching group names. - // groups2exclude will contain names of groups to exclude. 
- // + + // groups2exclude, a vector of strings, will contain names of groups to exclude. Svector groups2exclude; + // Search exclude_kern_names list for matching group names. for (Slist::iterator it = exclude_kern_names.begin(); it != exclude_kern_names.end(); ++it) { for (size_t ig = 0; ig < NumGroups; ++ig) { @@ -96,10 +102,10 @@ void Executor::setupSuite() } } - // - // If group name(s) found in exclude_kern_names, assemble kernels in group(s) - // to run and remove those group name(s) from exclude_kern_names list. - // + + // If group name(s) found in the list of exclude_kern_names, assemble kernels in group(s) + // to run and remove the identified group name(s) from exclude_kern_names list. + for (size_t ig = 0; ig < groups2exclude.size(); ++ig) { const string& gname(groups2exclude[ig]); @@ -109,17 +115,16 @@ void Executor::setupSuite() exclude_kern.insert(kid); } } - + // List of kernel names to be excluded; + // Here, removing errant / erroneous group names from this list exclude_kern_names.remove(gname); } // - // Look for matching names of individual kernels in remaining exclude_kern_names. - // - // Assemble invalid input for warning message. - // + // Vector of strings containing invalid input + // A warning message is associated with invalid input Svector invalid; - + // Search for matching names of kernels in remaining exclude_kern_names for (Slist::iterator it = exclude_kern_names.begin(); it != exclude_kern_names.end(); ++it) { bool found_it = false; @@ -131,7 +136,7 @@ void Executor::setupSuite() found_it = true; } } - + // If kernel not found, add to the vector "invalid" if ( !found_it ) invalid.push_back(*it); } @@ -141,7 +146,7 @@ void Executor::setupSuite() if ( !exclude_feature_input.empty() ) { - // First, check for invalid exclude_feature input. + // Check for invalid exclude_feature input. // Assemble invalid input for warning message. 
// Svector invalid; @@ -161,14 +166,16 @@ void Executor::setupSuite() run_params.setInvalidExcludeFeatureInput(invalid); // - // If feature input is valid, determine which kernels use - // input-specified features and add to set of kernels to run. + // If feature input is valid, determine which kernels to use + // Input-specified features and add to set of kernels to run. // if ( run_params.getInvalidExcludeFeatureInput().empty() ) { for (size_t i = 0; i < exclude_feature_input.size(); ++i) { const string& feature = exclude_feature_input[i]; + +// COMMENTED OUT BY KOKKOS; FEATURES DO NOT YET WORK IN OUR DESIGN /* bool found_it = false; for (size_t fid = 0; fid < NumFeatures && !found_it; ++fid) { @@ -194,12 +201,12 @@ void Executor::setupSuite() } // - // Determine which kernels to execute from input. - // run_kern will be non-duplicated ordered set of IDs of kernel to run. + // Determine which kernels to execute from input + // run_kern is an ordered set of KernelID to run // const Svector& kernel_input = run_params.getKernelInput(); const Svector& feature_input = run_params.getFeatureInput(); - + // Set of KernelID objects KIDset run_kern; if ( kernel_input.empty() && feature_input.empty() ) { @@ -211,19 +218,15 @@ void Executor::setupSuite() } else { - // - // Need to parse input to determine which kernels to run - // - - // // Look for kernels using features if such input provided - // if ( !feature_input.empty() ) { - // First, check for invalid feature input. - // Assemble invalid input for warning message. - // -/** TODO: reimplement + +// Kokkos Design: +// AJP left some of the extensive commented code, because RAJA & Kokkos developers may +// want to use / fix these blocs for integrated use with Kokkos +// FEATURE DOES NOT YET WORK WITH KOKKOS +/** TODO: Kokkos, reimplement! 
Svector invalid; for (size_t i = 0; i < feature_input.size(); ++i) { @@ -339,11 +342,11 @@ void Executor::setupSuite() run_params.setInvalidKernelInput(invalid); */ + Svector invalid; for (auto kernelName: kernel_input) { std::vector matchingKernelsVec = lookUpKernelByName(kernelName); - // if everything that matched is in the vector, and nothing matched, i.e., an empty vector, - // i.e., the kernel name was invalid + // Check -- everything that matched is added kernels vector if (matchingKernelsVec.empty()) { invalid.push_back(kernelName); @@ -359,10 +362,8 @@ void Executor::setupSuite() } - // + // Assemble set of available variants to run - // (based on compile-time configuration). - // VIDset available_var; for (size_t iv = 0; iv < NumVariants; ++iv) { VariantID vid = static_cast(iv); @@ -372,22 +373,16 @@ void Executor::setupSuite() } - // - // Determine variants to execute from input. - // run_var will be non-duplicated ordered set of IDs of variants to run. - // + // Declare and set exclude_variant_names (from run parameter inputs), a + // vector of strings const Svector& exclude_variant_names = run_params.getExcludeVariantInput(); VIDset exclude_var; if ( !exclude_variant_names.empty() ) { - // - // Parse input to determine which variants to exclude. - // + // Assemble invalid input for warning message. - // - Svector invalid; for (size_t it = 0; it < exclude_variant_names.size(); ++it) { @@ -410,8 +405,8 @@ void Executor::setupSuite() } // - // Determine variants to execute from input. - // run_var will be non-duplicated ordered set of IDs of variants to run. + // Determine variants to run based on user input (stored in run_params). + // run_var will be an ordered set of unique variant IDs to run. // const Svector& variant_names = run_params.getVariantInput(); @@ -420,7 +415,7 @@ void Executor::setupSuite() if ( variant_names.empty() ) { // - // No variants specified in input options, run all available. 
+ // If no variants specified in input options, run all available. // Also, set reference variant if specified. // for (VIDset::iterator vid_it = available_var.begin(); @@ -443,15 +438,6 @@ void Executor::setupSuite() } else { - // - // Parse input to determine which variants to run: - // - variants to run will be the intersection of available variants - // and those specified in input - // - reference variant will be set to specified input if available - // and variant will be run; else first variant that will be run. - // - // Assemble invalid input for warning message. - // Svector invalid; @@ -486,12 +472,8 @@ void Executor::setupSuite() } - // - // Create kernel objects and variants to execute. If invalid input is not - // empty for either case, then there were unmatched input items. - // - // A message will be emitted later so user can sort it out... - // + // Kokkos Design: + // Create kernel objects and variants to execute. if ( !(run_params.getInvalidKernelInput().empty()) || !(run_params.getInvalidExcludeKernelInput().empty()) ) { @@ -526,8 +508,7 @@ void Executor::setupSuite() variant_ids.push_back( *vid ); } - // - // If we've gotten to this point, we have good input to run. + // If the bloc of code below executes, we have good input to run. 
// if ( run_params.getInputState() != RunParams::DryRun && run_params.getInputState() != RunParams::CheckRun ) { @@ -720,13 +701,15 @@ void Executor::runSuite() in_state != RunParams::CheckRun ) { return; } - +// Kokkos Design: #ifndef RAJAPERF_INFRASTRUCTURE_ONLY cout << "\n\nRun warmup kernels...\n"; vector warmup_kernels; + // The warm-up kernels that will be run if RAJAPERF_INFRASTUCTURE_ONLY NOT + // enabled warmup_kernels.push_back(new basic::DAXPY(run_params)); warmup_kernels.push_back(new basic::REDUCE3_INT(run_params)); warmup_kernels.push_back(new algorithm::SORT(run_params)); @@ -1346,47 +1329,14 @@ void Executor::getFOMGroups(vector& fom_groups) #endif } - -// New functions for Kokkos to register new group and kernel IDs -// The return type is Executor::groupID - + // Kokkos Desgin: + // Function to register new Kokkos and /or RAJA group and kernel ID + // The return type is Executor::groupID Executor::groupID Executor::registerGroup(std::string groupName) { - // find() method searches the string for the first occurrence of the sequence specified by its arguments. - // Recall, "kernelsPerGroup" is a mapping of kernel groups (e.g., basic) and their constituent kernels (e.g., DAXPY) auto checkIfGroupExists = kernelsPerGroup.find(groupName); - /* Recall, these items are defined in Executor.hpp: - using groupID = int; - using kernelID = int; - using kernelSet = std::set; // data type: set of KernelBase* instances - using kernelMap = std::map; // data type: map of string kernel names to instances of KernelBase* - using groupMap = std::map; // data type: map of groupNames to sets of kernels - ... - // "allKernels" is an instance of kernelMap, which is a "map" of all kernels and their ID's - kernelMap allKernels; - - // "kernelsPerGroup" is an instance of "groupMap;" "kernelsPerGroup" maps kernels to their categories (e.g., basic, polybench, etc.) 
- groupMap kernelsPerGroup; - - */ - - /* end() - * Return iterator to end - * Returns an iterator referring to the past-the-end element in the vector container. - * The past-the-end element is the theoretical element that would follow the last element in the vector. - * It does not point to any element, and thus shall not be de-referenced. - * Because the ranges used by functions of the standard library do not include - * the element pointed by their closing iterator, - * this function is often used in combination with vector::begin to specify a range including all the elements in the container. - * If the container is empty, this function returns the same as vector::begin. - * - */ - - - // HERE, WE ARE CHECKING THE CASE THAT THE groupNAME **IS NOT** IN THE MAP OBJECT - // Using the .end() idiom to check if I've fallen off the edge of the container without finding a match if (checkIfGroupExists == kernelsPerGroup.end()) { // If groupName not found, set that groupName in kernelsPerGroup to an empty kernelSet obj kernelsPerGroup[groupName] = kernelSet(); @@ -1396,7 +1346,7 @@ void Executor::getFOMGroups(vector& fom_groups) std::cout << "The Group Name " << groupName << " already exists. Program is exiting." 
<< std::endl; - // In kernelsPerGroup, the Group Name is the first position / key value, and the second position / value type in the set + // In kernelsPerGroup, groupName is the second position / value in the set auto fullKernelSet = checkIfGroupExists->second; // fullKernelSet is of type std::set @@ -1416,38 +1366,40 @@ void Executor::getFOMGroups(vector& fom_groups) } -// New function with return type Executor::kernelID, returning getNewKernelID(); registerKernel is a new function in the Executor class -// + // Kokkos Design: + // Function to register new kernels + // The return type -- Executor::kernelID, returning getNewKernelID() Executor::kernelID Executor::registerKernel(std::string groupName, KernelBase *kernel) { - // declaring and setting kernelName to de-referenced kernel pointer obj, an instance of KernelBase* + // Declaring and setting kernelName to de-referenced kernel pointer obj (passed in as as argument), an instance of KernelBase* auto kernelName = kernel->getName(); - // Recall, "allKernels" maps named kernels to their IDs + // Check if kernel exists; "allKernels" maps named kernels to their IDs; auto checkIfKernelExists = allKernels.find(kernelName); // Check if checkKernelExists value IS NOT in the map of all kernels + // to determine if a new kernel should be created if (checkIfKernelExists == allKernels.end()) { - // if the kernel name IS NOT in the allKernels map, set kernelName to kernel, the KernelBase* instance + // If the kernel name IS NOT in the allKernels map, set kernelName to kernel, a KernelBase* instance allKernels[kernelName] = kernel; } else { // ERROR CONDITION: if the kernel is found / exists, make the program exit + // kernelName is the key, or first element of allKernels std::cout << "Kernel " << checkIfKernelExists->first << " already exists. Program is exiting." 
<< std::endl; exit(1); } - ////////////////////////////////////////////////////////////////////////////// - // This error condition : adding a groupName before checking if the group associated with the kernel exists - // Declare and set checkIfGroupExists to the value of the string-type groupName in the kernelsPerGroup map + // Kokkos Desgin: + // ERROR CONDITION : adding a groupName (to kernelsPerGroup) before checking if the (kernel) group exists. + // auto checkIfGroupExists = kernelsPerGroup.find(groupName); - // LOGIC: Check if checkIfGroupExists value is the same as the past-the-end element in the vector container, which - // does not have a value - // i.e., check for the case that the groupName DOES NOT exist with the ".end()" idiom; + + if (checkIfGroupExists == kernelsPerGroup.end()) { } else { - // If the groupName DOES EXIST, then insert the kernel (instance of KernelBase*) at the second position of the - // allKernels map to associate the kernel and its groupNAme + // If the groupName DOES EXIST in kernelsPerGroup, then insert the associated kernel (instance of KernelBase*) + // at the second (value) position of the allKernels map to associate correctly the kernel and its groupName checkIfGroupExists->second.insert(kernel); @@ -1456,81 +1408,60 @@ void Executor::getFOMGroups(vector& fom_groups) // getNewKernelID is an obj of type Executor::kernelID return getNewKernelID(); } -// AJP & DZP new function -// AJP GOAL: return a vector of all kernelBase* objects to be run by + + // Kokkos Design: + // Function of the Executor class that returns a vector of all kernelBase* objects. 
std::vector Executor::lookUpKernelByName(std::string kernelOrGroupName) { - // The vector / list return type, std::vector will contain - // either all of the kernels with a given kernel name or group name - // We have two maps (defined in Executor.hpp): kernelMap allKernels, groupMap kernelsPerGroup, - // STEPS: - // 1) declare new vector that will contain the string data: - // 2) LOGIC: - // i) check to see if the kernel / group requested on the - // "./rajaperf.exe -k" line (you can pass either a specific kernel or a - // kernel groupName, e.g., "Basic" - - // Declaring the vector kernelsByNameVect of type std::vector; // This variable will contain the set of kernels to run std::vector kernelsByNameVect; - - // CONDITIONS TO INCLUDE: - // 1) If kernelName is groupName , then add that set of kernels in the - // group to the vector - - // 2) else if kernelName is kernel, then add the kernel to the vector - // 3) else if kernelName is horse stuff, then say so - - // HINT: Declare iterator against which you can test equivalence - + // kernelsPerGroup: first (key) is kernel, second (value), is group auto checkLookUpGroupNameIterator = kernelsPerGroup.find(kernelOrGroupName); auto checkLookUpKernelNameIterator = allKernels.find(kernelOrGroupName); - // Check to see if groupName NOT in kernelsPerGroup; - // end() iterates to the end if (checkLookUpGroupNameIterator != kernelsPerGroup.end()) { - //cout << " STEP 1" << endl; - - // when using the arrow, you get a key, value pair. 
- // You can access either member by "first" or "second" - // - - // we have std::set of KernelBase* + // Gather the kernel groups that will be perf tested auto groupSetForTests = checkLookUpGroupNameIterator->second; - + // Capture the group name, and store in kernelsByNameVect for (auto item: groupSetForTests) { kernelsByNameVect.push_back(item); } + // Check -- if kernel name not an empty element, i.e., it exists, + // capture the name of the kernel, and store in kernelsByNameVect } else if (checkLookUpKernelNameIterator != allKernels.end()) { auto kernel = checkLookUpKernelNameIterator->second; kernelsByNameVect.push_back(kernel); - } - // kernelsByNameVect is an object of type std::vector that will be used by return kernelsByNameVect; - } + // Kokkos Desgin: + // Take user-entered run parameters in by reference, and return const RunParams &Executor::getRunParams() { return run_params; } + // Function to register a new kernel group for an instance of an Executor + // object void free_register_group(Executor *exec, std::string groupName) { exec->registerGroup(groupName); } - + // Function to register a new kernel for an instance of an Executor + // object void free_register_kernel(Executor *exec, std::string groupName, KernelBase *kernel) { exec->registerKernel(groupName, kernel); } + // Function to populate an instance of an Executor object with run parameters + const RunParams& getRunParams(Executor* exec){ return exec->getRunParams(); diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index b6fa1629a..e3c88a22b 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -17,6 +17,12 @@ #include #include + /////////////////////////////////////////////////// + // Logic: + // Need the full set of kernels + // Associate group names (e.g., lcals, basic) with kernel sets + // Interface to add new kernels (e.g., DAXPY) and groups (basic) + // for Kokkos Performance Testing namespace rajaperf { class KernelBase; @@ -45,20 +51,12 @@ class 
Executor void outputRunData(); // Interface for adding new Kokkos groups and kernels - using groupID = int; using kernelSet = std::set; using kernelMap = std::map; using groupMap = std::map; using kernelID = int; - /////////////////////////////////////////////////// - // - // Logic: - // Need the full set of kernels - // Associate group names (e.g., lcals, basic) with kernel sets - // Interface to add new kernels (e.g., DAXPY) and groups (basic) - // for Kokkos Performance Testing groupID registerGroup(std::string groupName); @@ -99,12 +97,12 @@ class Executor void writeFOMReport(const std::string& filename); void getFOMGroups(std::vector& fom_groups); + // Kokkos Design: // Kokkos add group and kernel ID inline functions - // Provisional Design for Kokkos + // The newGroupID and newKerneID, both type int, will be shared amongst invocations of these inline functions. inline groupID getNewGroupID() { - // The newGroupID will be shared amongst invocations of this - // function. + static groupID newGroupID; return newGroupID++; @@ -118,9 +116,9 @@ class Executor } - - - // Data members + // Required data members: + // running parameters, specific kernels (e.g., DAXPY), variants (e.g., + // Kokkos, CUDA, Sequential, etc.) RunParams run_params; std::vector kernels; @@ -128,18 +126,21 @@ class Executor VariantID reference_vid; - // "allKernels" is an instance of kernelMap, which is a "map" of all kernels (as strings, e.g., DAXPY, to their - // kernelBase* instances; the string name will be the key (first), and the kernelBase* instance will be the value (second) + // "allKernels" is an instance of kernelMap, a std::map that takes a std::string name (key) and pointer to the associated KernelBase object (value). kernelMap allKernels; - // "kernelsPerGroup" is an instance of "groupMap;" "kernelsPerGroup" maps kernels to their - // categories / parent class (e.g., basic, polybench, etc.) 
+ // "kernelsPerGroup" is an instance of the groupMap type, a std::map that takes a std::string name (key) and a kernelSet object, + // containing the set of unique kernels (in a kernel group, such as basic, + // lcals, etc.) to be run. groupMap kernelsPerGroup; }; - +// Kokkos design: +// Register a new kernel group (see: PerfsuiteKernelDefinitions.*): void free_register_group(Executor*, std::string); +// Register a new kernel (that belongs to a particular kernel group): void free_register_kernel(Executor*, std::string, KernelBase*); +// Take in run parameters by reference const RunParams& getRunParams(Executor* exec); } // closing brace for rajaperf namespace From 487bce9c63801431e85efdab0ddbefb33cf8bf55 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 5 Jan 2022 10:36:36 -0700 Subject: [PATCH 124/124] Addressing PR comments from David Beckingsale & polybench-kokkos stubs --- CMakeLists.txt | 11 +- src/CMakeLists.txt | 248 +++++++------ src/RAJAPerfSuiteDriver.cpp | 34 +- src/apps/CMakeLists.txt | 2 +- src/apps/WIP-COUPLE.cpp | 335 +++++++++--------- src/common/Executor.cpp | 18 +- src/common/PerfsuiteKernelDefinitions.cpp | 12 +- src/common/RAJAPerfSuite.cpp | 30 +- src/common/RAJAPerfSuite.hpp | 61 +--- src/common/RPTypes.hpp | 1 - src/polybench-kokkos/CMakeLists.txt | 27 ++ src/polybench-kokkos/POLYBENCH_2MM-Kokkos.cpp | 196 ++++++++++ src/polybench-kokkos/POLYBENCH_3MM-Kokkos.cpp | 249 +++++++++++++ src/polybench-kokkos/POLYBENCH_ADI-Kokkos.cpp | 222 ++++++++++++ .../POLYBENCH_ATAX-Kokkos.cpp | 193 ++++++++++ .../POLYBENCH_FDTD_2D-Kokkos.cpp | 199 +++++++++++ .../POLYBENCH_FLOYD_WARSHALL-Kokkos.cpp | 123 +++++++ .../POLYBENCH_GEMM-Kokkos.cpp | 157 ++++++++ .../POLYBENCH_GEMVER-Kokkos.cpp | 232 ++++++++++++ .../POLYBENCH_GESUMMV-Kokkos.cpp | 138 ++++++++ .../POLYBENCH_HEAT_3D-Kokkos.cpp | 170 +++++++++ .../POLYBENCH_JACOBI_1D-Kokkos.cpp | 126 +++++++ .../POLYBENCH_JACOBI_2D-Kokkos.cpp | 157 ++++++++ src/polybench-kokkos/POLYBENCH_MVT-Kokkos.cpp | 186 
++++++++++ src/polybench/POLYBENCH_2MM.cpp | 3 + src/polybench/POLYBENCH_2MM.hpp | 4 + src/polybench/POLYBENCH_3MM.cpp | 3 + src/polybench/POLYBENCH_3MM.hpp | 2 + src/polybench/POLYBENCH_ADI.cpp | 3 + src/polybench/POLYBENCH_ADI.hpp | 1 + src/polybench/POLYBENCH_ATAX.cpp | 3 + src/polybench/POLYBENCH_ATAX.hpp | 2 + src/polybench/POLYBENCH_FDTD_2D.cpp | 3 + src/polybench/POLYBENCH_FDTD_2D.hpp | 2 + src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 3 + src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp | 2 + src/polybench/POLYBENCH_GEMM.cpp | 3 + src/polybench/POLYBENCH_GEMM.hpp | 2 + src/polybench/POLYBENCH_GEMVER.cpp | 3 + src/polybench/POLYBENCH_GEMVER.hpp | 2 + src/polybench/POLYBENCH_GESUMMV.cpp | 3 + src/polybench/POLYBENCH_GESUMMV.hpp | 2 + src/polybench/POLYBENCH_HEAT_3D.cpp | 3 + src/polybench/POLYBENCH_HEAT_3D.hpp | 2 + src/polybench/POLYBENCH_JACOBI_1D.cpp | 3 + src/polybench/POLYBENCH_JACOBI_1D.hpp | 2 + src/polybench/POLYBENCH_JACOBI_2D.cpp | 3 + src/polybench/POLYBENCH_JACOBI_2D.hpp | 2 + src/polybench/POLYBENCH_MVT.cpp | 3 + src/polybench/POLYBENCH_MVT.hpp | 2 + 50 files changed, 2804 insertions(+), 389 deletions(-) create mode 100644 src/polybench-kokkos/CMakeLists.txt create mode 100644 src/polybench-kokkos/POLYBENCH_2MM-Kokkos.cpp create mode 100644 src/polybench-kokkos/POLYBENCH_3MM-Kokkos.cpp create mode 100644 src/polybench-kokkos/POLYBENCH_ADI-Kokkos.cpp create mode 100644 src/polybench-kokkos/POLYBENCH_ATAX-Kokkos.cpp create mode 100644 src/polybench-kokkos/POLYBENCH_FDTD_2D-Kokkos.cpp create mode 100644 src/polybench-kokkos/POLYBENCH_FLOYD_WARSHALL-Kokkos.cpp create mode 100644 src/polybench-kokkos/POLYBENCH_GEMM-Kokkos.cpp create mode 100644 src/polybench-kokkos/POLYBENCH_GEMVER-Kokkos.cpp create mode 100644 src/polybench-kokkos/POLYBENCH_GESUMMV-Kokkos.cpp create mode 100644 src/polybench-kokkos/POLYBENCH_HEAT_3D-Kokkos.cpp create mode 100644 src/polybench-kokkos/POLYBENCH_JACOBI_1D-Kokkos.cpp create mode 100644 
src/polybench-kokkos/POLYBENCH_JACOBI_2D-Kokkos.cpp create mode 100644 src/polybench-kokkos/POLYBENCH_MVT-Kokkos.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index deab22a9f..1cdab4474 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,7 +84,6 @@ endif() # Separate RAJAPerf Suite and Kokkos handling of HIP compilers if ((ENABLE_HIP) AND (NOT ENABLE_KOKKOS)) -#if (ENABLE_HIP) list(APPEND RAJA_PERFSUITE_DEPENDS hip) endif() @@ -113,7 +112,7 @@ endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/rajaperf_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/bin/rajaperf_config.hpp) -# Make sure RAJA flag propagate (we need to do some tidying to +# Make sure RAJA flags propagate (we need to do some tidying to # remove project-specific CMake variables that are no longer needed) set (CUDA_NVCC_FLAGS ${RAJA_NVCC_FLAGS}) @@ -123,14 +122,13 @@ if(ENABLE_KOKKOS) endif() -# ENABLE KOKKOS IS A RAJA PERFSUITE OPTION +# ENABLE_KOKKOS is A RAJAPerf Suite Option if(ENABLE_KOKKOS) add_definitions(-DRUN_KOKKOS) if(ENABLE_HIP) set(Kokkos_ENABLE_HIP ON CACHE BOOL "Kokkos builds for AMD HIP set the Kokkos_ENABLE_HIP variable to ON") #set(Kokkos_ARCH_VEGA900 ON CACHE BOOL "Docstring") #TODO: better - #set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE /ascldap/users/ajpowel/RAJAPerf/amd_build/compiler_unscrewer) endif() if(ENABLE_TARGET_OPENMP) set(Kokkos_ENABLE_OPENMPTARGET ON CACHE BOOL "Docstring") @@ -155,19 +153,14 @@ Kokkos_ENABLE_HIP variable to ON") enable_language(CUDA) endif() if(ENABLE_OPENMP) - #set(Kokkos_ENABLE_OPENMP CACHE BOOL ON) set(Kokkos_ENABLE_OPENMP ON CACHE BOOL "Docstring") endif() add_subdirectory(tpl/kokkos) - if(ENABLE_CUDA) - endif() get_property(KOKKOS_INCLUDE_DIRS DIRECTORY tpl/kokkos PROPERTY INCLUDE_DIRECTORIES) include_directories(${KOKKOS_INCLUDE_DIRS}) list(APPEND RAJA_PERFSUITE_DEPENDS kokkos) endif() -# Each directory in the perf suite has its own CMakeLists.txt file. 
-# DZP, AJP, DB, DA fixes add_subdirectory(src) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4aee5ffa7..b648d8c18 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,42 +8,53 @@ include_directories(.) -# Added as part of the merge with build only infrastructure: +# The INFRASTRUCTURE_ONLY option is for the scenario where +# ONLY the RAJAPerf Suite infrastructure is used as the driver for +# Kokkos and Kokkos Kernels performance tests add_subdirectory(common) if(NOT INFRASTRUCTURE_ONLY) -add_subdirectory(apps) -add_subdirectory(apps-kokkos) -add_subdirectory(basic) -add_subdirectory(basic-kokkos) -#add_subdirectory(kokkos-mechanics) -add_subdirectory(lcals) -add_subdirectory(lcals-kokkos) -#add_subdirectory(polybench) -add_subdirectory(stream) -add_subdirectory(stream-kokkos) -add_subdirectory(algorithm) -add_subdirectory(algorithm-kokkos) + add_subdirectory(algorithm) + add_subdirectory(apps) + add_subdirectory(basic) + add_subdirectory(lcals) + add_subdirectory(stream) + add_subdirectory(polybench) + if(ENABLE_KOKKOS) + # Kokkos translations + add_subdirectory(algorithm-kokkos) + add_subdirectory(apps-kokkos) + add_subdirectory(basic-kokkos) + add_subdirectory(lcals-kokkos) + add_subdirectory(stream-kokkos) + # Stub Kokkos implementations for polybench + add_subdirectory(polybench-kokkos) + endif() + endif() -# Ask David about necessary changes here (wrt to file in Kokkos Kernels) + set(RAJA_PERFSUITE_EXECUTABLE_DEPENDS common) if(NOT INFRASTRUCTURE_ONLY) -list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS + list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS basic - basic-kokkos apps - apps-kokkos - #kokkos-mechanics lcals - lcals-kokkos - #polybench stream - stream-kokkos algorithm + polybench) +endif() +if(ENABLE_KOKKOS) + list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS + basic-kokkos + apps-kokkos + lcals-kokkos + stream-kokkos algorithm-kokkos + # Stub implementation of polybench for Kokkos + polybench-kokkos ) endif() - +#endif() # This line must 
be kept list(APPEND RAJA_PERFSUITE_EXECUTABLE_DEPENDS ${RAJA_PERFSUITE_DEPENDS}) @@ -55,9 +66,9 @@ include_directories(apps) include_directories(algorithm) include_directories(stream) include_directories(polybench) -blt_add_executable( - NAME raja-perf-omptarget.exe - SOURCES RAJAPerfSuiteDriver.cpp +list(APPEND RAJA_PERF_OMP_SOURCES + +RAJAPerfSuiteDriver.cpp apps/AppsData.cpp apps/DEL_DOT_VEC_2D.cpp apps/DEL_DOT_VEC_2D-Seq.cpp @@ -93,19 +104,9 @@ blt_add_executable( apps/VOL3D-Seq.cpp apps/VOL3D-OMPTarget.cpp #apps/WIP-COUPLE.cpp - #Kokkos bloc - apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp - apps-kokkos/ENERGY-Kokkos.cpp - apps-kokkos/FIR-Kokkos.cpp - apps-kokkos/HALOEXCHANGE-Kokkos.cpp - apps-kokkos/PRESSURE-Kokkos.cpp - apps-kokkos/LTIMES-Kokkos.cpp - apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp - apps-kokkos/VOL3D-Kokkos.cpp - #apps-kokkos/WIP-COUPLE.cpp - #basic/ATOMIC_PI.cpp - #basic/ATOMIC_PI-Seq.cpp - #basic/ATOMIC_PI-OMPTarget.cpp + basic/ATOMIC_PI.cpp + basic/ATOMIC_PI-Seq.cpp + basic/ATOMIC_PI-OMPTarget.cpp basic/PI_ATOMIC.cpp basic/PI_ATOMIC-Seq.cpp basic/PI_ATOMIC-OMPTarget.cpp @@ -145,16 +146,6 @@ blt_add_executable( basic/TRAP_INT.cpp basic/TRAP_INT-Seq.cpp basic/TRAP_INT-OMPTarget.cpp - basic-kokkos/PI_ATOMIC-Kokkos.cpp - basic-kokkos/DAXPY-Kokkos.cpp - basic-kokkos/IF_QUAD-Kokkos.cpp - basic-kokkos/INIT3-Kokkos.cpp - basic-kokkos/INIT_VIEW1D-Kokkos.cpp - basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp - basic-kokkos/MULADDSUB-Kokkos.cpp - basic-kokkos/NESTED_INIT-Kokkos.cpp - basic-kokkos/REDUCE3_INT-Kokkos.cpp - basic-kokkos/TRAP_INT-Kokkos.cpp lcals/DIFF_PREDICT.cpp lcals/DIFF_PREDICT-Seq.cpp lcals/DIFF_PREDICT-OMPTarget.cpp @@ -188,57 +179,45 @@ blt_add_executable( lcals/TRIDIAG_ELIM.cpp lcals/TRIDIAG_ELIM-Seq.cpp lcals/TRIDIAG_ELIM-OMPTarget.cpp - #Kokkos Bloc - lcals-kokkos/DIFF_PREDICT-Kokkos.cpp - lcals-kokkos/EOS-Kokkos.cpp - lcals-kokkos/FIRST_DIFF-Kokkos.cpp - lcals-kokkos/FIRST_MIN-Kokkos.cpp - lcals-kokkos/FIRST_SUM-Kokkos.cpp - 
lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp - lcals-kokkos/HYDRO_1D-Kokkos.cpp - lcals-kokkos/HYDRO_2D-Kokkos.cpp - lcals-kokkos/INT_PREDICT-Kokkos.cpp - lcals-kokkos/PLANCKIAN-Kokkos.cpp - lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp - #polybench/POLYBENCH_2MM.cpp - #polybench/POLYBENCH_2MM-Seq.cpp - #polybench/POLYBENCH_2MM-OMPTarget.cpp - #polybench/POLYBENCH_3MM.cpp - #polybench/POLYBENCH_3MM-Seq.cpp - #polybench/POLYBENCH_3MM-OMPTarget.cpp - #polybench/POLYBENCH_ADI.cpp - #polybench/POLYBENCH_ADI-Seq.cpp - #polybench/POLYBENCH_ADI-OMPTarget.cpp - #polybench/POLYBENCH_ATAX.cpp - #polybench/POLYBENCH_ATAX-Seq.cpp - #polybench/POLYBENCH_ATAX-OMPTarget.cpp - #polybench/POLYBENCH_FDTD_2D.cpp - #polybench/POLYBENCH_FDTD_2D-Seq.cpp - #polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp - #polybench/POLYBENCH_FLOYD_WARSHALL.cpp - #polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp - #polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp - #polybench/POLYBENCH_GEMM.cpp - #polybench/POLYBENCH_GEMM-Seq.cpp - #polybench/POLYBENCH_GEMM-OMPTarget.cpp - #polybench/POLYBENCH_GEMVER.cpp - #polybench/POLYBENCH_GEMVER-Seq.cpp - #polybench/POLYBENCH_GEMVER-OMPTarget.cpp - #polybench/POLYBENCH_GESUMMV.cpp - #polybench/POLYBENCH_GESUMMV-Seq.cpp - #polybench/POLYBENCH_GESUMMV-OMPTarget.cpp - #polybench/POLYBENCH_HEAT_3D.cpp - #polybench/POLYBENCH_HEAT_3D-Seq.cpp - #polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp - #polybench/POLYBENCH_JACOBI_1D.cpp - #polybench/POLYBENCH_JACOBI_1D-Seq.cpp - #polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp - #polybench/POLYBENCH_JACOBI_2D.cpp - #polybench/POLYBENCH_JACOBI_2D-Seq.cpp - #polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp - #polybench/POLYBENCH_MVT.cpp - #polybench/POLYBENCH_MVT-Seq.cpp - #polybench/POLYBENCH_MVT-OMPTarget.cpp + polybench/POLYBENCH_2MM.cpp + polybench/POLYBENCH_2MM-Seq.cpp + polybench/POLYBENCH_2MM-OMPTarget.cpp + polybench/POLYBENCH_3MM.cpp + polybench/POLYBENCH_3MM-Seq.cpp + polybench/POLYBENCH_3MM-OMPTarget.cpp + polybench/POLYBENCH_ADI.cpp + 
polybench/POLYBENCH_ADI-Seq.cpp + polybench/POLYBENCH_ADI-OMPTarget.cpp + polybench/POLYBENCH_ATAX.cpp + polybench/POLYBENCH_ATAX-Seq.cpp + polybench/POLYBENCH_ATAX-OMPTarget.cpp + polybench/POLYBENCH_FDTD_2D.cpp + polybench/POLYBENCH_FDTD_2D-Seq.cpp + polybench/POLYBENCH_FDTD_2D-OMPTarget.cpp + polybench/POLYBENCH_FLOYD_WARSHALL.cpp + polybench/POLYBENCH_FLOYD_WARSHALL-Seq.cpp + polybench/POLYBENCH_FLOYD_WARSHALL-OMPTarget.cpp + polybench/POLYBENCH_GEMM.cpp + polybench/POLYBENCH_GEMM-Seq.cpp + polybench/POLYBENCH_GEMM-OMPTarget.cpp + polybench/POLYBENCH_GEMVER.cpp + polybench/POLYBENCH_GEMVER-Seq.cpp + polybench/POLYBENCH_GEMVER-OMPTarget.cpp + polybench/POLYBENCH_GESUMMV.cpp + polybench/POLYBENCH_GESUMMV-Seq.cpp + polybench/POLYBENCH_GESUMMV-OMPTarget.cpp + polybench/POLYBENCH_HEAT_3D.cpp + polybench/POLYBENCH_HEAT_3D-Seq.cpp + polybench/POLYBENCH_HEAT_3D-OMPTarget.cpp + polybench/POLYBENCH_JACOBI_1D.cpp + polybench/POLYBENCH_JACOBI_1D-Seq.cpp + polybench/POLYBENCH_JACOBI_1D-OMPTarget.cpp + polybench/POLYBENCH_JACOBI_2D.cpp + polybench/POLYBENCH_JACOBI_2D-Seq.cpp + polybench/POLYBENCH_JACOBI_2D-OMPTarget.cpp + polybench/POLYBENCH_MVT.cpp + polybench/POLYBENCH_MVT-Seq.cpp + polybench/POLYBENCH_MVT-OMPTarget.cpp stream/ADD.cpp stream/ADD-Seq.cpp stream/ADD-OMPTarget.cpp @@ -254,17 +233,10 @@ blt_add_executable( stream/TRIAD.cpp stream/TRIAD-Seq.cpp stream/TRIAD-OMPTarget.cpp - stream-kokkos/ADD-Kokkos.cpp - stream-kokkos/COPY-Kokkos.cpp - stream-kokkos/DOT-Kokkos.cpp - stream-kokkos/MUL-Kokkos.cpp - stream-kokkos/TRIAD-Kokkos.cpp algorithm/SORT.cpp algorithm/SORT-Seq.cpp algorithm/SORTPAIRS.cpp algorithm/SORTPAIRS-Seq.cpp - algorithm-kokkos/SORT-Kokkos.cpp - algorithm-kokkos/SORTPAIRS-Kokkos.cpp common/DataUtils.cpp common/Executor.cpp common/KernelBase.cpp @@ -272,15 +244,71 @@ blt_add_executable( common/RAJAPerfSuite.cpp common/RPTypes.hpp common/RunParams.cpp +) +if(ENABLE_KOKKOS) +list(APPEND RAJA_PERF_OMP_SOURCES + #Kokkos translations + 
apps-kokkos/DEL_DOT_VEC_2D-Kokkos.cpp + apps-kokkos/ENERGY-Kokkos.cpp + apps-kokkos/FIR-Kokkos.cpp + apps-kokkos/HALOEXCHANGE-Kokkos.cpp + apps-kokkos/PRESSURE-Kokkos.cpp + apps-kokkos/LTIMES-Kokkos.cpp + apps-kokkos/LTIMES_NOVIEW-Kokkos.cpp + apps-kokkos/VOL3D-Kokkos.cpp + basic-kokkos/PI_ATOMIC-Kokkos.cpp + basic-kokkos/DAXPY-Kokkos.cpp + basic-kokkos/IF_QUAD-Kokkos.cpp + basic-kokkos/INIT3-Kokkos.cpp + basic-kokkos/INIT_VIEW1D-Kokkos.cpp + basic-kokkos/INIT_VIEW1D_OFFSET-Kokkos.cpp + basic-kokkos/MULADDSUB-Kokkos.cpp + basic-kokkos/NESTED_INIT-Kokkos.cpp + basic-kokkos/REDUCE3_INT-Kokkos.cpp + basic-kokkos/TRAP_INT-Kokkos.cpp + lcals-kokkos/DIFF_PREDICT-Kokkos.cpp + lcals-kokkos/EOS-Kokkos.cpp + lcals-kokkos/FIRST_DIFF-Kokkos.cpp + lcals-kokkos/FIRST_MIN-Kokkos.cpp + lcals-kokkos/FIRST_SUM-Kokkos.cpp + lcals-kokkos/GEN_LIN_RECUR-Kokkos.cpp + lcals-kokkos/HYDRO_1D-Kokkos.cpp + lcals-kokkos/HYDRO_2D-Kokkos.cpp + lcals-kokkos/INT_PREDICT-Kokkos.cpp + lcals-kokkos/PLANCKIAN-Kokkos.cpp + lcals-kokkos/TRIDIAG_ELIM-Kokkos.cpp + stream-kokkos/ADD-Kokkos.cpp + stream-kokkos/COPY-Kokkos.cpp + stream-kokkos/DOT-Kokkos.cpp + stream-kokkos/MUL-Kokkos.cpp + algorithm-kokkos/SORT-Kokkos.cpp + algorithm-kokkos/SORTPAIRS-Kokkos.cpp stream-kokkos/TRIAD-Kokkos.cpp + #Stub implementations for polybench-kokkos + polybench-kokkos/POLYBENCH_2MM-Seq.cpp + polybench-kokkos/POLYBENCH_3MM-Seq.cpp + polybench-kokkos/POLYBENCH_ADI-Seq.cpp + polybench-kokkos/POLYBENCH_ATAX-Seq.cpp + polybench-kokkos/POLYBENCH_FDTD_2D-Seq.cpp + polybench-kokkos/POLYBENCH_FLOYD_WARSHALL-Seq.cpp + polybench-kokkos/POLYBENCH_GEMM-Seq.cpp + polybench-kokkos/POLYBENCH_GEMVER-Seq.cpp + polybench-kokkos/POLYBENCH_GESUMMV-Seq.cpp + polybench-kokkos/POLYBENCH_HEAT_3D-Seq.cpp + polybench-kokkos/POLYBENCH_JACOBI_1D-Seq.cpp + polybench-kokkos/POLYBENCH_JACOBI_2D-Seq.cpp + polybench-kokkos/POLYBENCH_MVT-Seq.cpp +) +endif() #ENABLE_KOKKOS +blt_add_executable( + NAME raja-perf-omptarget.exe + SOURCES ${RAJA_PERF_OMP_SOURCES}
DEPENDS_ON ${RAJA_PERFSUITE_DEPENDS} ) +else() #ENABLE_TARGET_OPENMP -else() - if(NOT INFRASTRUCTURE_ONLY) blt_add_executable( NAME raja-perf.exe SOURCES RAJAPerfSuiteDriver.cpp DEPENDS_ON ${RAJA_PERFSUITE_EXECUTABLE_DEPENDS} ) - endif() -endif() +endif() # NOT INFRASTRUCTURE_ONLY diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index b33db8578..a58de4240 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -13,8 +13,8 @@ //------------------------------------------------------------------------------ int main( int argc, char** argv ) { - // STEP 1: Create suite executor object - //rajaperf::Executor executor(argc, argv); + // Create suite executor object with the arguments that were passed in + // rajaperf::Executor executor(argc, argv); #if defined(RUN_KOKKOS) Kokkos::initialize(argc, argv); @@ -22,44 +22,24 @@ int main( int argc, char** argv ) rajaperf::Executor executor(argc, argv); rajaperf::make_perfsuite_executor(&executor, argc, argv); - //executor.registerKernel - //rajaperf::RunParams params(argc, argv); - //executor.registerGroup("Sparse"); - - //executor.registerKernel("Sparse", rajaperf::make_kernel_base( - // "Sparse_SPMV", params, [&](const int repfact, const int size){ - // }, - // [&] (const int repfact, const int size) {} - // )); - // executor.registerKernel("Sparse", rajaperf::make_kernel_base( - // "Sparse_SPMM", params, [&](const int repfact, const int size){ - // return std::make_tuple(1); - // }, - // [&] (const int repfact, const int size, auto matrix) { - // // do the math using Kokkos Kernels operators - // } - // )); - - // STEP 2: Assemble kernels and variants to run + + // Assemble kernels and variants to run executor.setupSuite(); - // STEP 3: Report suite run summary + // Report suite run summary // (enable users to catch errors before entire suite is run) executor.reportRunSummary(std::cout); - // STEP 4: Execute suite + // Execute suite of selected tests executor.runSuite(); - // STEP 
5: Generate suite execution reports + // Generate suite execution reports executor.outputRunData(); - // Pre-processor directives - #if defined(RUN_KOKKOS) Kokkos::finalize(); // TODO DZP: should this be here? Good question. AJP #endif - std::cout << "\n\nDONE!!!...." << std::endl; return 0; diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index a82bed339..21f7ce5f3 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -75,6 +75,6 @@ blt_add_library( VOL3D-Cuda.cpp VOL3D-OMP.cpp VOL3D-OMPTarget.cpp - WIP-COUPLE.cpp + #WIP-COUPLE.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index d7a8e3d09..e536c07a1 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -20,175 +20,174 @@ namespace rajaperf namespace apps { -// -//COUPLE::COUPLE(const RunParams& params) -// : KernelBase(rajaperf::Apps_COUPLE, params) -//{ -// -// setDefaultSize(64); // See rzmax in ADomain struct -// setDefaultReps(60); -// -// m_domain = new ADomain(getRunSize(), /* ndims = */ 3); -// -// m_imin = m_domain->imin; -// m_imax = m_domain->imax; -// m_jmin = m_domain->jmin; -// m_jmax = m_domain->jmax; -// m_kmin = m_domain->kmin; -// m_kmax = m_domain->kmax; -//} -// +COUPLE::COUPLE(const RunParams& params) + : KernelBase(rajaperf::Apps_COUPLE, params) +{ + + setDefaultSize(64); // See rzmax in ADomain struct + setDefaultReps(60); -//COUPLE::~COUPLE() -//{ -// delete m_domain; -//} -// -//Index_type COUPLE::getItsPerRep() const -//{ -// return ( (m_imax - m_imin) * (m_jmax - m_jmin) * (m_kmax - m_kmin) ); -//} -// -//void COUPLE::setUp(VariantID vid) -//{ -// Index_type max_loop_index = m_domain->lrn; -// -// allocAndInitData(m_t0, max_loop_index, vid); -// allocAndInitData(m_t1, max_loop_index, vid); -// allocAndInitData(m_t2, max_loop_index, vid); -// allocAndInitData(m_denac, max_loop_index, vid); -// allocAndInitData(m_denlw, max_loop_index, vid); -// -// m_clight = 3.e+10; -// 
m_csound = 3.09e+7; -// m_omega0 = 0.9; -// m_omegar = 0.9; -// m_dt = 0.208; -// m_c10 = 0.25 * (m_clight / m_csound); -// m_fratio = sqrt(m_omegar / m_omega0); -// m_r_fratio = 1.0/m_fratio; -// m_c20 = 0.25 * (m_clight / m_csound) * m_r_fratio; -// m_ireal = Complex_type(0.0, 1.0); -//} -// -//void COUPLE::runKernel(VariantID vid) -//{ -// const Index_type run_reps = getRunReps(); -// -// COUPLE_DATA_SETUP; -// -// switch ( vid ) { -// -// case Base_Seq : { -// -// startTimer(); -// for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -// -// for (Index_type k = kmin ; k < kmax ; ++k ) { -// COUPLE_BODY; -// } -// -// } -// stopTimer(); -// -// break; -// } -// -//#if defined(RUN_RAJA_SEQ) -// case RAJA_Seq : { -// -// startTimer(); -// for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -// -// RAJA::forall( -// RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { -// COUPLE_BODY; -// }); -// -// } -// stopTimer(); -// -// break; -// } -//#endif -// -//#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) -// case Base_OpenMP : { -// -// startTimer(); -// for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -// -// #pragma omp parallel for -// for (Index_type k = kmin ; k < kmax ; ++k ) { -// COUPLE_BODY; -// } -// -// } -// stopTimer(); -// break; -// } -// -// case RAJA_OpenMP : { -// -// startTimer(); -// for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -// -// RAJA::forall( -// RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { -// COUPLE_BODY; -// }); -// -// } -// stopTimer(); -// -// break; -// } -//#endif -// -//#if defined(RAJA_ENABLE_TARGET_OPENMP) && 0 -// case Base_OpenMPTarget : -// case RAJA_OpenMPTarget : -// { -// runOpenMPTargetVariant(vid); -// break; -// } -//#endif -// -//#if defined(RAJA_ENABLE_CUDA) && 0 -// case Base_CUDA : -// case RAJA_CUDA : -// { -// runCudaVariant(vid); -// break; -// } -//#endif -// -// default : { -// std::cout << "\n COUPLE : Unknown variant id = " << vid << std::endl; -// } -// -// } -//} -// 
-//void COUPLE::updateChecksum(VariantID vid) -//{ -// Index_type max_loop_index = m_domain->lrn; -// -// checksum[vid] += calcChecksum(m_t0, max_loop_index); -// checksum[vid] += calcChecksum(m_t1, max_loop_index); -// checksum[vid] += calcChecksum(m_t2, max_loop_index); -//} -// -//void COUPLE::tearDown(VariantID vid) -//{ -// (void) vid; -// -// deallocData(m_t0); -// deallocData(m_t1); -// deallocData(m_t2); -// deallocData(m_denac); -// deallocData(m_denlw); -//} + m_domain = new ADomain(getRunSize(), /* ndims = */ 3); + + m_imin = m_domain->imin; + m_imax = m_domain->imax; + m_jmin = m_domain->jmin; + m_jmax = m_domain->jmax; + m_kmin = m_domain->kmin; + m_kmax = m_domain->kmax; +} + + +COUPLE::~COUPLE() +{ + delete m_domain; +} + +Index_type COUPLE::getItsPerRep() const +{ + return ( (m_imax - m_imin) * (m_jmax - m_jmin) * (m_kmax - m_kmin) ); +} + +void COUPLE::setUp(VariantID vid) +{ + Index_type max_loop_index = m_domain->lrn; + + allocAndInitData(m_t0, max_loop_index, vid); + allocAndInitData(m_t1, max_loop_index, vid); + allocAndInitData(m_t2, max_loop_index, vid); + allocAndInitData(m_denac, max_loop_index, vid); + allocAndInitData(m_denlw, max_loop_index, vid); + + m_clight = 3.e+10; + m_csound = 3.09e+7; + m_omega0 = 0.9; + m_omegar = 0.9; + m_dt = 0.208; + m_c10 = 0.25 * (m_clight / m_csound); + m_fratio = sqrt(m_omegar / m_omega0); + m_r_fratio = 1.0/m_fratio; + m_c20 = 0.25 * (m_clight / m_csound) * m_r_fratio; + m_ireal = Complex_type(0.0, 1.0); +} + +void COUPLE::runKernel(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + COUPLE_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type k = kmin ; k < kmax ; ++k ) { + COUPLE_BODY; + } + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(kmin, 
kmax), [=](Index_type k) { + COUPLE_BODY; + }); + + } + stopTimer(); + + break; + } +#endif + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + case Base_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type k = kmin ; k < kmax ; ++k ) { + COUPLE_BODY; + } + + } + stopTimer(); + break; + } + + case RAJA_OpenMP : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(kmin, kmax), [=](Index_type k) { + COUPLE_BODY; + }); + + } + stopTimer(); + + break; + } +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) && 0 + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); + break; + } +#endif + +#if defined(RAJA_ENABLE_CUDA) && 0 + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); + break; + } +#endif + + default : { + std::cout << "\n COUPLE : Unknown variant id = " << vid << std::endl; + } + + } +} + +void COUPLE::updateChecksum(VariantID vid) +{ + Index_type max_loop_index = m_domain->lrn; + + checksum[vid] += calcChecksum(m_t0, max_loop_index); + checksum[vid] += calcChecksum(m_t1, max_loop_index); + checksum[vid] += calcChecksum(m_t2, max_loop_index); +} + +void COUPLE::tearDown(VariantID vid) +{ + (void) vid; + + deallocData(m_t0); + deallocData(m_t1); + deallocData(m_t2); + deallocData(m_denac); + deallocData(m_denlw); +} } // end namespace apps } // end namespace rajaperf diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index d18bd4c10..76e50aafc 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -12,8 +12,8 @@ #include "common/KernelBase.hpp" #include "common/OutputUtils.hpp" -// Warmup kernels will be run if not in a RAJAPerf Suite infrastructure build -// Warm up runs reduce startup overheads +// Warmup kernels will be run if NOT in a RAJAPerf Suite infrastructure build +// The purpose of warm-up runs is to reduce startup overheads // This
overhead should not be reflected in perf testing timing #ifndef RAJAPERF_INFRASTRUCTURE_ONLY #include "basic/DAXPY.hpp" @@ -21,7 +21,6 @@ #include "algorithm/SORT.hpp" #endif - #include #include #include @@ -104,7 +103,7 @@ void Executor::setupSuite() // If group name(s) found in the list of exclude_kern_names, assemble kernels in group(s) - // to run and remove the identified group name(s) from exclude_kern_names list. + // to run, and remove the identified group name(s) from exclude_kern_names list. for (size_t ig = 0; ig < groups2exclude.size(); ++ig) { const string& gname(groups2exclude[ig]); @@ -166,8 +165,9 @@ void Executor::setupSuite() run_params.setInvalidExcludeFeatureInput(invalid); // + // Kokkos TODO: Ask David Beckingsale & Rich Hornung (LLNL) if this is correct: // If feature input is valid, determine which kernels to use - // Input-specified features and add to set of kernels to run. + // input-specified features, and add to set of kernels to run. // if ( run_params.getInvalidExcludeFeatureInput().empty() ) { @@ -221,10 +221,8 @@ void Executor::setupSuite() // Look for kernels using features if such input provided if ( !feature_input.empty() ) { - +// Ask David Beckingsale & Rich H. what to do here // Kokkos Design: -// AJP left some of the extensive commented code, because RAJA & Kokkos developers may -// want to use / fix these blocs for integrated use with Kokkos // FEATURE DOES NOT YET WORK WITH KOKKOS /** TODO: Kokkos, reimplement! 
Svector invalid; @@ -373,8 +371,8 @@ void Executor::setupSuite() } - // Declare and set exclude_variant_names (from run parameter inputs), a - // vector of strings + // Declare and set exclude_variant_names (from run parameter inputs) + // const Svector& exclude_variant_names = run_params.getExcludeVariantInput(); VIDset exclude_var; diff --git a/src/common/PerfsuiteKernelDefinitions.cpp b/src/common/PerfsuiteKernelDefinitions.cpp index 2180294ed..1f97494d2 100644 --- a/src/common/PerfsuiteKernelDefinitions.cpp +++ b/src/common/PerfsuiteKernelDefinitions.cpp @@ -68,7 +68,7 @@ // // Apps kernels... // -#include "apps/WIP-COUPLE.hpp" +//#include "apps/WIP-COUPLE.hpp" #include "apps/DEL_DOT_VEC_2D.hpp" #include "apps/DIFFUSION3DPA.hpp" #include "apps/ENERGY.hpp" @@ -93,6 +93,7 @@ namespace rajaperf { void make_perfsuite_executor(rajaperf::Executor *exec, int argc, char *argv[]) { RunParams run_params(argc, argv); + free_register_group(exec, std::string("Basic")); free_register_group(exec, std::string("Lcals")); free_register_group(exec, std::string("Polybench")); @@ -125,10 +126,11 @@ void make_perfsuite_executor(rajaperf::Executor *exec, int argc, char *argv[]) { free_register_kernel(exec, "Lcals", new lcals::INT_PREDICT(run_params)); free_register_kernel(exec, "Lcals", new lcals::PLANCKIAN(run_params)); free_register_kernel(exec, "Lcals", new lcals::TRIDIAG_ELIM(run_params)); -/* - // Uncomment these lines once Kokkos translations for the polybench kernel - // group have been made + + // Nota bene: No Kokkos translations of polybench yet, + // only stub implementations // Polybench + free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_2MM(run_params)); free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_3MM(run_params)); free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_ADI(run_params)); @@ -142,7 +144,7 @@ void make_perfsuite_executor(rajaperf::Executor *exec, int argc, char *argv[]) { free_register_kernel(exec, "Polybench", 
new polybench::POLYBENCH_JACOBI_1D(run_params)); free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_JACOBI_2D(run_params)); free_register_kernel(exec, "Polybench", new polybench::POLYBENCH_MVT(run_params)); -*/ + // Stream free_register_kernel(exec, "Stream", new stream::ADD(run_params)); free_register_kernel(exec, "Stream", new stream::COPY(run_params)); diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index deaf07f7e..baed495b2 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -90,20 +90,20 @@ namespace rajaperf { std::string("Lcals_TRIDIAG_ELIM"), // //// Polybench kernels... -//// Uncomment once Kokkos variants have been created -// std::string("Polybench_2MM"), -// std::string("Polybench_3MM"), -// std::string("Polybench_ADI"), -// std::string("Polybench_ATAX"), -// std::string("Polybench_FDTD_2D"), -// std::string("Polybench_FLOYD_WARSHALL"), -// std::string("Polybench_GEMM"), -// std::string("Polybench_GEMVER"), -// std::string("Polybench_GESUMMV"), -// std::string("Polybench_HEAT_3D"), -// std::string("Polybench_JACOBI_1D"), -// std::string("Polybench_JACOBI_2D"), -// std::string("Polybench_MVT"), +//// Nota bene: Kokkos variants have not yet been created + std::string("Polybench_2MM"), + std::string("Polybench_3MM"), + std::string("Polybench_ADI"), + std::string("Polybench_ATAX"), + std::string("Polybench_FDTD_2D"), + std::string("Polybench_FLOYD_WARSHALL"), + std::string("Polybench_GEMM"), + std::string("Polybench_GEMVER"), + std::string("Polybench_GESUMMV"), + std::string("Polybench_HEAT_3D"), + std::string("Polybench_JACOBI_1D"), + std::string("Polybench_JACOBI_2D"), + std::string("Polybench_MVT"), // //// //// Stream kernels... @@ -116,7 +116,7 @@ namespace rajaperf { // // Apps kernels... 
// -// std::string("Apps_COUPLE"), + //std::string("Apps_COUPLE"), std::string("Apps_DEL_DOT_VEC_2D"), std::string("Apps_DIFFUSION3DPA"), std::string("Apps_ENERGY"), diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 1ab4d83f1..c0a47515f 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -46,7 +46,9 @@ struct PointerOfNdimensions { typename PointerOfNdimensions::type *; }; -// This templated function is used to wrap pointers (declared and defined in RAJAPerf Suite kernels) in Kokkos Views +// This templated function is used to wrap pointers +// (declared and defined in RAJAPerf Suite kernels) in Kokkos Views +// template auto getViewFromPointer(PointedAt *kokkos_ptr, Boundaries... boundaries) -> typename Kokkos::View< @@ -63,44 +65,22 @@ auto getViewFromPointer(PointedAt *kokkos_ptr, Boundaries... boundaries) typename PointerOfNdimensions::type, typename Kokkos::DefaultExecutionSpace::memory_space>; - // Nota bene: When copying data, we can either change the Layout or the memory_space - // (host or device), but we cannot change both! - // Here, we are mirroring data on the (CPU) host TO the (GPU) device, i.e., Layout is - // as if on the device, but the data actually reside on the host. The host - // mirror will be Layout Left (optimal for the device, but not the host). using mirror_view_type = typename device_view_type::HostMirror; - // Assignment statement: we are constructing a host_view_type called - // pointer_holder. The value of kokkos_ptr is the Kokkos View-wrapped pointer - // on the Host (CPU), and the Boundaries parameter pack values, boundaries (i.e., array boundaries) will also - // be part of this this host_view_type object. 
host_view_type pointer_holder(kokkos_ptr, boundaries...); // The boundaries parameter pack contains the array dimenions; - // an allocation is implicitly made here + // An allocation is implicitly made here device_view_type device_data_copy("StringName", boundaries...); mirror_view_type cpu_to_gpu_mirror = Kokkos::create_mirror_view(device_data_copy); - // deep_copy our existing data, the contents of - // pointer_holder, into the mirror_view; - // Copying from Host to Device has two steps: - // 1) Change the layout to enable sending data from CPU to GPU - // 2) Change the memory_space (host or device) to send the optimal data - // layout to the GPU. - // This step changes the array layout to be optimal for the gpu, i.e., - // LayoutLeft. Kokkos::deep_copy(cpu_to_gpu_mirror, pointer_holder); - // The mirror view data layout on the HOST is like the layout for the GPU. - // GPU-optimized layouts are LayoutLeft, i.e., column-major This deep_copy - // copy GPU-layout data on the HOST to the Device - - // Actual copying of the data from the host to the gpu Kokkos::deep_copy(device_data_copy, cpu_to_gpu_mirror); // Kokkos::View return type @@ -109,7 +89,7 @@ auto getViewFromPointer(PointedAt *kokkos_ptr, Boundaries... boundaries) } // This function will move data in a Kokkos::View back to host from device, -// and will store in the existing pointer(s) +// and will be stored in the existing pointer(s) template void moveDataToHostFromKokkosView(PointedAt *kokkos_ptr, ExistingView my_view, Boundaries... boundaries) @@ -125,10 +105,6 @@ void moveDataToHostFromKokkosView(PointedAt *kokkos_ptr, ExistingView my_view, using mirror_view_type = typename device_view_type::HostMirror; - // Constructing a host_view_type with the name - // pointer_holder. The contents/value of kokkos_ptr is the pointer we're wrapping on - // the Host, and the Boundaries parameter pack values, boundaries, will also - // be part of this this host_view_type object. 
host_view_type pointer_holder(kokkos_ptr, boundaries...); @@ -223,21 +199,20 @@ enum KernelID { // // Polybench kernels... -// These will be uncommented once Kokkos translations for these kernels exist // -// Polybench_2MM, -// Polybench_3MM, -// Polybench_ADI, -// Polybench_ATAX, -// Polybench_FDTD_2D, -// Polybench_FLOYD_WARSHALL, -// Polybench_GEMM, -// Polybench_GEMVER, -// Polybench_GESUMMV, -// Polybench_HEAT_3D, -// Polybench_JACOBI_1D, -// Polybench_JACOBI_2D, -// Polybench_MVT, + Polybench_2MM, + Polybench_3MM, + Polybench_ADI, + Polybench_ATAX, + Polybench_FDTD_2D, + Polybench_FLOYD_WARSHALL, + Polybench_GEMM, + Polybench_GEMVER, + Polybench_GESUMMV, + Polybench_HEAT_3D, + Polybench_JACOBI_1D, + Polybench_JACOBI_2D, + Polybench_MVT, // Stream kernels... diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index c12b069c1..7f0388b9e 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -12,7 +12,6 @@ #ifndef RAJAPerf_RPTypes_HPP #define RAJAPerf_RPTypes_HPP // This macro, RAJAPERF_INFRASTRUCTURE_ONLY, is for Kokkos and Kokkos Kernels -// -based performance testing #ifndef RAJAPERF_INFRASTRUCTURE_ONLY #include "RAJA/util/types.hpp" #endif diff --git a/src/polybench-kokkos/CMakeLists.txt b/src/polybench-kokkos/CMakeLists.txt new file mode 100644 index 000000000..68c53dde2 --- /dev/null +++ b/src/polybench-kokkos/CMakeLists.txt @@ -0,0 +1,27 @@ +############################################################################### +# Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +# and RAJA Performance Suite project contributors. +# See the RAJAPerf/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/../polybench) + +blt_add_library( + NAME polybench-kokkos + SOURCES POLYBENCH_2MM-Kokkos.cpp + POLYBENCH_3MM-Kokkos.cpp + POLYBENCH_ADI-Kokkos.cpp + POLYBENCH_ATAX-Kokkos.cpp + POLYBENCH_FDTD_2D-Kokkos.cpp + POLYBENCH_FLOYD_WARSHALL-Kokkos.cpp + POLYBENCH_GEMM-Kokkos.cpp + POLYBENCH_GEMVER-Kokkos.cpp + POLYBENCH_GESUMMV-Kokkos.cpp + POLYBENCH_HEAT_3D-Kokkos.cpp + POLYBENCH_JACOBI_1D-Kokkos.cpp + POLYBENCH_JACOBI_2D-Kokkos.cpp + POLYBENCH_MVT-Kokkos.cpp + DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} + ) diff --git a/src/polybench-kokkos/POLYBENCH_2MM-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_2MM-Kokkos.cpp new file mode 100644 index 000000000..ad97430b8 --- /dev/null +++ b/src/polybench-kokkos/POLYBENCH_2MM-Kokkos.cpp @@ -0,0 +1,196 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_2MM.hpp" + +#include "RAJA/RAJA.hpp" + +#include + + +namespace rajaperf +{ +namespace polybench +{ + +void POLYBENCH_2MM::runKokkosVariant(VariantID vid) +{ + + // Kokkos stub + return; + + + const Index_type run_reps= getRunReps(); + + POLYBENCH_2MM_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = 0; i < ni; i++ ) { + for (Index_type j = 0; j < nj; j++) { + POLYBENCH_2MM_BODY1; + for (Index_type k = 0; k < nk; k++) { + POLYBENCH_2MM_BODY2; + } + POLYBENCH_2MM_BODY3; + } + } + + for (Index_type i = 0; i < ni; i++) { + for (Index_type l = 0; l < nl; l++) { + POLYBENCH_2MM_BODY4; + for (Index_type j = 0; j < nj; j++) { + POLYBENCH_2MM_BODY5; + } + POLYBENCH_2MM_BODY6; + } + } + + } + stopTimer(); + + break; + } + + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto poly_2mm_base_lam2 = [=](Index_type i, Index_type j, + Index_type k, Real_type &dot) { + POLYBENCH_2MM_BODY2; + }; + auto poly_2mm_base_lam3 = [=](Index_type i, Index_type j, + Real_type &dot) { + POLYBENCH_2MM_BODY3; + }; + auto poly_2mm_base_lam5 = [=](Index_type i, Index_type l, + Index_type j, Real_type &dot) { + POLYBENCH_2MM_BODY5; + }; + auto poly_2mm_base_lam6 = [=](Index_type i, Index_type l, + Real_type &dot) { + POLYBENCH_2MM_BODY6; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = 0; i < ni; i++ ) { + for(Index_type j = 0; j < nj; j++) { + POLYBENCH_2MM_BODY1; + for (Index_type k = 0; k < nk; k++) { + poly_2mm_base_lam2(i, j, k, dot); + } + poly_2mm_base_lam3(i, j, dot); + } + } + + for(Index_type i = 0; i < ni; i++) { + for(Index_type l = 0; l < nl; l++) { + POLYBENCH_2MM_BODY4; + for (Index_type j = 0; j < nj; j++) { + poly_2mm_base_lam5(i, l, j, dot); + } + 
poly_2mm_base_lam6(i, l, dot); + } + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + POLYBENCH_2MM_VIEWS_RAJA; + + auto poly_2mm_lam1 = [=](Real_type &dot) { + POLYBENCH_2MM_BODY1_RAJA; + }; + auto poly_2mm_lam2 = [=](Index_type i, Index_type j, Index_type k, + Real_type &dot) { + POLYBENCH_2MM_BODY2_RAJA; + }; + auto poly_2mm_lam3 = [=](Index_type i, Index_type j, + Real_type &dot) { + POLYBENCH_2MM_BODY3_RAJA; + }; + auto poly_2mm_lam4 = [=](Real_type &dot) { + POLYBENCH_2MM_BODY4_RAJA; + }; + auto poly_2mm_lam5 = [=](Index_type i, Index_type l, Index_type j, + Real_type &dot) { + POLYBENCH_2MM_BODY5_RAJA; + }; + auto poly_2mm_lam6 = [=](Index_type i, Index_type l, + Real_type &dot) { + POLYBENCH_2MM_BODY6_RAJA; + }; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::Lambda<0, RAJA::Params<0>>, + RAJA::statement::For<2, RAJA::loop_exec, + RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>> + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::RangeSegment{0, ni}, + RAJA::RangeSegment{0, nj}, + RAJA::RangeSegment{0, nk}), + RAJA::tuple{0.0}, + + poly_2mm_lam1, + poly_2mm_lam2, + poly_2mm_lam3 + ); + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::RangeSegment{0, ni}, + RAJA::RangeSegment{0, nl}, + RAJA::RangeSegment{0, nj}), + RAJA::tuple{0.0}, + + poly_2mm_lam4, + poly_2mm_lam5, + poly_2mm_lam6 + ); + + } + stopTimer(); + break; + } + +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace polybench +} // end namespace rajaperf diff --git a/src/polybench-kokkos/POLYBENCH_3MM-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_3MM-Kokkos.cpp new file mode 100644 index 000000000..170e442fc --- /dev/null 
+++ b/src/polybench-kokkos/POLYBENCH_3MM-Kokkos.cpp
@@ -0,0 +1,249 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_3MM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+#include <cstring>
+
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+
+void POLYBENCH_3MM::runKokkosVariant(VariantID vid)
+{
+
+  // Kokkos stub: no Kokkos port yet -- return before any timing or kernel work.
+  return; // NOTE(review): the Seq reference variants below are unreachable.
+
+  const Index_type run_reps = getRunReps();
+
+  POLYBENCH_3MM_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = 0; i < ni; i++ ) {
+          for (Index_type j = 0; j < nj; j++) {
+            POLYBENCH_3MM_BODY1;
+            for (Index_type k = 0; k < nk; k++) {
+              POLYBENCH_3MM_BODY2;
+            }
+            POLYBENCH_3MM_BODY3;
+          }
+        }
+
+        for (Index_type j = 0; j < nj; j++) {
+          for (Index_type l = 0; l < nl; l++) {
+            POLYBENCH_3MM_BODY4;
+            for (Index_type m = 0; m < nm; m++) {
+              POLYBENCH_3MM_BODY5;
+            }
+            POLYBENCH_3MM_BODY6;
+          }
+        }
+
+        for (Index_type i = 0; i < ni; i++) {
+          for (Index_type l = 0; l < nl; l++) {
+            POLYBENCH_3MM_BODY7;
+            for (Index_type j = 0; j < nj; j++) {
+              POLYBENCH_3MM_BODY8;
+            }
+            POLYBENCH_3MM_BODY9;
+          }
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto poly_3mm_base_lam2 = [=] (Index_type i, Index_type j, Index_type k,
+                                     Real_type &dot) {
+                                  POLYBENCH_3MM_BODY2;
+                                };
+      auto poly_3mm_base_lam3 = [=] (Index_type i, Index_type j,
+                                     Real_type &dot) {
+                                  POLYBENCH_3MM_BODY3;
+                                };
+      auto poly_3mm_base_lam5 = [=] (Index_type j, Index_type l, Index_type m,
+                                     Real_type &dot) {
+                                  POLYBENCH_3MM_BODY5;
+                                };
+      auto poly_3mm_base_lam6 = [=] (Index_type j, Index_type l,
+                                     Real_type &dot) {
+                                  POLYBENCH_3MM_BODY6;
+                                };
+      auto poly_3mm_base_lam8 = [=] (Index_type i, Index_type l, Index_type j,
+                                     Real_type &dot) {
+                                  POLYBENCH_3MM_BODY8;
+                                };
+      auto poly_3mm_base_lam9 = [=] (Index_type i, Index_type l,
+                                     Real_type &dot) {
+                                  POLYBENCH_3MM_BODY9;
+                                };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = 0; i < ni; i++ ) {
+          for (Index_type j = 0; j < nj; j++) {
+            POLYBENCH_3MM_BODY1;
+            for (Index_type k = 0; k < nk; k++) {
+              poly_3mm_base_lam2(i, j, k, dot);
+            }
+            poly_3mm_base_lam3(i, j, dot);
+          }
+        }
+
+        for (Index_type j = 0; j < nj; j++) {
+          for (Index_type l = 0; l < nl; l++) {
+            POLYBENCH_3MM_BODY4;
+            for (Index_type m = 0; m < nm; m++) {
+              poly_3mm_base_lam5(j, l, m, dot);
+            }
+            poly_3mm_base_lam6(j, l, dot);
+          }
+        }
+
+        for (Index_type i = 0; i < ni; i++) {
+          for (Index_type l = 0; l < nl; l++) {
+            POLYBENCH_3MM_BODY7;
+            for (Index_type j = 0; j < nj; j++) {
+              poly_3mm_base_lam8(i, l, j, dot);
+            }
+            poly_3mm_base_lam9(i, l, dot);
+          }
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      POLYBENCH_3MM_VIEWS_RAJA;
+
+      auto poly_3mm_lam1 = [=] (Real_type &dot) {
+                                  POLYBENCH_3MM_BODY1_RAJA;
+                                };
+      auto poly_3mm_lam2 = [=] (Index_type i, Index_type j, Index_type k,
+                                Real_type &dot) {
+                                  POLYBENCH_3MM_BODY2_RAJA;
+                                };
+      auto poly_3mm_lam3 = [=] (Index_type i, Index_type j,
+                                Real_type &dot) {
+                                  POLYBENCH_3MM_BODY3_RAJA;
+                                };
+      auto poly_3mm_lam4 = [=] (Real_type &dot) {
+                                  POLYBENCH_3MM_BODY4_RAJA;
+                                };
+      auto poly_3mm_lam5 = [=] (Index_type j, Index_type l, Index_type m,
+                                Real_type &dot) {
+                                  POLYBENCH_3MM_BODY5_RAJA;
+                                };
+      auto poly_3mm_lam6 = [=] (Index_type j, Index_type l,
+                                Real_type &dot) {
+                                  POLYBENCH_3MM_BODY6_RAJA;
+                                };
+      auto poly_3mm_lam7 = [=] (Real_type &dot) {
+                                  POLYBENCH_3MM_BODY7_RAJA;
+                                };
+      auto poly_3mm_lam8 = [=] (Index_type i, Index_type l, Index_type j,
+                                Real_type &dot) {
+                                  POLYBENCH_3MM_BODY8_RAJA;
+                                };
+      auto poly_3mm_lam9 = [=] (Index_type i, Index_type l,
+                                Real_type &dot) {
+                                  POLYBENCH_3MM_BODY9_RAJA;
+                                };
+
+      using EXEC_POL =
+        RAJA::KernelPolicy<
+          RAJA::statement::For<0, RAJA::loop_exec,
+            RAJA::statement::For<1, RAJA::loop_exec,
+              RAJA::statement::Lambda<0, RAJA::Params<0>>,
+              RAJA::statement::For<2, RAJA::loop_exec,
+                RAJA::statement::Lambda<1, RAJA::Segs<0,1,2>, RAJA::Params<0>>
+              >,
+              RAJA::statement::Lambda<2, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >
+          >
+        >;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::kernel_param<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                           RAJA::RangeSegment{0, nj},
+                           RAJA::RangeSegment{0, nk}),
+          RAJA::tuple<Real_type>{0.0},
+
+          poly_3mm_lam1,
+          poly_3mm_lam2,
+          poly_3mm_lam3
+
+        );
+
+        RAJA::kernel_param<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{0, nj},
+                           RAJA::RangeSegment{0, nl},
+                           RAJA::RangeSegment{0, nm}),
+          RAJA::tuple<Real_type>{0.0},
+
+          poly_3mm_lam4,
+          poly_3mm_lam5,
+          poly_3mm_lam6
+
+        );
+
+        RAJA::kernel_param<EXEC_POL>(
+          RAJA::make_tuple(RAJA::RangeSegment{0, ni},
+                           RAJA::RangeSegment{0, nl},
+                           RAJA::RangeSegment{0, nj}),
+          RAJA::tuple<Real_type>{0.0},
+
+          poly_3mm_lam7,
+          poly_3mm_lam8,
+          poly_3mm_lam9
+
+        );
+
+      } // end run_reps
+      stopTimer();
+
+      break;
+    }
+#endif // RUN_RAJA_SEQ
+
+    default : {
+      std::cout << "\n POLYBENCH_3MM : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+}
+
+} // end namespace polybench
+} // end namespace rajaperf
diff --git a/src/polybench-kokkos/POLYBENCH_ADI-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_ADI-Kokkos.cpp
new file mode 100644
index 000000000..7056724ed
--- /dev/null
+++ b/src/polybench-kokkos/POLYBENCH_ADI-Kokkos.cpp
@@ -0,0 +1,222 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_ADI.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+#include <cstring>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+void POLYBENCH_ADI::runKokkosVariant(VariantID vid)
+{
+  // Kokkos stub: no Kokkos port yet -- return before any timing or kernel work.
+  return; // NOTE(review): the Seq reference variants below are unreachable.
+
+  const Index_type run_reps = getRunReps();
+
+  POLYBENCH_ADI_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type t = 1; t <= tsteps; ++t) {
+
+          for (Index_type i = 1; i < n-1; ++i) {
+            POLYBENCH_ADI_BODY2;
+            for (Index_type j = 1; j < n-1; ++j) {
+              POLYBENCH_ADI_BODY3;
+            }
+            POLYBENCH_ADI_BODY4;
+            for (Index_type k = n-2; k >= 1; --k) {
+              POLYBENCH_ADI_BODY5;
+            }
+          }
+
+          for (Index_type i = 1; i < n-1; ++i) {
+            POLYBENCH_ADI_BODY6;
+            for (Index_type j = 1; j < n-1; ++j) {
+              POLYBENCH_ADI_BODY7;
+            }
+            POLYBENCH_ADI_BODY8;
+            for (Index_type k = n-2; k >= 1; --k) {
+              POLYBENCH_ADI_BODY9;
+            }
+          }
+
+        } // tstep loop
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto poly_adi_base_lam2 = [=](Index_type i) {
+                                  POLYBENCH_ADI_BODY2;
+                                };
+      auto poly_adi_base_lam3 = [=](Index_type i, Index_type j) {
+                                  POLYBENCH_ADI_BODY3;
+                                };
+      auto poly_adi_base_lam4 = [=](Index_type i) {
+                                  POLYBENCH_ADI_BODY4;
+                                };
+      auto poly_adi_base_lam5 = [=](Index_type i, Index_type k) {
+                                  POLYBENCH_ADI_BODY5;
+                                };
+      auto poly_adi_base_lam6 = [=](Index_type i) {
+                                  POLYBENCH_ADI_BODY6;
+                                };
+      auto poly_adi_base_lam7 = [=](Index_type i, Index_type j) {
+                                  POLYBENCH_ADI_BODY7;
+                                };
+      auto poly_adi_base_lam8 = [=](Index_type i) {
+                                  POLYBENCH_ADI_BODY8;
+                                };
+      auto poly_adi_base_lam9 = [=](Index_type i, Index_type k) {
+                                  POLYBENCH_ADI_BODY9;
+                                };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type t = 1; t <= tsteps; ++t) {
+
+          for (Index_type i = 1; i < n-1; ++i) {
+            poly_adi_base_lam2(i);
+            for (Index_type j = 1; j < n-1; ++j) {
+              poly_adi_base_lam3(i, j);
+            }
+            poly_adi_base_lam4(i);
+            for (Index_type k = n-2; k >= 1; --k) {
+              poly_adi_base_lam5(i, k);
+            }
+          }
+
+          for (Index_type i = 1; i < n-1; ++i) {
+            poly_adi_base_lam6(i);
+            for (Index_type j = 1; j < n-1; ++j) {
+              poly_adi_base_lam7(i, j);
+            }
+            poly_adi_base_lam8(i);
+            for (Index_type k = n-2; k >= 1; --k) {
+              poly_adi_base_lam9(i, k);
+            }
+          }
+
+        } // tstep loop
+
+      } // run_reps
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      POLYBENCH_ADI_VIEWS_RAJA;
+
+      auto poly_adi_lam2 = [=](Index_type i) {
+                             POLYBENCH_ADI_BODY2_RAJA;
+                           };
+      auto poly_adi_lam3 = [=](Index_type i, Index_type j) {
+                             POLYBENCH_ADI_BODY3_RAJA;
+                           };
+      auto poly_adi_lam4 = [=](Index_type i) {
+                             POLYBENCH_ADI_BODY4_RAJA;
+                           };
+      auto poly_adi_lam5 = [=](Index_type i, Index_type k) {
+                             POLYBENCH_ADI_BODY5_RAJA;
+                           };
+      auto poly_adi_lam6 = [=](Index_type i) {
+                             POLYBENCH_ADI_BODY6_RAJA;
+                           };
+      auto poly_adi_lam7 = [=](Index_type i, Index_type j) {
+                             POLYBENCH_ADI_BODY7_RAJA;
+                           };
+      auto poly_adi_lam8 = [=](Index_type i) {
+                             POLYBENCH_ADI_BODY8_RAJA;
+                           };
+      auto poly_adi_lam9 = [=](Index_type i, Index_type k) {
+                             POLYBENCH_ADI_BODY9_RAJA;
+                           };
+
+      using EXEC_POL =
+        RAJA::KernelPolicy<
+          RAJA::statement::For<0, RAJA::loop_exec,
+            RAJA::statement::Lambda<0, RAJA::Segs<0>>,
+            RAJA::statement::For<1, RAJA::loop_exec,
+              RAJA::statement::Lambda<1, RAJA::Segs<0,1>>
+            >,
+            RAJA::statement::Lambda<2, RAJA::Segs<0>>,
+            RAJA::statement::For<2, RAJA::loop_exec,
+              RAJA::statement::Lambda<3, RAJA::Segs<0,2>>
+            >
+          >
+        >;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type t = 1; t <= tsteps; ++t) {
+
+          RAJA::kernel<EXEC_POL>(
+            RAJA::make_tuple(RAJA::RangeSegment{1, n-1},
+                             RAJA::RangeSegment{1, n-1},
+                             RAJA::RangeStrideSegment{n-2, 0, -1}),
+
+            poly_adi_lam2,
+            poly_adi_lam3,
+            poly_adi_lam4,
+            poly_adi_lam5
+
+          );
+
+          RAJA::kernel<EXEC_POL>(
+            RAJA::make_tuple(RAJA::RangeSegment{1, n-1},
+                             RAJA::RangeSegment{1, n-1},
+                             RAJA::RangeStrideSegment{n-2, 0, -1}),
+
+            poly_adi_lam6,
+            poly_adi_lam7,
+            poly_adi_lam8,
+            poly_adi_lam9
+
+          );
+
+        } // tstep loop
+
+      } // run_reps
+      stopTimer();
+
+      break;
+    }
+#endif // RUN_RAJA_SEQ
+
+    default : {
+      std::cout << "\nPOLYBENCH_ADI Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+}
+
+} // end namespace polybench
+} // end namespace rajaperf
diff --git a/src/polybench-kokkos/POLYBENCH_ATAX-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_ATAX-Kokkos.cpp
new file mode 100644
index 000000000..d2a99c3ad
--- /dev/null
+++ b/src/polybench-kokkos/POLYBENCH_ATAX-Kokkos.cpp
@@ -0,0 +1,193 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_ATAX.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+void POLYBENCH_ATAX::runKokkosVariant(VariantID vid)
+{
+
+  // Kokkos stub: no Kokkos port yet -- return before any timing or kernel work.
+  return; // NOTE(review): the Seq reference variants below are unreachable.
+
+  const Index_type run_reps= getRunReps();
+
+  POLYBENCH_ATAX_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = 0; i < N; ++i ) {
+          POLYBENCH_ATAX_BODY1;
+          for (Index_type j = 0; j < N; ++j ) {
+            POLYBENCH_ATAX_BODY2;
+          }
+          POLYBENCH_ATAX_BODY3;
+        }
+
+        for (Index_type j = 0; j < N; ++j ) {
+          POLYBENCH_ATAX_BODY4;
+          for (Index_type i = 0; i < N; ++i ) {
+            POLYBENCH_ATAX_BODY5;
+          }
+          POLYBENCH_ATAX_BODY6;
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto poly_atax_base_lam2 = [=] (Index_type i, Index_type j,
+                                      Real_type &dot) {
+                                   POLYBENCH_ATAX_BODY2;
+                                  };
+      auto poly_atax_base_lam3 = [=] (Index_type i,
+                                      Real_type &dot) {
+                                   POLYBENCH_ATAX_BODY3;
+                                  };
+      auto poly_atax_base_lam5 = [=] (Index_type i, Index_type j ,
+                                      Real_type &dot) {
+                                   POLYBENCH_ATAX_BODY5;
+                                  };
+      auto poly_atax_base_lam6 = [=] (Index_type j,
+                                      Real_type &dot) {
+                                   POLYBENCH_ATAX_BODY6;
+                                  };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = 0; i < N; ++i ) {
+          POLYBENCH_ATAX_BODY1;
+          for (Index_type j = 0; j < N; ++j ) {
+            poly_atax_base_lam2(i, j, dot);
+          }
+          poly_atax_base_lam3(i, dot);
+        }
+
+        for (Index_type j = 0; j < N; ++j ) {
+          POLYBENCH_ATAX_BODY4;
+          for (Index_type i = 0; i < N; ++i ) {
+            poly_atax_base_lam5(i, j, dot);
+          }
+          poly_atax_base_lam6(j, dot);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      POLYBENCH_ATAX_VIEWS_RAJA;
+
+      auto poly_atax_lam1 = [=] (Index_type i, Real_type &dot) {
+                                   POLYBENCH_ATAX_BODY1_RAJA;
+                                  };
+      auto poly_atax_lam2 = [=] (Index_type i, Index_type j, Real_type &dot) {
+                                   POLYBENCH_ATAX_BODY2_RAJA;
+                                  };
+      auto poly_atax_lam3 = [=] (Index_type i, Real_type &dot) {
+                                   POLYBENCH_ATAX_BODY3_RAJA;
+                                  };
+      auto poly_atax_lam4 = [=] (Index_type j, Real_type &dot) {
+                                   POLYBENCH_ATAX_BODY4_RAJA;
+                                  };
+      auto poly_atax_lam5 = [=] (Index_type i, Index_type j , Real_type &dot) {
+                                   POLYBENCH_ATAX_BODY5_RAJA;
+                                  };
+      auto poly_atax_lam6 = [=] (Index_type j, Real_type &dot) {
+                                   POLYBENCH_ATAX_BODY6_RAJA;
+                                  };
+
+      using EXEC_POL1 =
+        RAJA::KernelPolicy<
+          RAJA::statement::For<0, RAJA::loop_exec,
+            RAJA::statement::Lambda<0, RAJA::Segs<0>, RAJA::Params<0>>,
+            RAJA::statement::For<1, RAJA::loop_exec,
+              RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >,
+            RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>>
+          >
+        >;
+
+      using EXEC_POL2 =
+        RAJA::KernelPolicy<
+          RAJA::statement::For<1, RAJA::loop_exec,
+            RAJA::statement::Lambda<0, RAJA::Segs<1>, RAJA::Params<0>>,
+            RAJA::statement::For<0, RAJA::loop_exec,
+              RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >,
+            RAJA::statement::Lambda<2, RAJA::Segs<1>, RAJA::Params<0>>
+          >
+        >;
+
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::kernel_param<EXEC_POL1>(
+          RAJA::make_tuple(RAJA::RangeSegment{0, N},
+                           RAJA::RangeSegment{0, N}),
+          RAJA::tuple<Real_type>{0.0},
+
+          poly_atax_lam1,
+          poly_atax_lam2,
+          poly_atax_lam3
+
+        );
+
+        RAJA::kernel_param<EXEC_POL2>(
+          RAJA::make_tuple(RAJA::RangeSegment{0, N},
+                           RAJA::RangeSegment{0, N}),
+          RAJA::tuple<Real_type>{0.0},
+
+          poly_atax_lam4,
+          poly_atax_lam5,
+          poly_atax_lam6
+
+        );
+
+      }
+      stopTimer();
+
+      break;
+    }
+#endif // RUN_RAJA_SEQ
+
+    default : {
+      std::cout << "\n POLYBENCH_ATAX : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+}
+
+} // end namespace polybench
+} // end namespace rajaperf
diff --git a/src/polybench-kokkos/POLYBENCH_FDTD_2D-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_FDTD_2D-Kokkos.cpp
new file mode 100644
index 000000000..e296985cf
--- /dev/null
+++ b/src/polybench-kokkos/POLYBENCH_FDTD_2D-Kokkos.cpp
@@ -0,0 +1,199 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_FDTD_2D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+
+void POLYBENCH_FDTD_2D::runKokkosVariant(VariantID vid)
+{
+  // Kokkos stub: no Kokkos port yet -- return before any timing or kernel work.
+  return; // NOTE(review): the Seq reference variants below are unreachable.
+
+
+  const Index_type run_reps = getRunReps();
+
+  POLYBENCH_FDTD_2D_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (t = 0; t < tsteps; ++t) {
+
+          for (Index_type j = 0; j < ny; j++) {
+            POLYBENCH_FDTD_2D_BODY1;
+          }
+          for (Index_type i = 1; i < nx; i++) {
+            for (Index_type j = 0; j < ny; j++) {
+              POLYBENCH_FDTD_2D_BODY2;
+            }
+          }
+          for (Index_type i = 0; i < nx; i++) {
+            for (Index_type j = 1; j < ny; j++) {
+              POLYBENCH_FDTD_2D_BODY3;
+            }
+          }
+          for (Index_type i = 0; i < nx - 1; i++) {
+            for (Index_type j = 0; j < ny - 1; j++) {
+              POLYBENCH_FDTD_2D_BODY4;
+            }
+          }
+
+        } // tstep loop
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      //
+      // Note: first lambda must use capture by reference so that the
+      //       scalar variable 't' used in it is updated for each
+      //       t-loop iteration.
+      //
+      auto poly_fdtd2d_base_lam1 = [&](Index_type j) {
+                                     POLYBENCH_FDTD_2D_BODY1;
+                                   };
+      auto poly_fdtd2d_base_lam2 = [=](Index_type i, Index_type j) {
+                                     POLYBENCH_FDTD_2D_BODY2;
+                                   };
+      auto poly_fdtd2d_base_lam3 = [=](Index_type i, Index_type j) {
+                                     POLYBENCH_FDTD_2D_BODY3;
+                                   };
+      auto poly_fdtd2d_base_lam4 = [=](Index_type i, Index_type j) {
+                                     POLYBENCH_FDTD_2D_BODY4;
+                                   };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (t = 0; t < tsteps; ++t) {
+
+          for (Index_type j = 0; j < ny; j++) {
+            poly_fdtd2d_base_lam1(j);
+          }
+          for (Index_type i = 1; i < nx; i++) {
+            for (Index_type j = 0; j < ny; j++) {
+              poly_fdtd2d_base_lam2(i, j);
+            }
+          }
+          for (Index_type i = 0; i < nx; i++) {
+            for (Index_type j = 1; j < ny; j++) {
+              poly_fdtd2d_base_lam3(i, j);
+            }
+          }
+          for (Index_type i = 0; i < nx - 1; i++) {
+            for (Index_type j = 0; j < ny - 1; j++) {
+              poly_fdtd2d_base_lam4(i, j);
+            }
+          }
+
+        } // tstep loop
+
+      } // run_reps
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      POLYBENCH_FDTD_2D_VIEWS_RAJA;
+
+      //
+      // Note: first lambda must use capture by reference so that the
+      //       scalar variable 't' used in it is updated for each
+      //       t-loop iteration.
+      //
+      auto poly_fdtd2d_lam1 = [&](Index_type j) {
+                                POLYBENCH_FDTD_2D_BODY1_RAJA;
+                              };
+      auto poly_fdtd2d_lam2 = [=](Index_type i, Index_type j) {
+                                POLYBENCH_FDTD_2D_BODY2_RAJA;
+                              };
+      auto poly_fdtd2d_lam3 = [=](Index_type i, Index_type j) {
+                                POLYBENCH_FDTD_2D_BODY3_RAJA;
+                              };
+      auto poly_fdtd2d_lam4 = [=](Index_type i, Index_type j) {
+                                POLYBENCH_FDTD_2D_BODY4_RAJA;
+                              };
+
+      using EXEC_POL1 = RAJA::loop_exec;
+
+      using EXEC_POL234 =
+        RAJA::KernelPolicy<
+          RAJA::statement::For<0, RAJA::loop_exec,
+            RAJA::statement::For<1, RAJA::loop_exec,
+              RAJA::statement::Lambda<0>
+            >
+          >
+        >;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (t = 0; t < tsteps; ++t) {
+
+          RAJA::forall<EXEC_POL1>( RAJA::RangeSegment(0, ny),
+            poly_fdtd2d_lam1
+          );
+
+          RAJA::kernel<EXEC_POL234>(
+            RAJA::make_tuple(RAJA::RangeSegment{1, nx},
+                             RAJA::RangeSegment{0, ny}),
+            poly_fdtd2d_lam2
+          );
+
+          RAJA::kernel<EXEC_POL234>(
+            RAJA::make_tuple(RAJA::RangeSegment{0, nx},
+                             RAJA::RangeSegment{1, ny}),
+            poly_fdtd2d_lam3
+          );
+
+          RAJA::kernel<EXEC_POL234>(
+            RAJA::make_tuple(RAJA::RangeSegment{0, nx-1},
+                             RAJA::RangeSegment{0, ny-1}),
+            poly_fdtd2d_lam4
+          );
+
+        } // tstep loop
+
+      } // run_reps
+      stopTimer();
+
+      break;
+    }
+
+#endif // RUN_RAJA_SEQ
+
+    default : {
+      std::cout << "\nPOLYBENCH_FDTD_2D Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+}
+
+} // end namespace polybench
+} // end namespace rajaperf
diff --git a/src/polybench-kokkos/POLYBENCH_FLOYD_WARSHALL-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_FLOYD_WARSHALL-Kokkos.cpp
new file mode 100644
index 000000000..29450fb66
--- /dev/null
+++ b/src/polybench-kokkos/POLYBENCH_FLOYD_WARSHALL-Kokkos.cpp
@@ -0,0 +1,123 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_FLOYD_WARSHALL.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+
+void POLYBENCH_FLOYD_WARSHALL::runKokkosVariant(VariantID vid)
+{
+
+  // Kokkos stub: no Kokkos port yet -- return before any timing or kernel work.
+  return; // NOTE(review): the Seq reference variants below are unreachable.
+
+  const Index_type run_reps= getRunReps();
+
+  POLYBENCH_FLOYD_WARSHALL_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type k = 0; k < N; ++k) {
+          for (Index_type i = 0; i < N; ++i) {
+            for (Index_type j = 0; j < N; ++j) {
+              POLYBENCH_FLOYD_WARSHALL_BODY;
+            }
+          }
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto poly_floydwarshall_base_lam = [=](Index_type k, Index_type i,
+                                             Index_type j) {
+                                           POLYBENCH_FLOYD_WARSHALL_BODY;
+                                         };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type k = 0; k < N; ++k) {
+          for (Index_type i = 0; i < N; ++i) {
+            for (Index_type j = 0; j < N; ++j) {
+              poly_floydwarshall_base_lam(k, i, j);
+            }
+          }
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      POLYBENCH_FLOYD_WARSHALL_VIEWS_RAJA;
+
+      auto poly_floydwarshall_lam = [=](Index_type k, Index_type i,
+                                        Index_type j) {
+                                      POLYBENCH_FLOYD_WARSHALL_BODY_RAJA;
+                                    };
+
+      using EXEC_POL =
+        RAJA::KernelPolicy<
+          RAJA::statement::For<0, RAJA::loop_exec,
+            RAJA::statement::For<1, RAJA::loop_exec,
+              RAJA::statement::For<2, RAJA::loop_exec,
+                RAJA::statement::Lambda<0>
+              >
+            >
+          >
+        >;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::kernel<EXEC_POL>( RAJA::make_tuple(RAJA::RangeSegment{0, N},
+                                                 RAJA::RangeSegment{0, N},
+                                                 RAJA::RangeSegment{0, N}),
+          poly_floydwarshall_lam
+        );
+
+      }
+      stopTimer();
+
+      break;
+    }
+#endif // RUN_RAJA_SEQ
+
+    default : {
+      std::cout << "\n POLYBENCH_FLOYD_WARSHALL : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+}
+
+} // end namespace polybench
+} // end namespace rajaperf
diff --git a/src/polybench-kokkos/POLYBENCH_GEMM-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_GEMM-Kokkos.cpp
new file mode 100644
index 000000000..182f31448
--- /dev/null
+++ b/src/polybench-kokkos/POLYBENCH_GEMM-Kokkos.cpp
@@ -0,0 +1,157 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_GEMM.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+
+void POLYBENCH_GEMM::runKokkosVariant(VariantID vid)
+{
+
+  // Kokkos stub: no Kokkos port yet -- return before any timing or kernel work.
+  return; // NOTE(review): the Seq reference variants below are unreachable.
+
+  const Index_type run_reps= getRunReps();
+
+  POLYBENCH_GEMM_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = 0; i < ni; ++i ) {
+          for (Index_type j = 0; j < nj; ++j ) {
+            POLYBENCH_GEMM_BODY1;
+            POLYBENCH_GEMM_BODY2;
+            for (Index_type k = 0; k < nk; ++k ) {
+               POLYBENCH_GEMM_BODY3;
+            }
+            POLYBENCH_GEMM_BODY4;
+          }
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto poly_gemm_base_lam2 = [=](Index_type i, Index_type j) {
+                                   POLYBENCH_GEMM_BODY2;
+                                  };
+      auto poly_gemm_base_lam3 = [=](Index_type i, Index_type j, Index_type k,
+                                     Real_type& dot) {
+                                   POLYBENCH_GEMM_BODY3;
+                                  };
+      auto poly_gemm_base_lam4 = [=](Index_type i, Index_type j,
+                                     Real_type& dot) {
+                                   POLYBENCH_GEMM_BODY4;
+                                  };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = 0; i < ni; ++i ) {
+          for (Index_type j = 0; j < nj; ++j ) {
+            POLYBENCH_GEMM_BODY1;
+            poly_gemm_base_lam2(i, j);
+            for (Index_type k = 0; k < nk; ++k ) {
+              poly_gemm_base_lam3(i, j, k, dot);
+            }
+            poly_gemm_base_lam4(i, j, dot);
+          }
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      POLYBENCH_GEMM_VIEWS_RAJA;
+
+      auto poly_gemm_lam1 = [=](Real_type& dot) {
+                                POLYBENCH_GEMM_BODY1_RAJA;
+                               };
+      auto poly_gemm_lam2 = [=](Index_type i, Index_type j) {
+                                POLYBENCH_GEMM_BODY2_RAJA;
+                               };
+      auto poly_gemm_lam3 = [=](Index_type i, Index_type j, Index_type k,
+                                Real_type& dot) {
+                                POLYBENCH_GEMM_BODY3_RAJA;
+                               };
+      auto poly_gemm_lam4 = [=](Index_type i, Index_type j,
+                                Real_type& dot) {
+                                POLYBENCH_GEMM_BODY4_RAJA;
+                               };
+
+      using EXEC_POL =
+        RAJA::KernelPolicy<
+          RAJA::statement::For<0, RAJA::loop_exec,
+            RAJA::statement::For<1, RAJA::loop_exec,
+              RAJA::statement::Lambda<0, RAJA::Params<0>>,
+              RAJA::statement::Lambda<1, RAJA::Segs<0,1>>,
+              RAJA::statement::For<2, RAJA::loop_exec,
+                RAJA::statement::Lambda<2, RAJA::Segs<0,1,2>, RAJA::Params<0>>
+              >,
+              RAJA::statement::Lambda<3, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >
+          >
+        >;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::kernel_param<EXEC_POL>(
+
+          RAJA::make_tuple( RAJA::RangeSegment{0, ni},
+                            RAJA::RangeSegment{0, nj},
+                            RAJA::RangeSegment{0, nk} ),
+          RAJA::tuple<Real_type>{0.0},  // variable for dot
+
+          poly_gemm_lam1,
+          poly_gemm_lam2,
+          poly_gemm_lam3,
+          poly_gemm_lam4
+
+        );
+
+      }
+      stopTimer();
+
+      break;
+    }
+#endif // RUN_RAJA_SEQ
+
+    default : {
+      std::cout << "\n POLYBENCH_GEMM : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+}
+
+} // end namespace polybench
+} // end namespace rajaperf
diff --git a/src/polybench-kokkos/POLYBENCH_GEMVER-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_GEMVER-Kokkos.cpp
new file mode 100644
index 000000000..f32d7651b
--- /dev/null
+++ b/src/polybench-kokkos/POLYBENCH_GEMVER-Kokkos.cpp
@@ -0,0 +1,232 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// and
RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_GEMVER.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+#include <cstring>
+
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+
+void POLYBENCH_GEMVER::runKokkosVariant(VariantID vid)
+{
+
+  // Kokkos stub: no Kokkos port yet -- return before any timing or kernel work.
+  return; // NOTE(review): the Seq reference variants below are unreachable.
+
+
+  const Index_type run_reps = getRunReps();
+
+  POLYBENCH_GEMVER_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = 0; i < n; i++ ) {
+          for (Index_type j = 0; j < n; j++) {
+            POLYBENCH_GEMVER_BODY1;
+          }
+        }
+
+        for (Index_type i = 0; i < n; i++ ) {
+          POLYBENCH_GEMVER_BODY2;
+          for (Index_type j = 0; j < n; j++) {
+            POLYBENCH_GEMVER_BODY3;
+          }
+          POLYBENCH_GEMVER_BODY4;
+        }
+
+        for (Index_type i = 0; i < n; i++ ) {
+          POLYBENCH_GEMVER_BODY5;
+        }
+
+        for (Index_type i = 0; i < n; i++ ) {
+          POLYBENCH_GEMVER_BODY6;
+          for (Index_type j = 0; j < n; j++) {
+            POLYBENCH_GEMVER_BODY7;
+          }
+          POLYBENCH_GEMVER_BODY8;
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto poly_gemver_base_lam1 = [=](Index_type i, Index_type j) {
+                                     POLYBENCH_GEMVER_BODY1;
+                                   };
+      auto poly_gemver_base_lam3 = [=](Index_type i, Index_type j,
+                                       Real_type &dot) {
+                                     POLYBENCH_GEMVER_BODY3;
+                                   };
+      auto poly_gemver_base_lam4 = [=](Index_type i, Real_type &dot) {
+                                     POLYBENCH_GEMVER_BODY4;
+                                   };
+      auto poly_gemver_base_lam5 = [=](Index_type i) {
+                                     POLYBENCH_GEMVER_BODY5;
+                                   };
+      auto poly_gemver_base_lam7 = [=](Index_type i, Index_type j,
+                                       Real_type &dot) {
+                                     POLYBENCH_GEMVER_BODY7;
+                                   };
+      auto poly_gemver_base_lam8 = [=](Index_type i, Real_type &dot) {
+                                     POLYBENCH_GEMVER_BODY8;
+                                   };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = 0; i < n; i++ ) {
+          for (Index_type j = 0; j < n; j++) {
+            poly_gemver_base_lam1(i, j);
+          }
+        }
+
+        for (Index_type i = 0; i < n; i++ ) {
+          POLYBENCH_GEMVER_BODY2;
+          for (Index_type j = 0; j < n; j++) {
+            poly_gemver_base_lam3(i, j, dot);
+          }
+          poly_gemver_base_lam4(i, dot);
+        }
+
+        for (Index_type i = 0; i < n; i++ ) {
+          poly_gemver_base_lam5(i);
+        }
+
+        for (Index_type i = 0; i < n; i++ ) {
+          POLYBENCH_GEMVER_BODY6;
+          for (Index_type j = 0; j < n; j++) {
+            poly_gemver_base_lam7(i, j, dot);
+          }
+          poly_gemver_base_lam8(i, dot);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      POLYBENCH_GEMVER_VIEWS_RAJA;
+
+      auto poly_gemver_lam1 = [=] (Index_type i, Index_type j) {
+                                   POLYBENCH_GEMVER_BODY1_RAJA;
+                                  };
+      auto poly_gemver_lam2 = [=] (Index_type /* i */, Real_type &dot) {
+                                   POLYBENCH_GEMVER_BODY2_RAJA;
+                                  };
+      auto poly_gemver_lam3 = [=] (Index_type i, Index_type j, Real_type &dot) {
+                                   POLYBENCH_GEMVER_BODY3_RAJA;
+                                  };
+      auto poly_gemver_lam4 = [=] (Index_type i, Real_type &dot) {
+                                   POLYBENCH_GEMVER_BODY4_RAJA;
+                                  };
+      auto poly_gemver_lam5 = [=] (Index_type i) {
+                                   POLYBENCH_GEMVER_BODY5_RAJA;
+                                  };
+      auto poly_gemver_lam6 = [=] (Index_type i, Real_type &dot) {
+                                   POLYBENCH_GEMVER_BODY6_RAJA;
+                                  };
+      auto poly_gemver_lam7 = [=] (Index_type i, Index_type j, Real_type &dot) {
+                                   POLYBENCH_GEMVER_BODY7_RAJA;
+                                  };
+      auto poly_gemver_lam8 = [=] (Index_type i, Real_type &dot) {
+                                   POLYBENCH_GEMVER_BODY8_RAJA;
+                                  };
+
+      using EXEC_POL1 =
+        RAJA::KernelPolicy<
+          RAJA::statement::For<0, RAJA::loop_exec,
+            RAJA::statement::For<1, RAJA::loop_exec,
+              RAJA::statement::Lambda<0, RAJA::Segs<0,1>>
+            >
+          >
+        >;
+
+      using EXEC_POL24 =
+        RAJA::KernelPolicy<
+          RAJA::statement::For<0, RAJA::loop_exec,
+            RAJA::statement::Lambda<0, RAJA::Segs<0>, RAJA::Params<0>>,
+            RAJA::statement::For<1, RAJA::loop_exec,
+              RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>>
+            >,
+            RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>>
+          >
+        >;
+
+      using EXEC_POL3 = RAJA::loop_exec;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::kernel<EXEC_POL1>( RAJA::make_tuple(RAJA::RangeSegment{0, n},
+                                                  RAJA::RangeSegment{0, n}),
+          poly_gemver_lam1
+        );
+
+        RAJA::kernel_param<EXEC_POL24>(
+          RAJA::make_tuple(RAJA::RangeSegment{0, n},
+                           RAJA::RangeSegment{0, n}),
+          RAJA::tuple<Real_type>{0.0},
+
+          poly_gemver_lam2,
+          poly_gemver_lam3,
+          poly_gemver_lam4
+        );
+
+        RAJA::forall<EXEC_POL3> (RAJA::RangeSegment{0, n},
+          poly_gemver_lam5
+        );
+
+        RAJA::kernel_param<EXEC_POL24>(
+          RAJA::make_tuple(RAJA::RangeSegment{0, n},
+                           RAJA::RangeSegment{0, n}),
+          RAJA::tuple<Real_type>{0.0},
+
+          poly_gemver_lam6,
+          poly_gemver_lam7,
+          poly_gemver_lam8
+
+        );
+
+      }
+      stopTimer();
+
+      break;
+    }
+#endif // RUN_RAJA_SEQ
+
+    default : {
+      std::cout << "\n POLYBENCH_GEMVER : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+}
+
+} // end namespace polybench
+} // end namespace rajaperf
diff --git a/src/polybench-kokkos/POLYBENCH_GESUMMV-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_GESUMMV-Kokkos.cpp
new file mode 100644
index 000000000..77d9e0ce0
--- /dev/null
+++ b/src/polybench-kokkos/POLYBENCH_GESUMMV-Kokkos.cpp
@@ -0,0 +1,138 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_GESUMMV.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+void POLYBENCH_GESUMMV::runKokkosVariant(VariantID vid)
+{
+  // Kokkos stub: no Kokkos port yet -- return before any timing or kernel work.
+  return; // NOTE(review): the Seq reference variants below are unreachable.
+
+  const Index_type run_reps= getRunReps();
+
+  POLYBENCH_GESUMMV_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = 0; i < N; ++i ) {
+          POLYBENCH_GESUMMV_BODY1;
+          for (Index_type j = 0; j < N; ++j ) {
+            POLYBENCH_GESUMMV_BODY2;
+          }
+          POLYBENCH_GESUMMV_BODY3;
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto poly_gesummv_base_lam2 = [=](Index_type i, Index_type j,
+                                        Real_type& tmpdot, Real_type& ydot) {
+                                      POLYBENCH_GESUMMV_BODY2;
+                                     };
+      auto poly_gesummv_base_lam3 = [=](Index_type i,
+                                        Real_type& tmpdot, Real_type& ydot) {
+                                      POLYBENCH_GESUMMV_BODY3;
+                                     };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type i = 0; i < N; ++i ) {
+          POLYBENCH_GESUMMV_BODY1;
+          for (Index_type j = 0; j < N; ++j ) {
+            poly_gesummv_base_lam2(i, j, tmpdot, ydot);
+          }
+          poly_gesummv_base_lam3(i, tmpdot, ydot);
+        }
+
+      }
+      stopTimer();
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      POLYBENCH_GESUMMV_VIEWS_RAJA;
+
+      auto poly_gesummv_lam1 = [=](Real_type& tmpdot, Real_type& ydot) {
+                                   POLYBENCH_GESUMMV_BODY1_RAJA;
+                                  };
+      auto poly_gesummv_lam2 = [=](Index_type i, Index_type j,
+                                   Real_type& tmpdot, Real_type& ydot) {
+                                   POLYBENCH_GESUMMV_BODY2_RAJA;
+                                  };
+      auto poly_gesummv_lam3 = [=](Index_type i,
+                                   Real_type& tmpdot, Real_type& ydot) {
+                                   POLYBENCH_GESUMMV_BODY3_RAJA;
+                                  };
+
+      using EXEC_POL =
+        RAJA::KernelPolicy<
+          RAJA::statement::For<0, RAJA::loop_exec,            // i
+            RAJA::statement::Lambda<0, RAJA::Params<0,1>>,
+            RAJA::statement::For<1, RAJA::loop_exec,          // j
+              RAJA::statement::Lambda<1, RAJA::Segs<0, 1>, RAJA::Params<0,1>>
+            >,
+            RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0,1>>
+          >
+        >;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        RAJA::kernel_param<EXEC_POL>(
+          RAJA::make_tuple( RAJA::RangeSegment{0, N},
+                            RAJA::RangeSegment{0, N} ),
+          RAJA::make_tuple(static_cast<Real_type>(0.0),
+                           static_cast<Real_type>(0.0)),
+
+          poly_gesummv_lam1,
+          poly_gesummv_lam2,
+          poly_gesummv_lam3
+        );
+
+      }
+      stopTimer();
+
+      break;
+    }
+#endif // RUN_RAJA_SEQ
+
+    default : {
+      std::cout << "\n POLYBENCH_GESUMMV : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+}
+
+} // end namespace polybench
+} // end namespace rajaperf
diff --git a/src/polybench-kokkos/POLYBENCH_HEAT_3D-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_HEAT_3D-Kokkos.cpp
new file mode 100644
index 000000000..df91ae76e
--- /dev/null
+++ b/src/polybench-kokkos/POLYBENCH_HEAT_3D-Kokkos.cpp
@@ -0,0 +1,170 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "POLYBENCH_HEAT_3D.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+#include <iostream>
+
+
+namespace rajaperf
+{
+namespace polybench
+{
+
+
+void POLYBENCH_HEAT_3D::runKokkosVariant(VariantID vid)
+{
+
+  // Kokkos stub: no Kokkos port yet -- return before any timing or kernel work.
+  return; // NOTE(review): the Seq reference variants below are unreachable.
+
+  const Index_type run_reps= getRunReps();
+
+  POLYBENCH_HEAT_3D_DATA_SETUP;
+
+  switch ( vid ) {
+
+    case Base_Seq : {
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type t = 0; t < tsteps; ++t) {
+
+          for (Index_type i = 1; i < N-1; ++i ) {
+            for (Index_type j = 1; j < N-1; ++j ) {
+              for (Index_type k = 1; k < N-1; ++k ) {
+                POLYBENCH_HEAT_3D_BODY1;
+              }
+            }
+          }
+
+          for (Index_type i = 1; i < N-1; ++i ) {
+            for (Index_type j = 1; j < N-1; ++j ) {
+              for (Index_type k = 1; k < N-1; ++k ) {
+                POLYBENCH_HEAT_3D_BODY2;
+              }
+            }
+          }
+
+        }
+
+      }
+      stopTimer();
+
+      POLYBENCH_HEAT_3D_DATA_RESET;
+
+      break;
+    }
+
+#if defined(RUN_RAJA_SEQ)
+    case Lambda_Seq : {
+
+      auto poly_heat3d_base_lam1 = [=](Index_type i, Index_type j,
+                                       Index_type k) {
+                                     POLYBENCH_HEAT_3D_BODY1;
+                                   };
+      auto poly_heat3d_base_lam2 = [=](Index_type i, Index_type j,
+                                       Index_type k) {
+                                     POLYBENCH_HEAT_3D_BODY2;
+                                   };
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type t = 0; t < tsteps; ++t) {
+
+          for (Index_type i = 1; i < N-1; ++i ) {
+            for (Index_type j = 1; j < N-1; ++j ) {
+              for (Index_type k = 1; k < N-1; ++k ) {
+                poly_heat3d_base_lam1(i, j, k);
+              }
+            }
+          }
+
+          for (Index_type i = 1; i < N-1; ++i ) {
+            for (Index_type j = 1; j < N-1; ++j ) {
+              for (Index_type k = 1; k < N-1; ++k ) {
+                poly_heat3d_base_lam2(i, j, k);
+              }
+            }
+          }
+
+        }
+
+      }
+      stopTimer();
+
+      POLYBENCH_HEAT_3D_DATA_RESET;
+
+      break;
+    }
+
+    case RAJA_Seq : {
+
+      POLYBENCH_HEAT_3D_VIEWS_RAJA;
+
+      using EXEC_POL =
+        RAJA::KernelPolicy<
+          RAJA::statement::For<0, RAJA::loop_exec,
+            RAJA::statement::For<1, RAJA::loop_exec,
+              RAJA::statement::For<2, RAJA::loop_exec,
+                RAJA::statement::Lambda<0>
+              >
+            >
+          >
+        >;
+
+      startTimer();
+      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
+
+        for (Index_type t = 0; t < tsteps; ++t) {
+
+          RAJA::kernel<EXEC_POL>( RAJA::make_tuple(RAJA::RangeSegment{1, N-1},
+                                                   RAJA::RangeSegment{1, N-1},
+                                                   RAJA::RangeSegment{1, N-1}),
+
+            [=](Index_type i, Index_type j, Index_type k) {
+              POLYBENCH_HEAT_3D_BODY1_RAJA;
+            }
+
+          );
+
+          RAJA::kernel<EXEC_POL>( RAJA::make_tuple(RAJA::RangeSegment{1, N-1},
+                                                   RAJA::RangeSegment{1, N-1},
+                                                   RAJA::RangeSegment{1, N-1}),
+
+            [=](Index_type i, Index_type j, Index_type k) {
+              POLYBENCH_HEAT_3D_BODY2_RAJA;
+            }
+
+          );
+
+        }
+
+      }
+      stopTimer();
+
+      POLYBENCH_HEAT_3D_DATA_RESET;
+
+      break;
+    }
+#endif // RUN_RAJA_SEQ
+
+    default : {
+      std::cout << "\n POLYBENCH_HEAT_3D : Unknown variant id = " << vid << std::endl;
+    }
+
+  }
+
+}
+
+} // end namespace polybench
+} // end namespace rajaperf
diff --git a/src/polybench-kokkos/POLYBENCH_JACOBI_1D-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_JACOBI_1D-Kokkos.cpp
new file mode 100644
index 000000000..258d677e6
--- /dev/null
+++ b/src/polybench-kokkos/POLYBENCH_JACOBI_1D-Kokkos.cpp
@@ -0,0 +1,126 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC
+// and RAJA Performance Suite project contributors.
+// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_JACOBI_1D.hpp" + +#include "RAJA/RAJA.hpp" + +#include + + +namespace rajaperf +{ +namespace polybench +{ + + +void POLYBENCH_JACOBI_1D::runKokkosVariant(VariantID vid) +{ + + // Kokkos stub + return; + + const Index_type run_reps= getRunReps(); + + POLYBENCH_JACOBI_1D_DATA_SETUP; + + auto poly_jacobi1d_lam1 = [=] (Index_type i) { + POLYBENCH_JACOBI_1D_BODY1; + }; + auto poly_jacobi1d_lam2 = [=] (Index_type i) { + POLYBENCH_JACOBI_1D_BODY2; + }; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + for (Index_type i = 1; i < N-1; ++i ) { + POLYBENCH_JACOBI_1D_BODY1; + } + for (Index_type i = 1; i < N-1; ++i ) { + POLYBENCH_JACOBI_1D_BODY2; + } + + } + + } + stopTimer(); + + POLYBENCH_JACOBI_1D_DATA_RESET; + + break; + } + + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + for (Index_type i = 1; i < N-1; ++i ) { + poly_jacobi1d_lam1(i); + } + for (Index_type i = 1; i < N-1; ++i ) { + poly_jacobi1d_lam2(i); + } + + } + + } + stopTimer(); + + POLYBENCH_JACOBI_1D_DATA_RESET; + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + RAJA::forall ( RAJA::RangeSegment{1, N-1}, + poly_jacobi1d_lam1 + ); + + RAJA::forall ( RAJA::RangeSegment{1, N-1}, + poly_jacobi1d_lam2 + ); + + } + + } + stopTimer(); + + POLYBENCH_JACOBI_1D_DATA_RESET; + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n POLYBENCH_JACOBI_1D : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace polybench +} // end namespace rajaperf diff --git 
a/src/polybench-kokkos/POLYBENCH_JACOBI_2D-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_JACOBI_2D-Kokkos.cpp new file mode 100644 index 000000000..9886fa2fc --- /dev/null +++ b/src/polybench-kokkos/POLYBENCH_JACOBI_2D-Kokkos.cpp @@ -0,0 +1,157 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_JACOBI_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#include + + +namespace rajaperf +{ +namespace polybench +{ + + +void POLYBENCH_JACOBI_2D::runKokkosVariant(VariantID vid) +{ + + // Kokkos stub + return; + + const Index_type run_reps= getRunReps(); + + POLYBENCH_JACOBI_2D_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { + POLYBENCH_JACOBI_2D_BODY1; + } + } + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { + POLYBENCH_JACOBI_2D_BODY2; + } + } + + } + + } + stopTimer(); + + POLYBENCH_JACOBI_2D_DATA_RESET; + + break; + } + + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto poly_jacobi2d_base_lam1 = [=](Index_type i, Index_type j) { + POLYBENCH_JACOBI_2D_BODY1; + }; + auto poly_jacobi2d_base_lam2 = [=](Index_type i, Index_type j) { + POLYBENCH_JACOBI_2D_BODY2; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < N-1; ++j ) { + poly_jacobi2d_base_lam1(i, j); + } + } + + for (Index_type i = 1; i < N-1; ++i ) { + for (Index_type j = 1; j < 
N-1; ++j ) { + poly_jacobi2d_base_lam2(i, j); + } + } + + } + + } + stopTimer(); + + POLYBENCH_JACOBI_2D_DATA_RESET; + + break; + } + + case RAJA_Seq : { + + POLYBENCH_JACOBI_2D_VIEWS_RAJA; + + auto poly_jacobi2d_lam1 = [=](Index_type i, Index_type j) { + POLYBENCH_JACOBI_2D_BODY1_RAJA; + }; + auto poly_jacobi2d_lam2 = [=](Index_type i, Index_type j) { + POLYBENCH_JACOBI_2D_BODY2_RAJA; + }; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::Lambda<0> + > + >, + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::Lambda<1> + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type t = 0; t < tsteps; ++t) { + + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment{1, N-1}, + RAJA::RangeSegment{1, N-1}), + + poly_jacobi2d_lam1, + poly_jacobi2d_lam2 + ); + + } + + } + stopTimer(); + + POLYBENCH_JACOBI_2D_DATA_RESET; + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n POLYBENCH_JACOBI_2D : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace polybench +} // end namespace rajaperf diff --git a/src/polybench-kokkos/POLYBENCH_MVT-Kokkos.cpp b/src/polybench-kokkos/POLYBENCH_MVT-Kokkos.cpp new file mode 100644 index 000000000..36d30946c --- /dev/null +++ b/src/polybench-kokkos/POLYBENCH_MVT-Kokkos.cpp @@ -0,0 +1,186 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-21, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_MVT.hpp" + +#include "RAJA/RAJA.hpp" + +#include + + +namespace rajaperf +{ +namespace polybench +{ + + +void POLYBENCH_MVT::runKokkosVariant(VariantID vid) +{ + // Kokkos stub + return; + + + const Index_type run_reps= getRunReps(); + + POLYBENCH_MVT_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = 0; i < N; ++i ) { + POLYBENCH_MVT_BODY1; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_MVT_BODY2; + } + POLYBENCH_MVT_BODY3; + } + + for (Index_type i = 0; i < N; ++i ) { + POLYBENCH_MVT_BODY4; + for (Index_type j = 0; j < N; ++j ) { + POLYBENCH_MVT_BODY5; + } + POLYBENCH_MVT_BODY6; + } + + } + stopTimer(); + + break; + } + + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + auto poly_mvt_base_lam2 = [=] (Index_type i, Index_type j, + Real_type &dot) { + POLYBENCH_MVT_BODY2; + }; + auto poly_mvt_base_lam3 = [=] (Index_type i, + Real_type &dot) { + POLYBENCH_MVT_BODY3; + }; + auto poly_mvt_base_lam5 = [=] (Index_type i, Index_type j, + Real_type &dot) { + POLYBENCH_MVT_BODY5; + }; + auto poly_mvt_base_lam6 = [=] (Index_type i, + Real_type &dot) { + POLYBENCH_MVT_BODY6; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = 0; i < N; ++i ) { + POLYBENCH_MVT_BODY1; + for (Index_type j = 0; j < N; ++j ) { + poly_mvt_base_lam2(i, j, dot); + } + poly_mvt_base_lam3(i, dot); + } + + for (Index_type i = 0; i < N; ++i ) { + POLYBENCH_MVT_BODY4; + for (Index_type j = 0; j < N; ++j ) { + poly_mvt_base_lam5(i, j, dot); + } + poly_mvt_base_lam6(i, dot); + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + POLYBENCH_MVT_VIEWS_RAJA; + + auto poly_mvt_lam1 = [=] (Real_type &dot) { + POLYBENCH_MVT_BODY1_RAJA; + }; + auto poly_mvt_lam2 = [=] (Index_type i, 
Index_type j, Real_type &dot) { + POLYBENCH_MVT_BODY2_RAJA; + }; + auto poly_mvt_lam3 = [=] (Index_type i, Real_type &dot) { + POLYBENCH_MVT_BODY3_RAJA; + }; + auto poly_mvt_lam4 = [=] (Real_type &dot) { + POLYBENCH_MVT_BODY4_RAJA; + }; + auto poly_mvt_lam5 = [=] (Index_type i, Index_type j, Real_type &dot) { + POLYBENCH_MVT_BODY5_RAJA; + }; + auto poly_mvt_lam6 = [=] (Index_type i, Real_type &dot) { + POLYBENCH_MVT_BODY6_RAJA; + }; + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<0, RAJA::loop_exec, // i + RAJA::statement::Lambda<0, RAJA::Params<0>>, + RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::Lambda<1, RAJA::Segs<0,1>, RAJA::Params<0>> + >, + RAJA::statement::Lambda<2, RAJA::Segs<0>, RAJA::Params<0>> + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::region( [=]() { + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + RAJA::tuple{0.0}, + + poly_mvt_lam1, + poly_mvt_lam2, + poly_mvt_lam3 + + ); + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::RangeSegment{0, N}, + RAJA::RangeSegment{0, N}), + RAJA::tuple{0.0}, + + poly_mvt_lam4, + poly_mvt_lam5, + poly_mvt_lam6 + + ); + + }); // end sequential region (for single-source code) + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + std::cout << "\n POLYBENCH_MVT : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace polybench +} // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 7e2083c50..dcf955903 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -78,6 +78,9 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined(Kokkos_Lambda); + } POLYBENCH_2MM::~POLYBENCH_2MM() diff --git a/src/polybench/POLYBENCH_2MM.hpp 
b/src/polybench/POLYBENCH_2MM.hpp index 897eb13a3..d5bbea602 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -128,6 +128,10 @@ class POLYBENCH_2MM : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + + + private: Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 2c06a72ac..bf0eaf916 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -86,6 +86,9 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_3MM::~POLYBENCH_3MM() diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 80d0a2fe5..7a7ed08d5 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -154,6 +154,8 @@ class POLYBENCH_3MM : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index c36b41050..3e0fdf815 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -63,6 +63,9 @@ POLYBENCH_ADI::POLYBENCH_ADI(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_ADI::~POLYBENCH_ADI() diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index bec422925..f33a477d0 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -195,6 +195,7 @@ class POLYBENCH_ADI : public KernelBase void runCudaVariant(VariantID vid); void runHipVariant(VariantID vid); void 
runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); private: Index_type m_n; diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index e06917239..5a54b9585 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -65,6 +65,9 @@ POLYBENCH_ATAX::POLYBENCH_ATAX(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_ATAX::~POLYBENCH_ATAX() diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index d2c5ec63e..07dd31ec4 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -116,6 +116,8 @@ class POLYBENCH_ATAX : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Index_type m_N; Real_ptr m_tmp; diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index 59e03721c..a0ac53e93 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -84,6 +84,9 @@ POLYBENCH_FDTD_2D::POLYBENCH_FDTD_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_FDTD_2D::~POLYBENCH_FDTD_2D() diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index a1ead28b2..3cb4b6c2c 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -114,6 +114,8 @@ class POLYBENCH_FDTD_2D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Index_type m_nx; Index_type m_ny; diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index b3306a992..f0a77b1ce 100644 --- 
a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -60,6 +60,9 @@ POLYBENCH_FLOYD_WARSHALL::POLYBENCH_FLOYD_WARSHALL(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_FLOYD_WARSHALL::~POLYBENCH_FLOYD_WARSHALL() diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index ec2bcab9f..5d1c5fdfe 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -77,6 +77,8 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Index_type m_N; diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index a50ac09da..aef2b30ec 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -70,6 +70,9 @@ POLYBENCH_GEMM::POLYBENCH_GEMM(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_GEMM::~POLYBENCH_GEMM() diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index dd9e4a5a7..04b01adfe 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -100,6 +100,8 @@ class POLYBENCH_GEMM : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Index_type m_ni; Index_type m_nj; diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index fce83907a..01f3a8b69 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -79,6 +79,9 @@ POLYBENCH_GEMVER::POLYBENCH_GEMVER(const RunParams& params) setVariantDefined( Base_HIP ); 
setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_GEMVER::~POLYBENCH_GEMVER() diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 919f18e5c..0dd0c04ed 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -153,6 +153,8 @@ class POLYBENCH_GEMVER : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Index_type m_n; Real_type m_alpha; diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 39cb94510..63ba0e7d9 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -59,6 +59,9 @@ POLYBENCH_GESUMMV::POLYBENCH_GESUMMV(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_GESUMMV::~POLYBENCH_GESUMMV() diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index c8cc9e191..35c024852 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -99,6 +99,8 @@ class POLYBENCH_GESUMMV : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Index_type m_N; diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index 85fd0ce38..3723fc3b5 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -70,6 +70,9 @@ POLYBENCH_HEAT_3D::POLYBENCH_HEAT_3D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_HEAT_3D::~POLYBENCH_HEAT_3D() diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index b21b56576..f93a08ffd 
100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -125,6 +125,8 @@ class POLYBENCH_HEAT_3D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index 48c064780..0dd246434 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -67,6 +67,9 @@ POLYBENCH_JACOBI_1D::POLYBENCH_JACOBI_1D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_JACOBI_1D::~POLYBENCH_JACOBI_1D() diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index 290e26ce0..6990d489b 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -71,6 +71,8 @@ class POLYBENCH_JACOBI_1D : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index 9e204bdab..b78ee2134 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -69,6 +69,9 @@ POLYBENCH_JACOBI_2D::POLYBENCH_JACOBI_2D(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( Lambda_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_JACOBI_2D::~POLYBENCH_JACOBI_2D() diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index 9a57325a1..0b0b104a3 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -91,6 +91,8 @@ class POLYBENCH_JACOBI_2D : public KernelBase void runHipVariant(VariantID 
vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Index_type m_N; Index_type m_tsteps; diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index ae2749ce5..6af5b5e45 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -62,6 +62,9 @@ POLYBENCH_MVT::POLYBENCH_MVT(const RunParams& params) setVariantDefined( Base_HIP ); setVariantDefined( RAJA_HIP ); + + setVariantDefined( Kokkos_Lambda ); + } POLYBENCH_MVT::~POLYBENCH_MVT() diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index cb72784ed..37c953c53 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -113,6 +113,8 @@ class POLYBENCH_MVT : public KernelBase void runHipVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); + void runKokkosVariant(VariantID vid); + private: Index_type m_N; Real_ptr m_x1;