diff --git a/CMakeLists.txt b/CMakeLists.txt index 00297ae..51f1630 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ option(COSMA_WITH_PROFILING "Enable profiling." OFF) option(COSMA_WITH_NCCL "Use NCCL as communication backend." OFF) option(COSMA_WITH_RCCL "Use RCCL as communication backend." OFF) option(COSMA_WITH_GPU_AWARE_MPI "Use gpu-aware MPI for communication." OFF) +option(COSMA_USE_UNIFIED_MEMORY "Use unified memory when GPU acceleration is ON" OFF) option(BUILD_SHARED_LIBS "Build shared libraries." OFF) set(COSMA_SCALAPACK "OFF" CACHE STRING "scalapack implementation. Can be MKL, CRAY_LIBSCI, NVPL, CUSTOM or OFF.") set(COSMA_BLAS "OFF" CACHE STRING "Blas library for computations on host or GPU") @@ -42,13 +43,15 @@ set_property(CACHE COSMA_BLAS PROPERTY STRINGS ${COSMA_BLAS_LIST}) # GPU supports since they are treated as separate components if(COSMA_BLAS STREQUAL "OFF") - message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, OPENBLAS, CRAY_LIBSCI, NVPL, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS, CUDA or ROCM") + message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, OPENBLAS, CRAY_LIBSCI, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS, CUDA or ROCM") endif() if (COSMA_BLAS MATCHES "CUDA|ROCM") set(COSMA_GPU_BACKEND ${COSMA_BLAS}) + set(COSMA_BLAS_VENDOR "OFF") else() set(COSMA_BLAS_VENDOR ${COSMA_BLAS}) + set(COSMA_GPU_BACKEND "OFF") endif() if ((COSMA_WITH_NCCL OR COSMA_WITH_RCCL) AND NOT COSMA_GPU_BACKEND IN_LIST COSMA_GPU_BACKENDS_LIST) @@ -103,6 +106,7 @@ FetchContent_Declare( GIT_TAG 03847e66f05ad4a1eb371b85be628e218ce46f11 # v2.2.3 FIND_PACKAGE_ARGS NAMES costa ) + # the joy of fetch_content. if we build costa and cosma together # fetch_content will pick up the FindSCALAPACK from cosma NOT costa. if (NOT TARGET costa::scalapack::scalapack AND NOT COSMA_SCALAPACK MATCHES "OFF") @@ -141,7 +145,8 @@ if (COSMA_WITH_PROFILING) semiprof GIT_REPOSITORY https://github.com/bcumming/semiprof.git GIT_TAG f132142ff2215dfa073e416fa7911d8877d62752 - FIND_PACKAGE_ARGS NAMES semiprof) + FIND_PACKAGE_ARGS NAMES semiprof + ) FetchContent_MakeAvailable(semiprof) endif () @@ -239,3 +244,4 @@ endif() if(COSMA_WITH_BENCHMARKS AND NOT COSMA_BLAS MATCHES "OPENBLAS") add_subdirectory(benchmarks) endif() + diff --git a/spack/repo.yaml b/spack/repo.yaml deleted file mode 100644 index 0309bae..0000000 --- a/spack/repo.yaml +++ /dev/null @@ -1,2 +0,0 @@ -repo: - namespace: cosma-repo diff --git a/spack/packages/cosma/fj-ssl2.patch b/spack_repo/cosma/packages/cosma/fj-ssl2.patch similarity index 100% rename from spack/packages/cosma/fj-ssl2.patch rename to spack_repo/cosma/packages/cosma/fj-ssl2.patch diff --git a/spack/packages/cosma/package.py b/spack_repo/cosma/packages/cosma/package.py similarity index 81% rename from spack/packages/cosma/package.py rename to spack_repo/cosma/packages/cosma/package.py index 6e62f71..befc51d 100644 --- a/spack/packages/cosma/package.py +++ b/spack_repo/cosma/packages/cosma/package.py @@ -1,9 +1,10 @@ -# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other -# Spack Project Developers. See the top-level COPYRIGHT file for details. +# Copyright Spack Project Developers. See COPYRIGHT file for details. # # SPDX-License-Identifier: (Apache-2.0 OR MIT) +from spack_repo.builtin.build_systems.cmake import CMakePackage + from spack.package import * @@ -22,6 +23,7 @@ class Cosma(CMakePackage): # note: The default archives produced with github do not have the archives # of the submodules. version("master", branch="master", submodules=False) + version("2.7.0", sha256="f4775d18379539d7bb5053bff8acb4e13d6ed31a9677f498d9099a7500488789") version("2.6.6", sha256="1604be101e77192fbcc5551236bc87888d336e402f5409bbdd9dea900401cc37") version("2.6.5", sha256="10d9b7ecc1ce44ec5b9e0c0bf89278a63029912ec3ea99661be8576b553ececf") version("2.6.4", sha256="6d7bd5e3005874af9542a329c93e7ccd29ca1a5573dae27618fac2704fa2b6ab") @@ -36,8 +38,6 @@ class Cosma(CMakePackage): version("2.0.7", sha256="8d70bfcbda6239b6a8fbeaca138790bbe58c0c3aa576879480d2632d4936cf7e") version("2.0.2", sha256="4f3354828bc718f3eef2f0098c3bdca3499297497a220da32db1acd57920c68d") - depends_on("cxx", type="build") # generated - # We just need the libraries of cuda and rocm, so no need to extend # CudaPackage or ROCmPackage. variant("cuda", default=False, description="Build with cuBLAS support") @@ -55,6 +55,13 @@ class Cosma(CMakePackage): with when("+rocm"): variant("rccl", default=False, description="Use rocm rccl") + with when("@2.8.0:+rocm"): + variant("unified_memory", default=False) + + depends_on("cxx", type="build") + depends_on("c", type="build") + depends_on("fortran", type="build") + depends_on("cmake@3.22:", type="build") depends_on("mpi@3:") depends_on("blas", when="~cuda ~rocm") @@ -82,7 +89,7 @@ class Cosma(CMakePackage): patch("fj-ssl2.patch", when="^fujitsu-ssl2") - def setup_build_environment(self, env): + def setup_build_environment(self, env: EnvironmentModifications) -> None: if self.spec.satisfies("+cuda"): env.set("CUDA_PATH", self.spec["cuda"].prefix) @@ -90,17 +97,20 @@ def cosma_blas_cmake_arg(self): query_to_cmake_arg = [ ("+cuda", "CUDA"), ("+rocm", "ROCM"), - ("^intel-mkl", "MKL"), - ("^intel-oneapi-mkl", "MKL"), - ("^cray-libsci", "CRAY_LIBSCI"), - ("^netlib-lapack", "CUSTOM"), - ("^openblas", "OPENBLAS"), - ("^fujitsu-ssl2", "SSL2"), + ("^[virtuals=blas] intel-oneapi-mkl", "MKL"), + ("^[virtuals=blas] cray-libsci", "CRAY_LIBSCI"), + ("^[virtuals=blas] netlib-lapack", "CUSTOM"), + ("^[virtuals=blas] openblas", "OPENBLAS"), + ("^[virtuals=blas] fujitsu-ssl2", "SSL2"), ] if self.version >= Version("2.4.0"): query_to_cmake_arg.extend( - [("^blis", "BLIS"), ("^amdblis", "BLIS"), ("^atlas", "ATLAS")] + [ + ("^[virtuals=blas] blis", "BLIS"), + ("^[virtuals=blas] amdblis", "BLIS"), + ("^[virtuals=blas] atlas", "ATLAS"), + ] ) for query, cmake_arg in query_to_cmake_arg: @@ -114,7 +124,7 @@ def cosma_scalapack_cmake_arg(self): if spec.satisfies("~scalapack"): return "OFF" - elif spec.satisfies("^intel-mkl") or spec.satisfies("^intel-oneapi-mkl"): + elif spec.satisfies("^[virtuals=scalapack] intel-oneapi-mkl"): return "MKL" elif spec.satisfies("^cray-libsci"): return "CRAY_LIBSCI" @@ -129,6 +139,7 @@ def cmake_args(self): self.define_from_variant("COSMA_WITH_RCCL", "rccl"), self.define_from_variant("COSMA_WITH_GPU_AWARE_MPI", "gpu_direct"), self.define_from_variant("COSMA_WITH_PROFILING", "profiling"), + self.define_from_variant("COSMA_USE_UNIFIED_MEMORY", "unified_memory"), self.define("COSMA_WITH_BENCHMARKS", False), self.define("COSMA_BLAS", self.cosma_blas_cmake_arg()), self.define("COSMA_SCALAPACK", self.cosma_scalapack_cmake_arg()), diff --git a/spack/packages/tiled-mm/package.py b/spack_repo/cosma/packages/tiled-mm/package.py similarity index 84% rename from spack/packages/tiled-mm/package.py rename to spack_repo/cosma/packages/tiled-mm/package.py index 9a2ea91..bed4257 100644 --- a/spack/packages/tiled-mm/package.py +++ b/spack_repo/cosma/packages/tiled-mm/package.py @@ -1,8 +1,11 @@ -# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other -# Spack Project Developers. See the top-level COPYRIGHT file for details. +# Copyright Spack Project Developers. See COPYRIGHT file for details. # # SPDX-License-Identifier: (Apache-2.0 OR MIT) +from spack_repo.builtin.build_systems.cmake import CMakePackage +from spack_repo.builtin.build_systems.cuda import CudaPackage +from spack_repo.builtin.build_systems.rocm import ROCmPackage + from spack.package import * @@ -20,17 +23,18 @@ class TiledMm(CMakePackage, CudaPackage, ROCmPackage): version("master", branch="master") + version("2.3.2", sha256="1f91ca02f6ee8e400835fa90630618baf86a7b425b4bbbb4151068f72658b858") version("2.3.1", sha256="68914a483e62f796b790ea428210b1d5ef5943d6289e53d1aa62f56a20fbccc8") version("2.3", sha256="504c6201f5a9be9741c55036bf8e2656ae3f4bc19996295b264ee5e303c9253c") version("2.2", sha256="6d0b49c9588ece744166822fd44a7bc5bec3dc666b836de8bf4bf1a7bb675aac") version("2.0", sha256="ea554aea8c53d7c8e40044e6d478c0e8137d7e8b09d7cb9650703430d92cf32e") - depends_on("cxx", type="build") # generated - variant("shared", default=True, description="Build shared libraries") variant("examples", default=False, description="Enable examples") variant("tests", default=False, description="Enable tests") + depends_on("cxx", type="build") # generated + depends_on("rocblas", when="+rocm") depends_on("cxxopts", when="+tests") depends_on("cxxopts", when="+examples") diff --git a/spack_repo/cosma/repo.yaml b/spack_repo/cosma/repo.yaml new file mode 100644 index 0000000..3b46caa --- /dev/null +++ b/spack_repo/cosma/repo.yaml @@ -0,0 +1,2 @@ +repo: + namespace: cosma diff --git a/src/cosma/CMakeLists.txt b/src/cosma/CMakeLists.txt index 03d2bb7..10d49fe 100644 --- a/src/cosma/CMakeLists.txt +++ b/src/cosma/CMakeLists.txt @@ -1,5 +1,5 @@ set(INSTALLED_TARGETS_LIST "") -set(cosma_src_files blas.cpp +set(cosma_src_files buffer.cpp communicator.cpp context.cpp @@ -18,6 +18,10 @@ set(cosma_src_files blas.cpp cinterface.cpp environment_variables.cpp) +if (COSMA_GPU_BACKEND MATCHES "OFF") + LIST(APPEND cosma_src_files blas.cpp) +endif () + if (COSMA_GPU_BACKEND MATCHES "ROCM" OR COSMA_GPU_BACKEND MATCHES "CUDA") list(APPEND cosma_src_files "pinned_buffers.cpp") if (COSMA_WITH_NCCL OR COSMA_WITH_RCCL) @@ -40,7 +44,7 @@ target_link_libraries(cosma PUBLIC costa::costa $ $ - $<$>:cosma::BLAS::blas> + $<$:cosma::BLAS::blas> $ $<$:Tiled-MM::Tiled-MM> $<$:Tiled-MM::Tiled-MM> @@ -55,12 +59,13 @@ target_compile_definitions(cosma PUBLIC $<$:COSMA_WITH_NCCL> $<$:COSMA_WITH_MKL_BLAS> $<$:COSMA_WITH_BLIS_BLAS> - $<$>:COSMA_WITH_BLAS> + $<$>:COSMA_WITH_BLAS> $<$:COSMA_HAVE_GPU> $<$:COSMA_HAVE_GPU> PRIVATE $<$:COSMA_WITH_PROFILING>) + list(APPEND INSTALLED_TARGETS_LIST "cosma") # if SCALAPACK is found and cosma_pxgemm library is not already created diff --git a/src/cosma/aligned_allocator.hpp b/src/cosma/aligned_allocator.hpp index af49559..45fd3c0 100644 --- a/src/cosma/aligned_allocator.hpp +++ b/src/cosma/aligned_allocator.hpp @@ -3,11 +3,11 @@ #include #include +#include +#include #include #include #include -#include -#include /* * A custom allocator that: @@ -18,7 +18,7 @@ namespace cosma { template class aligned_allocator { -public: + public: using value_type = T; using pointer = value_type *; using const_pointer = const value_type *; @@ -38,10 +38,10 @@ class aligned_allocator { // the minimum alignment for given type T std::size_t min_alignment() { - return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void*)); + return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void *)); } - // Calculate how many additional elements we have to allocate for an array + // Calculate how many additional elements we have to allocate for an array // of length n and data type T. static std::size_t get_alignment_padding(std::size_t n) { auto alignment = get_alignment(); @@ -50,34 +50,35 @@ class aligned_allocator { auto remainder = (n * sizeof(T)) % alignment; // Convert the padding from bytes to the number of elements - remainder = remainder!=0 ? (alignment - remainder) / sizeof(T) : 0; + remainder = remainder != 0 ? (alignment - remainder) / sizeof(T) : 0; - // std::cout << "For size " << n << ", reminder = " << remainder << std::endl; - // std::cout << "sizeof(T) = " << sizeof(T) << std::endl; + // std::cout << "For size " << n << ", reminder = " << remainder << + // std::endl; std::cout << "sizeof(T) = " << sizeof(T) << std::endl; return remainder; } // allocate memory with alignment specified as a template parameter // returns nullptr on failure - T* aligned_malloc(std::size_t size) { + T *aligned_malloc(std::size_t size) { auto alignment = get_alignment(); // if alignment is disabled, use the standard malloc if (alignment <= 0) { - return reinterpret_cast(malloc(size*sizeof(T))); + return reinterpret_cast(malloc(size * sizeof(T))); } // check if the requested size is a multiple of the alignment assert(get_alignment_padding(size) == 0); // check if the alignment is >= min_alignment for this data type T assert(alignment >= min_alignment()); - // check if the alignment is a power of 2 and a multiple of sizeof(void*). + // check if the alignment is a power of 2 and a multiple of + // sizeof(void*). assert(math_utils::is_power_of_2(alignment)); // "Memory alignment must be a power of 2."); // This is required for the posix_memalign function. - assert(alignment % sizeof(void*) == 0); + assert(alignment % sizeof(void *) == 0); // "Memory alignment must be a multiple of sizeof(void*)"); void *ptr; - if (posix_memalign(&ptr, alignment, size*sizeof(T)) == 0) { - return reinterpret_cast(ptr); + if (posix_memalign(&ptr, alignment, size * sizeof(T)) == 0) { + return reinterpret_cast(ptr); } return nullptr; } @@ -94,7 +95,15 @@ class aligned_allocator { pointer allocate(size_type cnt, typename std::allocator::const_pointer = 0) { if (cnt > 0) { - pointer ptr = aligned_malloc(cnt); + pointer ptr; + if (!cosma::get_unified_memory()) { + ptr = aligned_malloc(cnt); + } +#if defined(COSMA_USE_UNIFIED_MEMORY) + else { + hipMalloc(&ptr, cnt * sizeof(T)); + } +#endif return ptr; } return nullptr; @@ -102,7 +111,12 @@ class aligned_allocator { void deallocate(pointer p, size_type cnt) { if (p) { - std::free(p); + if (!cosma::get_unified_memory()) + std::free(p); +#if defined(COSMA_USE_UNIFIED_MEMORY) + else + hipFree(p); +#endif } } @@ -110,9 +124,7 @@ class aligned_allocator { return std::numeric_limits::max() / sizeof(T); } - void construct(pointer p, const T &t) { - new (p) T(t); - } + void construct(pointer p, const T &t) { new (p) T(t); } void destroy(pointer p) { if (p) { diff --git a/src/cosma/context.cpp b/src/cosma/context.cpp index 3a3f027..5e6462c 100644 --- a/src/cosma/context.cpp +++ b/src/cosma/context.cpp @@ -9,7 +9,7 @@ namespace cosma { #ifdef COSMA_HAVE_GPU template -gpu::mm_handle* cosma_context::get_gpu_context() { +gpu::mm_handle *cosma_context::get_gpu_context() { return gpu_ctx_.get(); } #endif @@ -21,26 +21,29 @@ cosma_context::cosma_context() { overlap_comm_and_comp = get_overlap_comm_and_comp(); pin_host_buffers = get_memory_pinning(); #ifdef COSMA_HAVE_GPU - gpu_ctx_ = gpu::make_context(gpu_streams(), - gpu_max_tile_m(), - gpu_max_tile_n(), - gpu_max_tile_k()); + gpu_ctx_ = gpu::make_context( + gpu_streams(), gpu_max_tile_m(), gpu_max_tile_n(), gpu_max_tile_k()); #endif } template -cosma_context::cosma_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k) { - cpu_memory_limit = (long long) cpu_mem_limit; +cosma_context::cosma_context(size_t cpu_mem_limit, + int streams, + int tile_m, + int tile_n, + int tile_k) { + cpu_memory_limit = (long long)cpu_mem_limit; adapt_to_scalapack_strategy = get_adapt_strategy(); overlap_comm_and_comp = get_overlap_comm_and_comp(); pin_host_buffers = get_memory_pinning(); memory_pool_.amortization = get_memory_pool_amortization(); // do not reserve nor resize the memory pool // let this just serve as the upper bound when creating a strategy - // because otherwise, it might reserve/resize to much more than the problem requires - // memory_pool_.resize(cpu_mem_limit); + // because otherwise, it might reserve/resize to much more than the problem + // requires memory_pool_.resize(cpu_mem_limit); #ifdef COSMA_HAVE_GPU gpu_ctx_ = gpu::make_context(streams, tile_m, tile_n, tile_k); + use_unified_memory_ = cosma::get_unified_memory(); #else std::cout << "Ignoring parameters in make_context. These parameters only " "used in the CPU version." @@ -59,7 +62,7 @@ cosma_context::~cosma_context() { } template -memory_pool& cosma_context::get_memory_pool() { +memory_pool &cosma_context::get_memory_pool() { return memory_pool_; } @@ -69,14 +72,15 @@ long long cosma_context::get_cpu_memory_limit() { } template -cosma::communicator* cosma_context::get_cosma_comm() { +cosma::communicator *cosma_context::get_cosma_comm() { return prev_cosma_comm.get(); } template void cosma_context::register_state(MPI_Comm comm, const Strategy strategy) { - if (comm == MPI_COMM_NULL) return; + if (comm == MPI_COMM_NULL) + return; int same_comm = 0; @@ -90,22 +94,22 @@ void cosma_context::register_state(MPI_Comm comm, MPI_Comm prev_comm = prev_cosma_comm->full_comm(); int comm_compare; MPI_Comm_compare(prev_comm, comm, &comm_compare); - same_comm = comm_compare == MPI_CONGRUENT || - comm_compare == MPI_IDENT; + same_comm = comm_compare == MPI_CONGRUENT || comm_compare == MPI_IDENT; - bool same_strategy = strategy == prev_strategy; + bool same_strategy = strategy == prev_strategy; // if same_comm and same strategy -> reuse the communicators if (!same_comm || !same_strategy) { prev_strategy = strategy; PE(preprocessing_communicators); - prev_cosma_comm = std::make_unique(strategy, comm); + prev_cosma_comm = + std::make_unique(strategy, comm); PL(); - memory_pool_.unpin_all(); - memory_pool_.already_pinned = false; - memory_pool_.resized = false; + memory_pool_.unpin_all(); + memory_pool_.already_pinned = false; + memory_pool_.resized = false; } } @@ -113,15 +117,8 @@ void cosma_context::register_state(MPI_Comm comm, // if (prev_cosma_comm->is_idle()) return; #ifdef COSMA_HAVE_GPU - if ( - !prev_cosma_comm->is_idle() - && - !memory_pool_.resized - && - same_comm - && - strategy == prev_strategy - ) { + if (!prev_cosma_comm->is_idle() && !memory_pool_.resized && same_comm && + strategy == prev_strategy) { memory_pool_.already_pinned = true; } #endif @@ -139,8 +136,13 @@ context make_context() { } template -context make_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k) { - return std::make_unique>(cpu_mem_limit, streams, tile_m, tile_n, tile_k); +context make_context(size_t cpu_mem_limit, + int streams, + int tile_m, + int tile_n, + int tile_k) { + return std::make_unique>( + cpu_mem_limit, streams, tile_m, tile_n, tile_k); } // Meyer's singleton, thread-safe in C++11, but not in C++03. @@ -171,29 +173,29 @@ template context make_context(); template context make_context(); template context make_context(size_t cpu_mem_limit, - int streams, - int tile_m, - int tile_n, - int tile_k); + int streams, + int tile_m, + int tile_n, + int tile_k); template context make_context(size_t cpu_mem_limit, - int streams, - int tile_m, - int tile_n, - int tile_k); + int streams, + int tile_m, + int tile_n, + int tile_k); template context make_context(size_t cpu_mem_limit, - int streams, - int tile_m, - int tile_n, - int tile_k); + int streams, + int tile_m, + int tile_n, + int tile_k); template context make_context(size_t cpu_mem_limit, - int streams, - int tile_m, - int tile_n, - int tile_k); + int streams, + int tile_m, + int tile_n, + int tile_k); // template instantiation for get_context_instance template global_context get_context_instance(); template global_context get_context_instance(); template global_context get_context_instance(); template global_context get_context_instance(); -} +} // namespace cosma diff --git a/src/cosma/context.hpp b/src/cosma/context.hpp index 8f756ec..e15c760 100644 --- a/src/cosma/context.hpp +++ b/src/cosma/context.hpp @@ -1,14 +1,14 @@ #pragma once -#include -#include #include #include +#include +#include #include #ifdef COSMA_HAVE_GPU -#include #include +#include #endif namespace cosma { @@ -18,25 +18,30 @@ class communicator; template class cosma_context { -public: + public: cosma_context(); - cosma_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k); + cosma_context(size_t cpu_mem_limit, + int streams, + int tile_m, + int tile_n, + int tile_k); ~cosma_context(); - void register_state(MPI_Comm comm, - const Strategy strategy); + void register_state(MPI_Comm comm, const Strategy strategy); - memory_pool& get_memory_pool(); + memory_pool &get_memory_pool(); #ifdef COSMA_HAVE_GPU - gpu::mm_handle* get_gpu_context(); + gpu::mm_handle *get_gpu_context(); #endif - cosma::communicator* get_cosma_comm(); + cosma::communicator *get_cosma_comm(); long long get_cpu_memory_limit(); void turn_on_output(); + bool unified_memory(); + bool adapt_to_scalapack_strategy = true; bool overlap_comm_and_comp = false; @@ -47,7 +52,7 @@ class cosma_context { gpu::device_stream gpu_stream; #endif -private: + private: long long cpu_memory_limit = std::numeric_limits::max(); memory_pool memory_pool_; #ifdef COSMA_HAVE_GPU @@ -55,12 +60,13 @@ class cosma_context { // gpu::mm_handle gpu_ctx_; #endif bool output = false; + bool use_unified_memory_ = false; Strategy prev_strategy; std::unique_ptr prev_cosma_comm; }; template -using global_context = cosma_context*; +using global_context = cosma_context *; template using context = std::unique_ptr>; @@ -69,7 +75,11 @@ template context make_context(); template -context make_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k); +context make_context(size_t cpu_mem_limit, + int streams, + int tile_m, + int tile_n, + int tile_k); // Meyer's singleton, thread-safe in C++11, but not in C++03. // The thread-safety is guaranteed by the standard in C++11: diff --git a/src/cosma/environment_variables.cpp b/src/cosma/environment_variables.cpp index 2277b84..aee1fb6 100644 --- a/src/cosma/environment_variables.cpp +++ b/src/cosma/environment_variables.cpp @@ -1,29 +1,27 @@ -#include #include +#include -bool cosma::env_var_defined(const char* var_name) { - char* var = getenv (var_name); +bool cosma::env_var_defined(const char *var_name) { + char *var = getenv(var_name); return var != nullptr; } bool cosma::get_bool_env_var(std::string name, bool default_value) { - char* var; + char *var; var = getenv(name.c_str()); bool value = default_value; if (var != nullptr) { std::string s(var); - std::transform(s.begin(), s.end(), s.begin(), - [&](char c) { - return std::toupper(c); - } - ); + std::transform(s.begin(), s.end(), s.begin(), [&](char c) { + return std::toupper(c); + }); value = (s == "ON"); } return value; } int cosma::get_int_env_var(std::string name, int default_value) { - char* var; + char *var; var = getenv(name.c_str()); int value = default_value; if (var != nullptr) @@ -32,7 +30,7 @@ int cosma::get_int_env_var(std::string name, int default_value) { } float cosma::get_float_env_var(std::string name, float default_value) { - char* var; + char *var; var = getenv(name.c_str()); float value = default_value; if (var != nullptr) @@ -41,7 +39,7 @@ float cosma::get_float_env_var(std::string name, float default_value) { } double cosma::get_double_env_var(std::string name, double default_value) { - char* var; + char *var; var = getenv(name.c_str()); double value = default_value; if (var != nullptr) @@ -50,12 +48,12 @@ double cosma::get_double_env_var(std::string name, double default_value) { } std::size_t cosma::get_ull_env_var(std::string name, size_t default_value) { - char* var; + char *var; var = getenv(name.c_str()); size_t value = default_value; if (var != nullptr) value = std::stoull(std::string(var)); - return std::size_t (value); + return std::size_t(value); } int cosma::gpu_streams() { @@ -84,8 +82,7 @@ bool cosma::get_adapt_strategy() { } bool cosma::get_overlap_comm_and_comp() { - return get_bool_env_var(env_var_names::overlap, - env_var_defaults::overlap); + return get_bool_env_var(env_var_names::overlap, env_var_defaults::overlap); } bool cosma::get_memory_pinning() { @@ -93,9 +90,14 @@ bool cosma::get_memory_pinning() { env_var_defaults::memory_pinning_enabled); } +bool cosma::get_unified_memory() { + return get_bool_env_var(env_var_names::cosma_gpu_unified_memory, + env_var_defaults::unified_memory); +} + double cosma::get_memory_pool_amortization() { return get_double_env_var(env_var_names::memory_pool_amortization, - env_var_defaults::memory_pool_amortization); + env_var_defaults::memory_pool_amortization); } int cosma::get_min_local_dimension() { @@ -117,7 +119,7 @@ int cosma::get_cosma_cpu_memory_alignment() { // and converts the limit to #elements that each rank is allowed to use template long long cosma::get_cpu_max_memory() { - char* var; + char *var; var = getenv(env_var_names::cpu_max_memory.c_str()); long long value = env_var_defaults::cpu_max_memory; long long megabytes = env_var_defaults::cpu_max_memory; @@ -135,4 +137,3 @@ template long long cosma::get_cpu_max_memory(); template long long cosma::get_cpu_max_memory(); template long long cosma::get_cpu_max_memory>(); template long long cosma::get_cpu_max_memory>(); - diff --git a/src/cosma/environment_variables.hpp b/src/cosma/environment_variables.hpp index e486407..78561b7 100644 --- a/src/cosma/environment_variables.hpp +++ b/src/cosma/environment_variables.hpp @@ -1,100 +1,103 @@ #pragma once -#include -#include #include -#include #include +#include +#include namespace cosma { // names of supported environment variables namespace env_var_names { - // number of GPU streams to be used per rank - const std::string gpu_n_streams = "COSMA_GPU_STREAMS"; - // max sizes of GPU tiles (in #elements) - // MxN corresponds to matrix C and K to the shared dimension - const std::string gpu_tile_m = "COSMA_GPU_MAX_TILE_M"; - const std::string gpu_tile_n = "COSMA_GPU_MAX_TILE_N"; - const std::string gpu_tile_k = "COSMA_GPU_MAX_TILE_K"; - // if ON, COSMA will try to natively use scalapack layout - // without transformation. Only used in the pxgemm wrapper. - const std::string adapt_strategy = "COSMA_ADAPT_STRATEGY"; - // if ON, COSMA will try to overlap communication and computation - const std::string overlap = "COSMA_OVERLAP_COMM_AND_COMP"; - // specifies the maximum available CPU memory per rank in MB - const std::string cpu_max_memory = "COSMA_CPU_MAX_MEMORY"; - // if true, local host matrices will be pinned - // (only used when GPU backend enabled) - // which increases the efficiency - const std::string memory_pinning_enabled = "COSMA_GPU_MEMORY_PINNING"; - // The scaling factor used for the memory-pool allocation size.(cpu-only). - // If amortization = 1.2, then the memory allocator - // will request 1.2x the requested size (thus, 20% more than needed). - // Higher values better amortize the cost of memory buffers resizing - // which can occur when the algorithm is invoked for different matrix sizes. - // However, higher amortization values also mean that - // potentially more memory is allocated than used which can be - // a problem when the memory resource is tight. - // There is just a single memory pool in COSMA and all the required - // memory is taken from this memory pool only. - const std::string memory_pool_amortization = "COSMA_MEMORY_POOL_AMORTIZATION"; - // minimum local matrix size -- if P is too large, so that after - // splitting the local matrix size get lower than this, - // then P will be reduced so that the problem size - // never gets smaller than specified by this variable - const std::string min_local_dimension = "COSMA_MIN_LOCAL_DIMENSION"; - // if any dimension is smaller than this threshold, it will be dispatched to SCALAPACK - // since it's too "thin" for COSMA in that case - const std::string cosma_dim_threshold = "COSMA_DIM_THRESHOLD"; - // number of bytes to which all host buffers are aligned - const std::string cosma_cpu_memory_alignment = "COSMA_CPU_MEMORY_ALIGNMENT"; -}; +// number of GPU streams to be used per rank +const std::string gpu_n_streams = "COSMA_GPU_STREAMS"; +// max sizes of GPU tiles (in #elements) +// MxN corresponds to matrix C and K to the shared dimension +const std::string gpu_tile_m = "COSMA_GPU_MAX_TILE_M"; +const std::string gpu_tile_n = "COSMA_GPU_MAX_TILE_N"; +const std::string gpu_tile_k = "COSMA_GPU_MAX_TILE_K"; +// if ON, COSMA will try to natively use scalapack layout +// without transformation. Only used in the pxgemm wrapper. +const std::string adapt_strategy = "COSMA_ADAPT_STRATEGY"; +// if ON, COSMA will try to overlap communication and computation +const std::string overlap = "COSMA_OVERLAP_COMM_AND_COMP"; +// specifies the maximum available CPU memory per rank in MB +const std::string cpu_max_memory = "COSMA_CPU_MAX_MEMORY"; +// if true, local host matrices will be pinned +// (only used when GPU backend enabled) +// which increases the efficiency +const std::string memory_pinning_enabled = "COSMA_GPU_MEMORY_PINNING"; +// The scaling factor used for the memory-pool allocation size.(cpu-only). +// If amortization = 1.2, then the memory allocator +// will request 1.2x the requested size (thus, 20% more than needed). +// Higher values better amortize the cost of memory buffers resizing +// which can occur when the algorithm is invoked for different matrix sizes. +// However, higher amortization values also mean that +// potentially more memory is allocated than used which can be +// a problem when the memory resource is tight. +// There is just a single memory pool in COSMA and all the required +// memory is taken from this memory pool only. +const std::string memory_pool_amortization = "COSMA_MEMORY_POOL_AMORTIZATION"; +// minimum local matrix size -- if P is too large, so that after +// splitting the local matrix size get lower than this, +// then P will be reduced so that the problem size +// never gets smaller than specified by this variable +const std::string min_local_dimension = "COSMA_MIN_LOCAL_DIMENSION"; +// if any dimension is smaller than this threshold, it will be dispatched to +// SCALAPACK since it's too "thin" for COSMA in that case +const std::string cosma_dim_threshold = "COSMA_DIM_THRESHOLD"; +// number of bytes to which all host buffers are aligned +const std::string cosma_cpu_memory_alignment = "COSMA_CPU_MEMORY_ALIGNMENT"; +// IF ON, use unified memory +const std::string cosma_gpu_unified_memory = "COSMA_GPU_UNIFIED_MEMORY"; +}; // namespace env_var_names // default values of supported environment variables namespace env_var_defaults { - // number of GPU streams to be used per rank - const int gpu_n_streams = 2; - // max sizes of GPU tiles (in #elements) - // MxN corresponds to matrix C and K to the shared dimension - const int gpu_tile_m = 5000; - const int gpu_tile_n = 5000; - const int gpu_tile_k = 5000; - // if ON, COSMA will try to natively use scalapack layout - // without transformation. Only used in the pxgemm wrapper. - const bool adapt_strategy = true; - // if ON, COSMA will try to overlap communication and computation - const bool overlap = false; - // specifies the maximum available CPU memory per rank in MB - const long long cpu_max_memory = std::numeric_limits::max(); // inf - // if true, local host matrices will be pinned - // (only used when GPU backend enabled) - // which increases the efficiency - const bool memory_pinning_enabled = true; - // The scaling factor used for the memory-pool allocation size.(cpu-only). - // If amortization = 1.2, then the memory allocator - // will request 1.2x the requested size (thus, 20% more than needed). - // Higher values better amortize the cost of memory buffers resizing - // which can occur when the algorithm is invoked for different matrix sizes. - // However, higher amortization values also mean that - // potentially more memory is allocated than used which can be - // a problem when the memory resource is tight. - // There is just a single memory pool in COSMA and all the required - // memory is taken from this memory pool only. - const double memory_pool_amortization = 1.2; - // minimum local matrix size -- if P is too large, so that after - // splitting the local matrix size get lower than this, - // then P will be reduced so that the problem size - // never gets smaller than specified by this variable - const int min_local_dimension = 200; - // if any dimension is smaller than this threshold, it will be dispatched to SCALAPACK - // since it's too "thin" for COSMA in that case - const int cosma_dim_threshold = 0; - // cpu memory alignment (currently disabled) - const int cosma_cpu_memory_alignment = 0; // 256; -}; +// number of GPU streams to be used per rank +const int gpu_n_streams = 2; +// max sizes of GPU tiles (in #elements) +// MxN corresponds to matrix C and K to the shared dimension +const int gpu_tile_m = 5000; +const int gpu_tile_n = 5000; +const int gpu_tile_k = 5000; +// if ON, COSMA will try to natively use scalapack layout +// without transformation. Only used in the pxgemm wrapper. +const bool adapt_strategy = true; +// if ON, COSMA will try to overlap communication and computation +const bool overlap = false; +// specifies the maximum available CPU memory per rank in MB +const long long cpu_max_memory = std::numeric_limits::max(); // inf +// if true, local host matrices will be pinned +// (only used when GPU backend enabled) +// which increases the efficiency +const bool memory_pinning_enabled = true; +// The scaling factor used for the memory-pool allocation size.(cpu-only). +// If amortization = 1.2, then the memory allocator +// will request 1.2x the requested size (thus, 20% more than needed). +// Higher values better amortize the cost of memory buffers resizing +// which can occur when the algorithm is invoked for different matrix sizes. +// However, higher amortization values also mean that +// potentially more memory is allocated than used which can be +// a problem when the memory resource is tight. +// There is just a single memory pool in COSMA and all the required +// memory is taken from this memory pool only. +const double memory_pool_amortization = 1.2; +// minimum local matrix size -- if P is too large, so that after +// splitting the local matrix size get lower than this, +// then P will be reduced so that the problem size +// never gets smaller than specified by this variable +const int min_local_dimension = 200; +// if any dimension is smaller than this threshold, it will be dispatched to +// SCALAPACK since it's too "thin" for COSMA in that case +const int cosma_dim_threshold = 0; +// cpu memory alignment (currently disabled) +const int cosma_cpu_memory_alignment = 0; // 256; +// gpu unified memory mechanism +const bool unified_memory = false; +}; // namespace env_var_defaults // checks if the specified environment variable is defined -bool env_var_defined(const char* var_name); +bool env_var_defined(const char *var_name); // checks if the environment variable with given name // is set to ON or OFF. If the variable is not defined, @@ -145,13 +148,13 @@ bool get_overlap_comm_and_comp(); // will request 1.2x the requested size (thus, 20% more than needed). // Higher values better amortize the cost of memory buffers resizing // which can occur when the algorithm is invoked for different matrix sizes. -// However, higher amortization values also mean that +// However, higher amortization values also mean that // potentially more memory is allocated than used which can be // a problem when the memory resource is tight. double get_memory_pool_amortization(); // reads the environment variable corresponding to -// the memory limit in MB per rank, converts the limit +// the memory limit in MB per rank, converts the limit // to #elements that each rank is allowed to use. // returns the default value if the variable is undefined template @@ -178,4 +181,6 @@ int get_cosma_dim_threshold(); // number of bytes to which all the buffers should be aligned int get_cosma_cpu_memory_alignment(); -} +// check if we use unified memory or not +bool get_unified_memory(); +} // namespace cosma diff --git a/src/cosma/local_multiply.cpp b/src/cosma/local_multiply.cpp index d0bbed5..780e347 100644 --- a/src/cosma/local_multiply.cpp +++ b/src/cosma/local_multiply.cpp @@ -6,6 +6,10 @@ #ifdef COSMA_HAVE_GPU #include #include + +#ifdef COSMA_USE_UNIFIED_MEMORY +#include +#endif #endif #if defined(COSMA_WITH_BLAS) || defined(COSMA_WITH_MKL_BLAS) @@ -23,6 +27,143 @@ namespace cosma { using clock_t = std::chrono::high_resolution_clock; using ms_t = std::chrono::milliseconds; +#ifdef COSMA_USE_UNIFIED_MEMORY +using zfloat = std::complex; +using zdouble = std::complex; + +int get_first(char trans, int m, int n) { return trans == 'N' ? m : n; } + +int get_second(char trans, int m, int n) { return trans == 'N' ? n : m; } + +gpu::blas_api::OperationType get_blas_operation(char trans) { + gpu::blas_api::OperationType op = + trans == 'T' + ? gpu::blas_api::operation::Transpose + : (trans == 'C' ? gpu::blas_api::operation::ConjugateTranspose + : gpu::blas_api::operation::None); + return op; +} + +gpu::blas_api::StatusType cublas_gemm_wrapper(gpu::blas_api::HandleType handle, + char trans_a, + char trans_b, + int m, + int n, + int k, + const float *alpha, + const float *a, + const float *b, + const float *beta, + float *c, + int lld_c) { + gpu::blas_api::OperationType op_a = get_blas_operation(trans_a); + gpu::blas_api::OperationType op_b = get_blas_operation(trans_b); + + int ld_a = get_first(trans_a, m, k); + int ld_b = get_first(trans_b, k, n); + + return gpu::blas_api::sgemm( + handle, op_a, op_b, m, n, k, alpha, a, ld_a, b, ld_b, beta, c, lld_c); +} + +gpu::blas_api::StatusType cublas_gemm_wrapper(gpu::blas_api::HandleType handle, + char trans_a, + char trans_b, + int m, + int n, + int k, + const double *alpha, + const double *a, + const double *b, + const double *beta, + double *c, + int lld_c) { + gpu::blas_api::OperationType op_a = get_blas_operation(trans_a); + gpu::blas_api::OperationType op_b = get_blas_operation(trans_b); + + int ld_a = get_first(trans_a, m, k); + int ld_b = get_first(trans_b, k, n); + + return gpu::blas_api::dgemm( + handle, op_a, op_b, m, n, k, alpha, a, ld_a, b, ld_b, beta, c, lld_c); +} + +// Note: Converting from std::complex to cuComplex and cuDoubleComple +// works because they are binary compatible. +// +// http://icl.cs.utk.edu/magma/forum/viewtopic.php?f=2&t=902 +// +gpu::blas_api::StatusType cublas_gemm_wrapper(gpu::blas_api::HandleType handle, + char trans_a, + char trans_b, + int m, + int n, + int k, + const zfloat *alpha, + const zfloat *a, + const zfloat *b, + const zfloat *beta, + zfloat *c, + int lld_c) { + gpu::blas_api::OperationType op_a = get_blas_operation(trans_a); + gpu::blas_api::OperationType op_b = get_blas_operation(trans_b); + + int ld_a = get_first(trans_a, m, k); + int ld_b = get_first(trans_b, k, n); + + return gpu::blas_api::cgemm( + handle, + op_a, + op_b, + m, + n, + k, + reinterpret_cast(alpha), + reinterpret_cast(a), + ld_a, + reinterpret_cast(b), + ld_b, + reinterpret_cast(beta), + reinterpret_cast(c), + lld_c); +} + +gpu::blas_api::StatusType cublas_gemm_wrapper(gpu::blas_api::HandleType handle, + char trans_a, + char trans_b, + int m, + int n, + int k, + const zdouble *alpha, + const zdouble *a, + const zdouble *b, + const zdouble *beta, + zdouble *c, + int lld_c) { + gpu::blas_api::OperationType op_a = get_blas_operation(trans_a); + gpu::blas_api::OperationType op_b = get_blas_operation(trans_b); + + int ld_a = get_first(trans_a, m, k); + int ld_b = get_first(trans_b, k, n); + + return gpu::blas_api::zgemm( + handle, + op_a, + op_b, + m, + n, + k, + reinterpret_cast(alpha), + reinterpret_cast(a), + ld_a, + reinterpret_cast(b), + ld_b, + reinterpret_cast(beta), + reinterpret_cast(c), + lld_c); +} +#endif + template void print_matrix(int m, int n, Scalar *A, char label) { std::cout << "Matrix " << label << std::endl; @@ -76,7 +217,7 @@ clock_t::time_point debug_gemm_end(Scalar *matrixA, #ifdef COSMA_HAVE_GPU template -void local_multiply(gpu::mm_handle* gpu_ctx, +void local_multiply(gpu::mm_handle *gpu_ctx, Scalar *matrixA, Scalar *matrixB, Scalar *matrixC, @@ -93,47 +234,62 @@ void local_multiply(gpu::mm_handle* gpu_ctx, if (rank == 0) { // print_matrix(m, k, matrixA, 'A'); // print_matrix(k, n, matrixB, 'B'); - // std::cout << "m = " << m << ", n = " << n << ", k = " << k << std::endl; + // std::cout << "m = " << m << ", n = " << n << ", k = " << k << + std::endl; } */ int ld_a = m; int ld_b = k; int ld_c = m; - gpu::gemm(*gpu_ctx, 'N', 'N', m, n, k, alpha, matrixA, ld_a, matrixB, ld_b, beta, matrixC, ld_c, pin_host_buffers, copy_c_back); + gpu::gemm(*gpu_ctx, + 'N', + 'N', + m, + n, + k, + alpha, + matrixA, + ld_a, + matrixB, + ld_b, + beta, + matrixC, + ld_c, + pin_host_buffers, + copy_c_back); + /* if (rank == 0) { - gpu::copy_to_host(gpu_ctx->get_full_device_buffer_c().data(), matrixC, m * n); - print_matrix(m, n, matrixC, 'C'); - std::cout << "alpha = " << alpha << ", beta = " << beta << std::endl; + gpu::copy_to_host(gpu_ctx->get_full_device_buffer_c().data(), matrixC, m + * n); print_matrix(m, n, matrixC, 'C'); std::cout << "alpha = " << alpha << + ", beta = " << beta << std::endl; } */ - } #endif template -Scalar& get_element(Scalar* mat, int m, int n, int i, int j) { +Scalar &get_element(Scalar *mat, int m, int n, int i, int j) { return mat[j * m + i]; } template -void local_multiply_cpu( - Scalar *matrixA, - Scalar *matrixB, - Scalar *matrixC, - int m, - int n, - int k, - Scalar alpha, - Scalar beta) { +void local_multiply_cpu(Scalar *matrixA, + Scalar *matrixB, + Scalar *matrixC, + int m, + int n, + int k, + Scalar alpha, + Scalar beta) { for (int mi = 0; mi < m; ++mi) { for (int ni = 0; ni < n; ++ni) { - Scalar& Cvalue = get_element(matrixC, m, n, mi, ni); + Scalar &Cvalue = get_element(matrixC, m, n, mi, ni); Cvalue *= beta; for (int ki = 0; ki < k; ++ki) { - Scalar& Avalue = get_element(matrixA, m, k, mi, ki); - Scalar& Bvalue = get_element(matrixB, k, n, ki, ni); + Scalar &Avalue = get_element(matrixA, m, k, mi, ki); + Scalar &Bvalue = get_element(matrixB, k, n, ki, ni); Cvalue += alpha * Avalue * Bvalue; } } @@ -141,7 +297,7 @@ void local_multiply_cpu( } template -void local_multiply(cosma_context* ctx, +void local_multiply(cosma_context *ctx, Scalar *matrixA, Scalar *matrixB, Scalar *matrixC, @@ -157,22 +313,58 @@ void local_multiply(cosma_context* ctx, #endif #ifdef COSMA_HAVE_GPU - PE(multiply_computation_pinning); - if (ctx->pin_host_buffers) { - ctx->get_memory_pool().pin(matrixA, m * k); - ctx->get_memory_pool().pin(matrixB, k * n); - // if (copy_c_back || std::abs(beta) > 0) { - ctx->get_memory_pool().pin(matrixC, m * n); - // } +#ifdef COSMA_USE_UNIFIED_MEMORY + if (ctx.unified_memory()) { + PE(multiply_computation_gemm); + auto status = cublas_gemm_wrapper( + ctx->get_gpu_context()->get_gpu_context().get_blas_handle(0), + 'N', + 'N', + m, + n, + k, + &alpha, + matrixA, + matrixB, + &beta, + matrixC, + m); + + gpu::check_blas_status(status); + // we need explicit synchronization over the stream to trigger the copy + // back to CPU memory + hipStreamSynchronize( + ctx->get_gpu_context()->get_gpu_context().get_stream(0)); + PL(); + } else { +#endif // COSMA_USE_UNIFIED_MEMORY + PE(multiply_computation_pinning); + if (ctx->pin_host_buffers) { + ctx->get_memory_pool().pin(matrixA, m * k); + ctx->get_memory_pool().pin(matrixB, k * n); + // if (copy_c_back || std::abs(beta) > 0) { + ctx->get_memory_pool().pin(matrixC, m * n); + // } + } + PL(); + + PE(multiply_computation_gemm); + local_multiply(ctx->get_gpu_context(), + matrixA, + matrixB, + matrixC, + m, + n, + k, + alpha, + beta, + false, + copy_c_back); + PL(); +#ifdef COSMA_USE_UNIFIED_MEMORY } - PL(); +#endif - PE(multiply_computation_gemm); - local_multiply(ctx->get_gpu_context(), - matrixA, matrixB, matrixC, - m, n, k, alpha, beta, - false, copy_c_back); - PL(); #else PE(multiply_computation_gemm); gemm(m, n, k, alpha, matrixA, m, matrixB, k, beta, matrixC, m); @@ -182,8 +374,9 @@ void local_multiply(cosma_context* ctx, #ifdef DEBUG auto t_end = debug_gemm_end(matrixA, matrixB, matrixC, m, n, k, alpha, beta); - std::cout << "time(" << m << ", " << n << ", " << k - << ") = " << std::chrono::duration_cast(t_end - t_start).count() << std::endl; + std::cout << "time(" << m << ", " << n << ", " << k << ") = " + << std::chrono::duration_cast(t_end - t_start).count() + << std::endl; #endif } @@ -198,14 +391,19 @@ void local_multiply(Scalar *matrixA, Scalar beta, bool copy_c_back) { local_multiply(get_context_instance(), - matrixA, matrixB, matrixC, - m, n, k, - alpha, beta, + matrixA, + matrixB, + matrixC, + m, + n, + k, + alpha, + beta, copy_c_back); } template -void local_multiply(context& ctx, +void local_multiply(context &ctx, Scalar *matrixA, Scalar *matrixB, Scalar *matrixC, @@ -215,7 +413,16 @@ void local_multiply(context& ctx, Scalar alpha, Scalar beta, bool copy_c_back) { - local_multiply(ctx.get(), matrixA, matrixB, matrixC, m, n, k, alpha, beta, copy_c_back); + local_multiply(ctx.get(), + matrixA, + matrixB, + matrixC, + m, + n, + k, + alpha, + beta, + copy_c_back); } // explicit template instantiation using context @@ -266,47 +473,43 @@ local_multiply>(cosma_context> *ctx, bool copy_c_back); // explicit template instantiation using context - no pinning -template void local_multiply_cpu( - double *matrixA, - double *matrixB, - double *matrixC, - int m, - int n, - int k, - double alpha, - double beta); - -template void local_multiply_cpu( - float *matrixA, - float *matrixB, - float *matrixC, - int m, - int n, - int k, - float alpha, - float beta); +template void local_multiply_cpu(double *matrixA, + double *matrixB, + double *matrixC, + int m, + int n, + int k, + double alpha, + double beta); + +template void local_multiply_cpu(float *matrixA, + float *matrixB, + float *matrixC, + int m, + int n, + int k, + float alpha, + float beta); template void -local_multiply_cpu>( - std::complex *matrixA, - std::complex *matrixB, - std::complex *matrixC, - int m, - int n, - int k, - std::complex alpha, - std::complex beta); +local_multiply_cpu>(std::complex *matrixA, + std::complex *matrixB, + std::complex *matrixC, + int m, + int n, + int k, + std::complex alpha, + std::complex beta); template void -local_multiply_cpu>( - std::complex *matrixA, - std::complex *matrixB, - std::complex *matrixC, - int m, - int n, - int k, - std::complex alpha, - std::complex beta); +local_multiply_cpu>(std::complex *matrixA, + std::complex *matrixB, + std::complex *matrixC, + int m, + int n, + int k, + std::complex alpha, + std::complex beta); // explicit template instantiation using context with unique_ptr context template void local_multiply(context &ctx, @@ -387,16 +590,15 @@ local_multiply>(std::complex *matrixA, std::complex beta, bool copy_c_back); -template void -local_multiply>(std::complex *matrixA, - std::complex *matrixB, - std::complex *matrixC, - int m, - int n, - int k, - std::complex alpha, - std::complex beta, - bool copy_c_back); +template void local_multiply>(std::complex *matrixA, + std::complex *matrixB, + std::complex *matrixC, + int m, + int n, + int k, + std::complex alpha, + std::complex beta, + bool copy_c_back); #ifdef COSMA_HAVE_GPU // explicit template instantiation using gpu context diff --git a/src/cosma/local_multiply.hpp b/src/cosma/local_multiply.hpp index b4daa9c..f3b15fe 100644 --- a/src/cosma/local_multiply.hpp +++ b/src/cosma/local_multiply.hpp @@ -4,7 +4,7 @@ namespace cosma { template -void local_multiply(cosma_context* ctx, +void local_multiply(cosma_context *ctx, Scalar *a, Scalar *b, Scalar *c, @@ -16,18 +16,17 @@ void local_multiply(cosma_context* ctx, bool copy_c_back); template -void local_multiply_cpu( - Scalar *a, - Scalar *b, - Scalar *c, - int m, - int n, - int k, - Scalar alpha, - Scalar beta); +void local_multiply_cpu(Scalar *a, + Scalar *b, + Scalar *c, + int m, + int n, + int k, + Scalar alpha, + Scalar beta); template -void local_multiply(context& ctx, +void local_multiply(context &ctx, scalar *a, scalar *b, scalar *c, @@ -51,7 +50,7 @@ void local_multiply(scalar *a, #ifdef COSMA_HAVE_GPU template -void local_multiply(gpu::mm_handle* gpu_ctx, +void local_multiply(gpu::mm_handle *gpu_ctx, scalar *a, scalar *b, scalar *c,