eth-cscs · mtaillefumier · Oct 24, 2024 · Aug 18, 2023 · Aug 22, 2023 · Aug 23, 2023
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -27,6 +27,7 @@ option(COSMA_WITH_PROFILING "Enable profiling." OFF)
 option(COSMA_WITH_NCCL "Use NCCL as communication backend." OFF)
 option(COSMA_WITH_RCCL "Use RCCL as communication backend." OFF)
 option(COSMA_WITH_GPU_AWARE_MPI "Use gpu-aware MPI for communication." OFF)
+option(COSMA_USE_UNIFIED_MEMORY "Use unified memory when GPU acceleration is ON" OFF)
 option(BUILD_SHARED_LIBS "Build shared libraries." OFF)
 set(COSMA_SCALAPACK "OFF" CACHE STRING "scalapack implementation. Can be MKL, CRAY_LIBSCI, NVPL, CUSTOM or OFF.")
 set(COSMA_BLAS "OFF" CACHE STRING "Blas library for computations on host or GPU")
@@ -42,13 +43,15 @@ set_property(CACHE COSMA_BLAS PROPERTY STRINGS ${COSMA_BLAS_LIST})
 # GPU supports since they are treated as separate components
 
 if(COSMA_BLAS STREQUAL "OFF")
-    message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, OPENBLAS, CRAY_LIBSCI, NVPL, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS, CUDA or ROCM")
+    message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, OPENBLAS, CRAY_LIBSCI, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS, CUDA or ROCM")
 endif()
 
 if (COSMA_BLAS MATCHES "CUDA|ROCM")
   set(COSMA_GPU_BACKEND ${COSMA_BLAS})
+  set(COSMA_BLAS_VENDOR "OFF")
 else()
   set(COSMA_BLAS_VENDOR ${COSMA_BLAS})
+  set(COSMA_GPU_BACKEND "OFF")
 endif()
 
 if ((COSMA_WITH_NCCL OR COSMA_WITH_RCCL) AND NOT COSMA_GPU_BACKEND IN_LIST COSMA_GPU_BACKENDS_LIST)
@@ -103,6 +106,7 @@ FetchContent_Declare(
   GIT_TAG        03847e66f05ad4a1eb371b85be628e218ce46f11 # v2.2.3
   FIND_PACKAGE_ARGS NAMES costa
 )
+
 # the joy of fetch_content. if we build costa and cosma together
 # fetch_content will pick up the FindSCALAPACK from cosma NOT costa.
 if (NOT TARGET costa::scalapack::scalapack AND NOT COSMA_SCALAPACK MATCHES "OFF")
@@ -141,7 +145,8 @@ if (COSMA_WITH_PROFILING)
     semiprof
     GIT_REPOSITORY  https://github.com/bcumming/semiprof.git
     GIT_TAG         f132142ff2215dfa073e416fa7911d8877d62752
-    FIND_PACKAGE_ARGS NAMES semiprof)
+    FIND_PACKAGE_ARGS NAMES semiprof
+  )
   FetchContent_MakeAvailable(semiprof)
 endif ()
 
@@ -239,3 +244,4 @@ endif()
 if(COSMA_WITH_BENCHMARKS AND NOT COSMA_BLAS MATCHES "OPENBLAS")
   add_subdirectory(benchmarks)
 endif()
+
diff --git a/spack/repo.yaml b/spack/repo.yaml
diff --git a/spack/packages/cosma/fj-ssl2.patch → ...k_repo/cosma/packages/cosma/fj-ssl2.patch b/spack/packages/cosma/fj-ssl2.patch → ...k_repo/cosma/packages/cosma/fj-ssl2.patch
diff --git a/spack/packages/cosma/package.py → spack_repo/cosma/packages/cosma/package.py b/spack/packages/cosma/package.py → spack_repo/cosma/packages/cosma/package.py
@@ -1,9 +1,10 @@
-# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
-# Spack Project Developers. See the top-level COPYRIGHT file for details.
+# Copyright Spack Project Developers. See COPYRIGHT file for details.
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
 
+from spack_repo.builtin.build_systems.cmake import CMakePackage
+
 from spack.package import *
 
 
@@ -22,6 +23,7 @@ class Cosma(CMakePackage):
     # note: The default archives produced with github do not have the archives
     #       of the submodules.
     version("master", branch="master", submodules=False)
+    version("2.7.0", sha256="f4775d18379539d7bb5053bff8acb4e13d6ed31a9677f498d9099a7500488789")
     version("2.6.6", sha256="1604be101e77192fbcc5551236bc87888d336e402f5409bbdd9dea900401cc37")
     version("2.6.5", sha256="10d9b7ecc1ce44ec5b9e0c0bf89278a63029912ec3ea99661be8576b553ececf")
     version("2.6.4", sha256="6d7bd5e3005874af9542a329c93e7ccd29ca1a5573dae27618fac2704fa2b6ab")
@@ -36,8 +38,6 @@ class Cosma(CMakePackage):
     version("2.0.7", sha256="8d70bfcbda6239b6a8fbeaca138790bbe58c0c3aa576879480d2632d4936cf7e")
     version("2.0.2", sha256="4f3354828bc718f3eef2f0098c3bdca3499297497a220da32db1acd57920c68d")
 
-    depends_on("cxx", type="build")  # generated
-
     # We just need the libraries of cuda and rocm, so no need to extend
     # CudaPackage or ROCmPackage.
     variant("cuda", default=False, description="Build with cuBLAS support")
@@ -55,6 +55,13 @@ class Cosma(CMakePackage):
     with when("+rocm"):
         variant("rccl", default=False, description="Use rocm rccl")
 
+    with when("@2.8.0:+rocm"):
+        variant("unified_memory", default=False)
+
+    depends_on("cxx", type="build")
+    depends_on("c", type="build")
+    depends_on("fortran", type="build")
+
     depends_on("[email protected]:", type="build")
     depends_on("mpi@3:")
     depends_on("blas", when="~cuda ~rocm")
@@ -82,25 +89,28 @@ class Cosma(CMakePackage):
 
     patch("fj-ssl2.patch", when="^fujitsu-ssl2")
 
-    def setup_build_environment(self, env):
+    def setup_build_environment(self, env: EnvironmentModifications) -> None:
         if self.spec.satisfies("+cuda"):
             env.set("CUDA_PATH", self.spec["cuda"].prefix)
 
     def cosma_blas_cmake_arg(self):
         query_to_cmake_arg = [
             ("+cuda", "CUDA"),
             ("+rocm", "ROCM"),
-            ("^intel-mkl", "MKL"),
-            ("^intel-oneapi-mkl", "MKL"),
-            ("^cray-libsci", "CRAY_LIBSCI"),
-            ("^netlib-lapack", "CUSTOM"),
-            ("^openblas", "OPENBLAS"),
-            ("^fujitsu-ssl2", "SSL2"),
+            ("^[virtuals=blas] intel-oneapi-mkl", "MKL"),
+            ("^[virtuals=blas] cray-libsci", "CRAY_LIBSCI"),
+            ("^[virtuals=blas] netlib-lapack", "CUSTOM"),
+            ("^[virtuals=blas] openblas", "OPENBLAS"),
+            ("^[virtuals=blas] fujitsu-ssl2", "SSL2"),
         ]
 
         if self.version >= Version("2.4.0"):
             query_to_cmake_arg.extend(
-                [("^blis", "BLIS"), ("^amdblis", "BLIS"), ("^atlas", "ATLAS")]
+                [
+                    ("^[virtuals=blas] blis", "BLIS"),
+                    ("^[virtuals=blas] amdblis", "BLIS"),
+                    ("^[virtuals=blas] atlas", "ATLAS"),
+                ]
             )
 
         for query, cmake_arg in query_to_cmake_arg:
@@ -114,7 +124,7 @@ def cosma_scalapack_cmake_arg(self):
 
         if spec.satisfies("~scalapack"):
             return "OFF"
-        elif spec.satisfies("^intel-mkl") or spec.satisfies("^intel-oneapi-mkl"):
+        elif spec.satisfies("^[virtuals=scalapack] intel-oneapi-mkl"):
             return "MKL"
         elif spec.satisfies("^cray-libsci"):
             return "CRAY_LIBSCI"
@@ -129,6 +139,7 @@ def cmake_args(self):
             self.define_from_variant("COSMA_WITH_RCCL", "rccl"),
             self.define_from_variant("COSMA_WITH_GPU_AWARE_MPI", "gpu_direct"),
             self.define_from_variant("COSMA_WITH_PROFILING", "profiling"),
+            self.define_from_variant("COSMA_USE_UNIFIED_MEMORY", "unified_memory"),
             self.define("COSMA_WITH_BENCHMARKS", False),
             self.define("COSMA_BLAS", self.cosma_blas_cmake_arg()),
             self.define("COSMA_SCALAPACK", self.cosma_scalapack_cmake_arg()),

diff --git a/spack/packages/tiled-mm/package.py → ...k_repo/cosma/packages/tiled-mm/package.py b/spack/packages/tiled-mm/package.py → ...k_repo/cosma/packages/tiled-mm/package.py
@@ -1,8 +1,11 @@
-# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
-# Spack Project Developers. See the top-level COPYRIGHT file for details.
+# Copyright Spack Project Developers. See COPYRIGHT file for details.
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
+from spack_repo.builtin.build_systems.cmake import CMakePackage
+from spack_repo.builtin.build_systems.cuda import CudaPackage
+from spack_repo.builtin.build_systems.rocm import ROCmPackage
+
 from spack.package import *
 
 
@@ -20,17 +23,18 @@ class TiledMm(CMakePackage, CudaPackage, ROCmPackage):
 
     version("master", branch="master")
 
+    version("2.3.2", sha256="1f91ca02f6ee8e400835fa90630618baf86a7b425b4bbbb4151068f72658b858")
     version("2.3.1", sha256="68914a483e62f796b790ea428210b1d5ef5943d6289e53d1aa62f56a20fbccc8")
     version("2.3", sha256="504c6201f5a9be9741c55036bf8e2656ae3f4bc19996295b264ee5e303c9253c")
     version("2.2", sha256="6d0b49c9588ece744166822fd44a7bc5bec3dc666b836de8bf4bf1a7bb675aac")
     version("2.0", sha256="ea554aea8c53d7c8e40044e6d478c0e8137d7e8b09d7cb9650703430d92cf32e")
 
-    depends_on("cxx", type="build")  # generated
-
     variant("shared", default=True, description="Build shared libraries")
     variant("examples", default=False, description="Enable examples")
     variant("tests", default=False, description="Enable tests")
 
+    depends_on("cxx", type="build")  # generated
+
     depends_on("rocblas", when="+rocm")
     depends_on("cxxopts", when="+tests")
     depends_on("cxxopts", when="+examples")

diff --git a/spack_repo/cosma/repo.yaml b/spack_repo/cosma/repo.yaml
@@ -0,0 +1,2 @@
+repo:
+  namespace: cosma
diff --git a/src/cosma/CMakeLists.txt b/src/cosma/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(INSTALLED_TARGETS_LIST "")
-set(cosma_src_files blas.cpp
+set(cosma_src_files
   buffer.cpp
   communicator.cpp
   context.cpp
@@ -18,6 +18,10 @@ set(cosma_src_files blas.cpp
   cinterface.cpp
   environment_variables.cpp)
 
+if (COSMA_GPU_BACKEND MATCHES "OFF")
+	LIST(APPEND cosma_src_files blas.cpp)
+endif ()
+
 if (COSMA_GPU_BACKEND MATCHES "ROCM" OR COSMA_GPU_BACKEND MATCHES "CUDA")
   list(APPEND cosma_src_files "pinned_buffers.cpp")
   if (COSMA_WITH_NCCL OR COSMA_WITH_RCCL)
@@ -40,7 +44,7 @@ target_link_libraries(cosma PUBLIC
   costa::costa
   $<TARGET_NAME_IF_EXISTS:roc::rccl>
   $<TARGET_NAME_IF_EXISTS:cosma::nccl>
-  $<$<NOT:$<STREQUAL:${COSMA_BLAS_VENDOR},"OFF">>:cosma::BLAS::blas>
+  $<$<STREQUAL:${COSMA_GPU_VENDOR},"OFF">:cosma::BLAS::blas>
   $<TARGET_NAME_IF_EXISTS:Tiled-MM::Tiled-MM>
   $<$<STREQUAL:${COSMA_GPU_BACKEND},CUDA>:Tiled-MM::Tiled-MM>
   $<$<STREQUAL:${COSMA_GPU_BACKEND},ROCM>:Tiled-MM::Tiled-MM>
@@ -55,12 +59,13 @@ target_compile_definitions(cosma PUBLIC
                               $<$<BOOL:${COSMA_WITH_RCCL}>:COSMA_WITH_NCCL>
                               $<$<STREQUAL:${COSMA_BLAS_VENDOR},MKL>:COSMA_WITH_MKL_BLAS>
                               $<$<STREQUAL:${COSMA_BLAS_VENDOR},BLIS>:COSMA_WITH_BLIS_BLAS>
-                              $<$<NOT:$<IN_LIST:${COSMA_BLAS_VENDOR},"MKL;BLIS">>:COSMA_WITH_BLAS>
+                              $<$<NOT:$<IN_LIST:${COSMA_BLAS_VENDOR},"MKL;BLISi;OFF">>:COSMA_WITH_BLAS>
                               $<$<STREQUAL:${COSMA_GPU_BACKEND},CUDA>:COSMA_HAVE_GPU>
                               $<$<STREQUAL:${COSMA_GPU_BACKEND},ROCM>:COSMA_HAVE_GPU>
                               PRIVATE
                               $<$<BOOL:${COSMA_WITH_PROFILING}>:COSMA_WITH_PROFILING>)
 
+
 list(APPEND INSTALLED_TARGETS_LIST "cosma")
 
 # if SCALAPACK is found and cosma_pxgemm library is not already created

diff --git a/src/cosma/aligned_allocator.hpp b/src/cosma/aligned_allocator.hpp
@@ -3,11 +3,11 @@
 #include <mpi.h>
 
 #include <cassert>
+#include <cosma/environment_variables.hpp>
+#include <cosma/math_utils.hpp>
 #include <exception>
 #include <iostream>
 #include <limits>
-#include <cosma/math_utils.hpp>
-#include <cosma/environment_variables.hpp>
 
 /*
  * A custom allocator that:
@@ -18,7 +18,7 @@
 namespace cosma {
 template <typename T>
 class aligned_allocator {
-public:
+  public:
     using value_type = T;
     using pointer = value_type *;
     using const_pointer = const value_type *;
@@ -38,10 +38,10 @@ class aligned_allocator {
 
     // the minimum alignment for given type T
     std::size_t min_alignment() {
-        return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void*));
+        return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void *));
     }
 
-    // Calculate how many additional elements we have to allocate for an array 
+    // Calculate how many additional elements we have to allocate for an array
     // of length n and data type T.
     static std::size_t get_alignment_padding(std::size_t n) {
         auto alignment = get_alignment();
@@ -50,34 +50,35 @@ class aligned_allocator {
         auto remainder = (n * sizeof(T)) % alignment;
 
         // Convert the padding from bytes to the number of elements
-        remainder = remainder!=0 ? (alignment - remainder) / sizeof(T) : 0;
+        remainder = remainder != 0 ? (alignment - remainder) / sizeof(T) : 0;
 
-        // std::cout << "For size " << n << ", reminder = " << remainder << std::endl;
-        // std::cout << "sizeof(T) = " << sizeof(T) << std::endl;
+        // std::cout << "For size " << n << ", reminder = " << remainder <<
+        // std::endl; std::cout << "sizeof(T) = " << sizeof(T) << std::endl;
         return remainder;
     }
 
     // allocate memory with alignment specified as a template parameter
     // returns nullptr on failure
-    T* aligned_malloc(std::size_t size) {
+    T *aligned_malloc(std::size_t size) {
         auto alignment = get_alignment();
         // if alignment is disabled, use the standard malloc
         if (alignment <= 0) {
-            return reinterpret_cast<T*>(malloc(size*sizeof(T)));
+            return reinterpret_cast<T *>(malloc(size * sizeof(T)));
         }
         // check if the requested size is a multiple of the alignment
         assert(get_alignment_padding(size) == 0);
         // check if the alignment is >= min_alignment for this data type T
         assert(alignment >= min_alignment());
-        // check if the alignment is a power of 2 and a multiple of sizeof(void*).
+        // check if the alignment is a power of 2 and a multiple of
+        // sizeof(void*).
         assert(math_utils::is_power_of_2(alignment));
         // "Memory alignment must be a power of 2.");
         // This is required for the posix_memalign function.
-        assert(alignment % sizeof(void*) == 0);
+        assert(alignment % sizeof(void *) == 0);
         // "Memory alignment must be a multiple of sizeof(void*)");
         void *ptr;
-        if (posix_memalign(&ptr, alignment, size*sizeof(T)) == 0) {
-            return reinterpret_cast<T*>(ptr);
+        if (posix_memalign(&ptr, alignment, size * sizeof(T)) == 0) {
+            return reinterpret_cast<T *>(ptr);
         }
         return nullptr;
     }
@@ -94,25 +95,36 @@ class aligned_allocator {
     pointer allocate(size_type cnt,
                      typename std::allocator<void>::const_pointer = 0) {
         if (cnt > 0) {
-            pointer ptr = aligned_malloc(cnt);
+            pointer ptr;
+            if (!cosma::get_unified_memory()) {
+                ptr = aligned_malloc(cnt);
+	    }
+#if defined(COSMA_USE_UNIFIED_MEMORY)
+            else {
+                hipMalloc(&ptr, cnt * sizeof(T));
+	    }
+#endif
             return ptr;
         }
         return nullptr;
     }
 
     void deallocate(pointer p, size_type cnt) {
         if (p) {
-            std::free(p);
+            if (!cosma::get_unified_memory())
+                std::free(p);
+#if defined(COSMA_USE_UNIFIED_MEMORY)
+            else
+                hipFree(p);
+#endif
         }
     }
 
     size_type max_size() const {
         return std::numeric_limits<size_type>::max() / sizeof(T);
     }
 
-    void construct(pointer p, const T &t) {
-        new (p) T(t);
-    }
+    void construct(pointer p, const T &t) { new (p) T(t); }
 
     void destroy(pointer p) {
         if (p) {