Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ option(COSMA_WITH_PROFILING "Enable profiling." OFF)
option(COSMA_WITH_NCCL "Use NCCL as communication backend." OFF)
option(COSMA_WITH_RCCL "Use RCCL as communication backend." OFF)
option(COSMA_WITH_GPU_AWARE_MPI "Use gpu-aware MPI for communication." OFF)
option(COSMA_USE_UNIFIED_MEMORY "Use unified memory when GPU acceleration is ON" OFF)
option(BUILD_SHARED_LIBS "Build shared libraries." OFF)
set(COSMA_SCALAPACK "OFF" CACHE STRING "scalapack implementation. Can be MKL, CRAY_LIBSCI, NVPL, CUSTOM or OFF.")
set(COSMA_BLAS "OFF" CACHE STRING "Blas library for computations on host or GPU")
Expand All @@ -42,13 +43,15 @@ set_property(CACHE COSMA_BLAS PROPERTY STRINGS ${COSMA_BLAS_LIST})
# GPU supports since they are treated as separate components

if(COSMA_BLAS STREQUAL "OFF")
message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, OPENBLAS, CRAY_LIBSCI, NVPL, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS, CUDA or ROCM")
message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, OPENBLAS, CRAY_LIBSCI, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS, CUDA or ROCM")
endif()

if (COSMA_BLAS MATCHES "CUDA|ROCM")
set(COSMA_GPU_BACKEND ${COSMA_BLAS})
set(COSMA_BLAS_VENDOR "OFF")
else()
set(COSMA_BLAS_VENDOR ${COSMA_BLAS})
set(COSMA_GPU_BACKEND "OFF")
endif()

if ((COSMA_WITH_NCCL OR COSMA_WITH_RCCL) AND NOT COSMA_GPU_BACKEND IN_LIST COSMA_GPU_BACKENDS_LIST)
Expand Down Expand Up @@ -103,6 +106,7 @@ FetchContent_Declare(
GIT_TAG 03847e66f05ad4a1eb371b85be628e218ce46f11 # v2.2.3
FIND_PACKAGE_ARGS NAMES costa
)

# the joy of fetch_content. if we build costa and cosma together
# fetch_content will pick up the FindSCALAPACK from cosma NOT costa.
if (NOT TARGET costa::scalapack::scalapack AND NOT COSMA_SCALAPACK MATCHES "OFF")
Expand Down Expand Up @@ -141,7 +145,8 @@ if (COSMA_WITH_PROFILING)
semiprof
GIT_REPOSITORY https://github.com/bcumming/semiprof.git
GIT_TAG f132142ff2215dfa073e416fa7911d8877d62752
FIND_PACKAGE_ARGS NAMES semiprof)
FIND_PACKAGE_ARGS NAMES semiprof
)
FetchContent_MakeAvailable(semiprof)
endif ()

Expand Down Expand Up @@ -239,3 +244,4 @@ endif()
if(COSMA_WITH_BENCHMARKS AND NOT COSMA_BLAS MATCHES "OPENBLAS")
add_subdirectory(benchmarks)
endif()

2 changes: 0 additions & 2 deletions spack/repo.yaml

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
# Copyright Spack Project Developers. See COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)


from spack_repo.builtin.build_systems.cmake import CMakePackage

from spack.package import *


Expand All @@ -22,6 +23,7 @@ class Cosma(CMakePackage):
# note: The default archives produced with github do not have the archives
# of the submodules.
version("master", branch="master", submodules=False)
version("2.7.0", sha256="f4775d18379539d7bb5053bff8acb4e13d6ed31a9677f498d9099a7500488789")
version("2.6.6", sha256="1604be101e77192fbcc5551236bc87888d336e402f5409bbdd9dea900401cc37")
version("2.6.5", sha256="10d9b7ecc1ce44ec5b9e0c0bf89278a63029912ec3ea99661be8576b553ececf")
version("2.6.4", sha256="6d7bd5e3005874af9542a329c93e7ccd29ca1a5573dae27618fac2704fa2b6ab")
Expand All @@ -36,8 +38,6 @@ class Cosma(CMakePackage):
version("2.0.7", sha256="8d70bfcbda6239b6a8fbeaca138790bbe58c0c3aa576879480d2632d4936cf7e")
version("2.0.2", sha256="4f3354828bc718f3eef2f0098c3bdca3499297497a220da32db1acd57920c68d")

depends_on("cxx", type="build") # generated

# We just need the libraries of cuda and rocm, so no need to extend
# CudaPackage or ROCmPackage.
variant("cuda", default=False, description="Build with cuBLAS support")
Expand All @@ -55,6 +55,13 @@ class Cosma(CMakePackage):
with when("+rocm"):
variant("rccl", default=False, description="Use rocm rccl")

with when("@2.8.0:+rocm"):
variant("unified_memory", default=False)

depends_on("cxx", type="build")
depends_on("c", type="build")
depends_on("fortran", type="build")

depends_on("[email protected]:", type="build")
depends_on("mpi@3:")
depends_on("blas", when="~cuda ~rocm")
Expand Down Expand Up @@ -82,25 +89,28 @@ class Cosma(CMakePackage):

patch("fj-ssl2.patch", when="^fujitsu-ssl2")

def setup_build_environment(self, env):
def setup_build_environment(self, env: EnvironmentModifications) -> None:
if self.spec.satisfies("+cuda"):
env.set("CUDA_PATH", self.spec["cuda"].prefix)

def cosma_blas_cmake_arg(self):
query_to_cmake_arg = [
("+cuda", "CUDA"),
("+rocm", "ROCM"),
("^intel-mkl", "MKL"),
("^intel-oneapi-mkl", "MKL"),
("^cray-libsci", "CRAY_LIBSCI"),
("^netlib-lapack", "CUSTOM"),
("^openblas", "OPENBLAS"),
("^fujitsu-ssl2", "SSL2"),
("^[virtuals=blas] intel-oneapi-mkl", "MKL"),
("^[virtuals=blas] cray-libsci", "CRAY_LIBSCI"),
("^[virtuals=blas] netlib-lapack", "CUSTOM"),
("^[virtuals=blas] openblas", "OPENBLAS"),
("^[virtuals=blas] fujitsu-ssl2", "SSL2"),
]

if self.version >= Version("2.4.0"):
query_to_cmake_arg.extend(
[("^blis", "BLIS"), ("^amdblis", "BLIS"), ("^atlas", "ATLAS")]
[
("^[virtuals=blas] blis", "BLIS"),
("^[virtuals=blas] amdblis", "BLIS"),
("^[virtuals=blas] atlas", "ATLAS"),
]
)

for query, cmake_arg in query_to_cmake_arg:
Expand All @@ -114,7 +124,7 @@ def cosma_scalapack_cmake_arg(self):

if spec.satisfies("~scalapack"):
return "OFF"
elif spec.satisfies("^intel-mkl") or spec.satisfies("^intel-oneapi-mkl"):
elif spec.satisfies("^[virtuals=scalapack] intel-oneapi-mkl"):
return "MKL"
elif spec.satisfies("^cray-libsci"):
return "CRAY_LIBSCI"
Expand All @@ -129,6 +139,7 @@ def cmake_args(self):
self.define_from_variant("COSMA_WITH_RCCL", "rccl"),
self.define_from_variant("COSMA_WITH_GPU_AWARE_MPI", "gpu_direct"),
self.define_from_variant("COSMA_WITH_PROFILING", "profiling"),
self.define_from_variant("COSMA_USE_UNIFIED_MEMORY", "unified_memory"),
self.define("COSMA_WITH_BENCHMARKS", False),
self.define("COSMA_BLAS", self.cosma_blas_cmake_arg()),
self.define("COSMA_SCALAPACK", self.cosma_scalapack_cmake_arg()),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
# Copyright Spack Project Developers. See COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)

from spack_repo.builtin.build_systems.cmake import CMakePackage
from spack_repo.builtin.build_systems.cuda import CudaPackage
from spack_repo.builtin.build_systems.rocm import ROCmPackage

from spack.package import *


Expand All @@ -20,17 +23,18 @@ class TiledMm(CMakePackage, CudaPackage, ROCmPackage):

version("master", branch="master")

version("2.3.2", sha256="1f91ca02f6ee8e400835fa90630618baf86a7b425b4bbbb4151068f72658b858")
version("2.3.1", sha256="68914a483e62f796b790ea428210b1d5ef5943d6289e53d1aa62f56a20fbccc8")
version("2.3", sha256="504c6201f5a9be9741c55036bf8e2656ae3f4bc19996295b264ee5e303c9253c")
version("2.2", sha256="6d0b49c9588ece744166822fd44a7bc5bec3dc666b836de8bf4bf1a7bb675aac")
version("2.0", sha256="ea554aea8c53d7c8e40044e6d478c0e8137d7e8b09d7cb9650703430d92cf32e")

depends_on("cxx", type="build") # generated

variant("shared", default=True, description="Build shared libraries")
variant("examples", default=False, description="Enable examples")
variant("tests", default=False, description="Enable tests")

depends_on("cxx", type="build") # generated

depends_on("rocblas", when="+rocm")
depends_on("cxxopts", when="+tests")
depends_on("cxxopts", when="+examples")
Expand Down
2 changes: 2 additions & 0 deletions spack_repo/cosma/repo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
repo:
namespace: cosma
11 changes: 8 additions & 3 deletions src/cosma/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
set(INSTALLED_TARGETS_LIST "")
set(cosma_src_files blas.cpp
set(cosma_src_files
buffer.cpp
communicator.cpp
context.cpp
Expand All @@ -18,6 +18,10 @@ set(cosma_src_files blas.cpp
cinterface.cpp
environment_variables.cpp)

if (COSMA_GPU_BACKEND MATCHES "OFF")
LIST(APPEND cosma_src_files blas.cpp)
endif ()

if (COSMA_GPU_BACKEND MATCHES "ROCM" OR COSMA_GPU_BACKEND MATCHES "CUDA")
list(APPEND cosma_src_files "pinned_buffers.cpp")
if (COSMA_WITH_NCCL OR COSMA_WITH_RCCL)
Expand All @@ -40,7 +44,7 @@ target_link_libraries(cosma PUBLIC
costa::costa
$<TARGET_NAME_IF_EXISTS:roc::rccl>
$<TARGET_NAME_IF_EXISTS:cosma::nccl>
$<$<NOT:$<STREQUAL:${COSMA_BLAS_VENDOR},"OFF">>:cosma::BLAS::blas>
$<$<STREQUAL:${COSMA_GPU_VENDOR},"OFF">:cosma::BLAS::blas>
$<TARGET_NAME_IF_EXISTS:Tiled-MM::Tiled-MM>
$<$<STREQUAL:${COSMA_GPU_BACKEND},CUDA>:Tiled-MM::Tiled-MM>
$<$<STREQUAL:${COSMA_GPU_BACKEND},ROCM>:Tiled-MM::Tiled-MM>
Expand All @@ -55,12 +59,13 @@ target_compile_definitions(cosma PUBLIC
$<$<BOOL:${COSMA_WITH_RCCL}>:COSMA_WITH_NCCL>
$<$<STREQUAL:${COSMA_BLAS_VENDOR},MKL>:COSMA_WITH_MKL_BLAS>
$<$<STREQUAL:${COSMA_BLAS_VENDOR},BLIS>:COSMA_WITH_BLIS_BLAS>
$<$<NOT:$<IN_LIST:${COSMA_BLAS_VENDOR},"MKL;BLIS">>:COSMA_WITH_BLAS>
$<$<NOT:$<IN_LIST:${COSMA_BLAS_VENDOR},"MKL;BLISi;OFF">>:COSMA_WITH_BLAS>
$<$<STREQUAL:${COSMA_GPU_BACKEND},CUDA>:COSMA_HAVE_GPU>
$<$<STREQUAL:${COSMA_GPU_BACKEND},ROCM>:COSMA_HAVE_GPU>
PRIVATE
$<$<BOOL:${COSMA_WITH_PROFILING}>:COSMA_WITH_PROFILING>)


list(APPEND INSTALLED_TARGETS_LIST "cosma")

# if SCALAPACK is found and cosma_pxgemm library is not already created
Expand Down
50 changes: 31 additions & 19 deletions src/cosma/aligned_allocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
#include <mpi.h>

#include <cassert>
#include <cosma/environment_variables.hpp>
#include <cosma/math_utils.hpp>
#include <exception>
#include <iostream>
#include <limits>
#include <cosma/math_utils.hpp>
#include <cosma/environment_variables.hpp>

/*
* A custom allocator that:
Expand All @@ -18,7 +18,7 @@
namespace cosma {
template <typename T>
class aligned_allocator {
public:
public:
using value_type = T;
using pointer = value_type *;
using const_pointer = const value_type *;
Expand All @@ -38,10 +38,10 @@ class aligned_allocator {

// the minimum alignment for given type T
std::size_t min_alignment() {
return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void*));
return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void *));
}

// Calculate how many additional elements we have to allocate for an array
// Calculate how many additional elements we have to allocate for an array
// of length n and data type T.
static std::size_t get_alignment_padding(std::size_t n) {
auto alignment = get_alignment();
Expand All @@ -50,34 +50,35 @@ class aligned_allocator {
auto remainder = (n * sizeof(T)) % alignment;

// Convert the padding from bytes to the number of elements
remainder = remainder!=0 ? (alignment - remainder) / sizeof(T) : 0;
remainder = remainder != 0 ? (alignment - remainder) / sizeof(T) : 0;

// std::cout << "For size " << n << ", reminder = " << remainder << std::endl;
// std::cout << "sizeof(T) = " << sizeof(T) << std::endl;
// std::cout << "For size " << n << ", reminder = " << remainder <<
// std::endl; std::cout << "sizeof(T) = " << sizeof(T) << std::endl;
return remainder;
}

// allocate memory with alignment specified as a template parameter
// returns nullptr on failure
T* aligned_malloc(std::size_t size) {
T *aligned_malloc(std::size_t size) {
auto alignment = get_alignment();
// if alignment is disabled, use the standard malloc
if (alignment <= 0) {
return reinterpret_cast<T*>(malloc(size*sizeof(T)));
return reinterpret_cast<T *>(malloc(size * sizeof(T)));
}
// check if the requested size is a multiple of the alignment
assert(get_alignment_padding(size) == 0);
// check if the alignment is >= min_alignment for this data type T
assert(alignment >= min_alignment());
// check if the alignment is a power of 2 and a multiple of sizeof(void*).
// check if the alignment is a power of 2 and a multiple of
// sizeof(void*).
assert(math_utils::is_power_of_2(alignment));
// "Memory alignment must be a power of 2.");
// This is required for the posix_memalign function.
assert(alignment % sizeof(void*) == 0);
assert(alignment % sizeof(void *) == 0);
// "Memory alignment must be a multiple of sizeof(void*)");
void *ptr;
if (posix_memalign(&ptr, alignment, size*sizeof(T)) == 0) {
return reinterpret_cast<T*>(ptr);
if (posix_memalign(&ptr, alignment, size * sizeof(T)) == 0) {
return reinterpret_cast<T *>(ptr);
}
return nullptr;
}
Expand All @@ -94,25 +95,36 @@ class aligned_allocator {
pointer allocate(size_type cnt,
typename std::allocator<void>::const_pointer = 0) {
if (cnt > 0) {
pointer ptr = aligned_malloc(cnt);
pointer ptr;
if (!cosma::get_unified_memory()) {
ptr = aligned_malloc(cnt);
}
#if defined(COSMA_USE_UNIFIED_MEMORY)
else {
hipMalloc(&ptr, cnt * sizeof(T));
}
#endif
return ptr;
}
return nullptr;
}

void deallocate(pointer p, size_type cnt) {
if (p) {
std::free(p);
if (!cosma::get_unified_memory())
std::free(p);
#if defined(COSMA_USE_UNIFIED_MEMORY)
else
hipFree(p);
#endif
}
}

size_type max_size() const {
return std::numeric_limits<size_type>::max() / sizeof(T);
}

void construct(pointer p, const T &t) {
new (p) T(t);
}
void construct(pointer p, const T &t) { new (p) T(t); }

void destroy(pointer p) {
if (p) {
Expand Down
Loading