diff --git a/CMakeLists.txt b/CMakeLists.txt
index 00297ae..51f1630 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,7 @@ option(COSMA_WITH_PROFILING "Enable profiling." OFF)
 option(COSMA_WITH_NCCL "Use NCCL as communication backend." OFF)
 option(COSMA_WITH_RCCL "Use RCCL as communication backend." OFF)
 option(COSMA_WITH_GPU_AWARE_MPI "Use gpu-aware MPI for communication." OFF)
+option(COSMA_USE_UNIFIED_MEMORY "Use unified memory when GPU acceleration is ON" OFF)
 option(BUILD_SHARED_LIBS "Build shared libraries." OFF)
 set(COSMA_SCALAPACK "OFF" CACHE STRING "scalapack implementation. Can be MKL, CRAY_LIBSCI, NVPL, CUSTOM or OFF.")
 set(COSMA_BLAS "OFF" CACHE STRING "Blas library for computations on host or GPU")
@@ -42,13 +43,15 @@ set_property(CACHE COSMA_BLAS PROPERTY STRINGS ${COSMA_BLAS_LIST})
 # GPU supports since they are treated as separate components
 
 if(COSMA_BLAS STREQUAL "OFF")
-    message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, OPENBLAS, CRAY_LIBSCI, NVPL, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS, CUDA or ROCM")
+    message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, OPENBLAS, CRAY_LIBSCI, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS, CUDA or ROCM")
 endif()
 
 if (COSMA_BLAS MATCHES "CUDA|ROCM")
   set(COSMA_GPU_BACKEND ${COSMA_BLAS})
+  set(COSMA_BLAS_VENDOR "OFF")
 else()
   set(COSMA_BLAS_VENDOR ${COSMA_BLAS})
+  set(COSMA_GPU_BACKEND "OFF")
 endif()
 
 if ((COSMA_WITH_NCCL OR COSMA_WITH_RCCL) AND NOT COSMA_GPU_BACKEND IN_LIST COSMA_GPU_BACKENDS_LIST)
@@ -103,6 +106,7 @@ FetchContent_Declare(
   GIT_TAG        03847e66f05ad4a1eb371b85be628e218ce46f11 # v2.2.3
   FIND_PACKAGE_ARGS NAMES costa
 )
+
 # the joy of fetch_content. if we build costa and cosma together
 # fetch_content will pick up the FindSCALAPACK from cosma NOT costa.
 if (NOT TARGET costa::scalapack::scalapack AND NOT COSMA_SCALAPACK MATCHES "OFF")
@@ -141,7 +145,8 @@ if (COSMA_WITH_PROFILING)
     semiprof
     GIT_REPOSITORY  https://github.com/bcumming/semiprof.git
     GIT_TAG         f132142ff2215dfa073e416fa7911d8877d62752
-    FIND_PACKAGE_ARGS NAMES semiprof)
+    FIND_PACKAGE_ARGS NAMES semiprof
+  )
   FetchContent_MakeAvailable(semiprof)
 endif ()
 
@@ -239,3 +244,4 @@ endif()
 if(COSMA_WITH_BENCHMARKS AND NOT COSMA_BLAS MATCHES "OPENBLAS")
   add_subdirectory(benchmarks)
 endif()
+
diff --git a/spack/repo.yaml b/spack/repo.yaml
deleted file mode 100644
index 0309bae..0000000
--- a/spack/repo.yaml
+++ /dev/null
@@ -1,2 +0,0 @@
-repo:
-  namespace: cosma-repo
diff --git a/spack/packages/cosma/fj-ssl2.patch b/spack_repo/cosma/packages/cosma/fj-ssl2.patch
similarity index 100%
rename from spack/packages/cosma/fj-ssl2.patch
rename to spack_repo/cosma/packages/cosma/fj-ssl2.patch
diff --git a/spack/packages/cosma/package.py b/spack_repo/cosma/packages/cosma/package.py
similarity index 81%
rename from spack/packages/cosma/package.py
rename to spack_repo/cosma/packages/cosma/package.py
index 6e62f71..befc51d 100644
--- a/spack/packages/cosma/package.py
+++ b/spack_repo/cosma/packages/cosma/package.py
@@ -1,9 +1,10 @@
-# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
-# Spack Project Developers. See the top-level COPYRIGHT file for details.
+# Copyright Spack Project Developers. See COPYRIGHT file for details.
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
 
+from spack_repo.builtin.build_systems.cmake import CMakePackage
+
 from spack.package import *
 
 
@@ -22,6 +23,7 @@ class Cosma(CMakePackage):
     # note: The default archives produced with github do not have the archives
     #       of the submodules.
     version("master", branch="master", submodules=False)
+    version("2.7.0", sha256="f4775d18379539d7bb5053bff8acb4e13d6ed31a9677f498d9099a7500488789")
     version("2.6.6", sha256="1604be101e77192fbcc5551236bc87888d336e402f5409bbdd9dea900401cc37")
     version("2.6.5", sha256="10d9b7ecc1ce44ec5b9e0c0bf89278a63029912ec3ea99661be8576b553ececf")
     version("2.6.4", sha256="6d7bd5e3005874af9542a329c93e7ccd29ca1a5573dae27618fac2704fa2b6ab")
@@ -36,8 +38,6 @@ class Cosma(CMakePackage):
     version("2.0.7", sha256="8d70bfcbda6239b6a8fbeaca138790bbe58c0c3aa576879480d2632d4936cf7e")
     version("2.0.2", sha256="4f3354828bc718f3eef2f0098c3bdca3499297497a220da32db1acd57920c68d")
 
-    depends_on("cxx", type="build")  # generated
-
     # We just need the libraries of cuda and rocm, so no need to extend
     # CudaPackage or ROCmPackage.
     variant("cuda", default=False, description="Build with cuBLAS support")
@@ -55,6 +55,13 @@ class Cosma(CMakePackage):
     with when("+rocm"):
         variant("rccl", default=False, description="Use rocm rccl")
 
+    with when("@2.8.0:+rocm"):
+        variant("unified_memory", default=False)
+
+    depends_on("cxx", type="build")
+    depends_on("c", type="build")
+    depends_on("fortran", type="build")
+
     depends_on("cmake@3.22:", type="build")
     depends_on("mpi@3:")
     depends_on("blas", when="~cuda ~rocm")
@@ -82,7 +89,7 @@ class Cosma(CMakePackage):
 
     patch("fj-ssl2.patch", when="^fujitsu-ssl2")
 
-    def setup_build_environment(self, env):
+    def setup_build_environment(self, env: EnvironmentModifications) -> None:
         if self.spec.satisfies("+cuda"):
             env.set("CUDA_PATH", self.spec["cuda"].prefix)
 
@@ -90,17 +97,20 @@ def cosma_blas_cmake_arg(self):
         query_to_cmake_arg = [
             ("+cuda", "CUDA"),
             ("+rocm", "ROCM"),
-            ("^intel-mkl", "MKL"),
-            ("^intel-oneapi-mkl", "MKL"),
-            ("^cray-libsci", "CRAY_LIBSCI"),
-            ("^netlib-lapack", "CUSTOM"),
-            ("^openblas", "OPENBLAS"),
-            ("^fujitsu-ssl2", "SSL2"),
+            ("^[virtuals=blas] intel-oneapi-mkl", "MKL"),
+            ("^[virtuals=blas] cray-libsci", "CRAY_LIBSCI"),
+            ("^[virtuals=blas] netlib-lapack", "CUSTOM"),
+            ("^[virtuals=blas] openblas", "OPENBLAS"),
+            ("^[virtuals=blas] fujitsu-ssl2", "SSL2"),
         ]
 
         if self.version >= Version("2.4.0"):
             query_to_cmake_arg.extend(
-                [("^blis", "BLIS"), ("^amdblis", "BLIS"), ("^atlas", "ATLAS")]
+                [
+                    ("^[virtuals=blas] blis", "BLIS"),
+                    ("^[virtuals=blas] amdblis", "BLIS"),
+                    ("^[virtuals=blas] atlas", "ATLAS"),
+                ]
             )
 
         for query, cmake_arg in query_to_cmake_arg:
@@ -114,7 +124,7 @@ def cosma_scalapack_cmake_arg(self):
 
         if spec.satisfies("~scalapack"):
             return "OFF"
-        elif spec.satisfies("^intel-mkl") or spec.satisfies("^intel-oneapi-mkl"):
+        elif spec.satisfies("^[virtuals=scalapack] intel-oneapi-mkl"):
             return "MKL"
         elif spec.satisfies("^cray-libsci"):
             return "CRAY_LIBSCI"
@@ -129,6 +139,7 @@ def cmake_args(self):
             self.define_from_variant("COSMA_WITH_RCCL", "rccl"),
             self.define_from_variant("COSMA_WITH_GPU_AWARE_MPI", "gpu_direct"),
             self.define_from_variant("COSMA_WITH_PROFILING", "profiling"),
+            self.define_from_variant("COSMA_USE_UNIFIED_MEMORY", "unified_memory"),
             self.define("COSMA_WITH_BENCHMARKS", False),
             self.define("COSMA_BLAS", self.cosma_blas_cmake_arg()),
             self.define("COSMA_SCALAPACK", self.cosma_scalapack_cmake_arg()),
diff --git a/spack/packages/tiled-mm/package.py b/spack_repo/cosma/packages/tiled-mm/package.py
similarity index 84%
rename from spack/packages/tiled-mm/package.py
rename to spack_repo/cosma/packages/tiled-mm/package.py
index 9a2ea91..bed4257 100644
--- a/spack/packages/tiled-mm/package.py
+++ b/spack_repo/cosma/packages/tiled-mm/package.py
@@ -1,8 +1,11 @@
-# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
-# Spack Project Developers. See the top-level COPYRIGHT file for details.
+# Copyright Spack Project Developers. See COPYRIGHT file for details.
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
+from spack_repo.builtin.build_systems.cmake import CMakePackage
+from spack_repo.builtin.build_systems.cuda import CudaPackage
+from spack_repo.builtin.build_systems.rocm import ROCmPackage
+
 from spack.package import *
 
 
@@ -20,17 +23,18 @@ class TiledMm(CMakePackage, CudaPackage, ROCmPackage):
 
     version("master", branch="master")
 
+    version("2.3.2", sha256="1f91ca02f6ee8e400835fa90630618baf86a7b425b4bbbb4151068f72658b858")
     version("2.3.1", sha256="68914a483e62f796b790ea428210b1d5ef5943d6289e53d1aa62f56a20fbccc8")
     version("2.3", sha256="504c6201f5a9be9741c55036bf8e2656ae3f4bc19996295b264ee5e303c9253c")
     version("2.2", sha256="6d0b49c9588ece744166822fd44a7bc5bec3dc666b836de8bf4bf1a7bb675aac")
     version("2.0", sha256="ea554aea8c53d7c8e40044e6d478c0e8137d7e8b09d7cb9650703430d92cf32e")
 
-    depends_on("cxx", type="build")  # generated
-
     variant("shared", default=True, description="Build shared libraries")
     variant("examples", default=False, description="Enable examples")
     variant("tests", default=False, description="Enable tests")
 
+    depends_on("cxx", type="build")  # generated
+
     depends_on("rocblas", when="+rocm")
     depends_on("cxxopts", when="+tests")
     depends_on("cxxopts", when="+examples")
diff --git a/spack_repo/cosma/repo.yaml b/spack_repo/cosma/repo.yaml
new file mode 100644
index 0000000..3b46caa
--- /dev/null
+++ b/spack_repo/cosma/repo.yaml
@@ -0,0 +1,2 @@
+repo:
+  namespace: cosma
diff --git a/src/cosma/CMakeLists.txt b/src/cosma/CMakeLists.txt
index 03d2bb7..10d49fe 100644
--- a/src/cosma/CMakeLists.txt
+++ b/src/cosma/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(INSTALLED_TARGETS_LIST "")
-set(cosma_src_files blas.cpp
+set(cosma_src_files
   buffer.cpp
   communicator.cpp
   context.cpp
@@ -18,6 +18,10 @@ set(cosma_src_files blas.cpp
   cinterface.cpp
   environment_variables.cpp)
 
+if (COSMA_GPU_BACKEND MATCHES "OFF")
+	LIST(APPEND cosma_src_files blas.cpp)
+endif ()
+
 if (COSMA_GPU_BACKEND MATCHES "ROCM" OR COSMA_GPU_BACKEND MATCHES "CUDA")
   list(APPEND cosma_src_files "pinned_buffers.cpp")
   if (COSMA_WITH_NCCL OR COSMA_WITH_RCCL)
@@ -40,7 +44,7 @@ target_link_libraries(cosma PUBLIC
   costa::costa
   $<TARGET_NAME_IF_EXISTS:roc::rccl>
   $<TARGET_NAME_IF_EXISTS:cosma::nccl>
-  $<$<NOT:$<STREQUAL:${COSMA_BLAS_VENDOR},"OFF">>:cosma::BLAS::blas>
+  $<$<STREQUAL:${COSMA_GPU_VENDOR},"OFF">:cosma::BLAS::blas>
   $<TARGET_NAME_IF_EXISTS:Tiled-MM::Tiled-MM>
   $<$<STREQUAL:${COSMA_GPU_BACKEND},CUDA>:Tiled-MM::Tiled-MM>
   $<$<STREQUAL:${COSMA_GPU_BACKEND},ROCM>:Tiled-MM::Tiled-MM>
@@ -55,12 +59,13 @@ target_compile_definitions(cosma PUBLIC
                               $<$<BOOL:${COSMA_WITH_RCCL}>:COSMA_WITH_NCCL>
                               $<$<STREQUAL:${COSMA_BLAS_VENDOR},MKL>:COSMA_WITH_MKL_BLAS>
                               $<$<STREQUAL:${COSMA_BLAS_VENDOR},BLIS>:COSMA_WITH_BLIS_BLAS>
-                              $<$<NOT:$<IN_LIST:${COSMA_BLAS_VENDOR},"MKL;BLIS">>:COSMA_WITH_BLAS>
+                              $<$<NOT:$<IN_LIST:${COSMA_BLAS_VENDOR},"MKL;BLISi;OFF">>:COSMA_WITH_BLAS>
                               $<$<STREQUAL:${COSMA_GPU_BACKEND},CUDA>:COSMA_HAVE_GPU>
                               $<$<STREQUAL:${COSMA_GPU_BACKEND},ROCM>:COSMA_HAVE_GPU>
                               PRIVATE
                               $<$<BOOL:${COSMA_WITH_PROFILING}>:COSMA_WITH_PROFILING>)
 
+
 list(APPEND INSTALLED_TARGETS_LIST "cosma")
 
 # if SCALAPACK is found and cosma_pxgemm library is not already created
diff --git a/src/cosma/aligned_allocator.hpp b/src/cosma/aligned_allocator.hpp
index af49559..45fd3c0 100644
--- a/src/cosma/aligned_allocator.hpp
+++ b/src/cosma/aligned_allocator.hpp
@@ -3,11 +3,11 @@
 #include <mpi.h>
 
 #include <cassert>
+#include <cosma/environment_variables.hpp>
+#include <cosma/math_utils.hpp>
 #include <exception>
 #include <iostream>
 #include <limits>
-#include <cosma/math_utils.hpp>
-#include <cosma/environment_variables.hpp>
 
 /*
  * A custom allocator that:
@@ -18,7 +18,7 @@
 namespace cosma {
 template <typename T>
 class aligned_allocator {
-public:
+  public:
     using value_type = T;
     using pointer = value_type *;
     using const_pointer = const value_type *;
@@ -38,10 +38,10 @@ class aligned_allocator {
 
     // the minimum alignment for given type T
     std::size_t min_alignment() {
-        return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void*));
+        return std::max(math_utils::next_power_of_2(sizeof(T)), sizeof(void *));
     }
 
-    // Calculate how many additional elements we have to allocate for an array 
+    // Calculate how many additional elements we have to allocate for an array
     // of length n and data type T.
     static std::size_t get_alignment_padding(std::size_t n) {
         auto alignment = get_alignment();
@@ -50,34 +50,35 @@ class aligned_allocator {
         auto remainder = (n * sizeof(T)) % alignment;
 
         // Convert the padding from bytes to the number of elements
-        remainder = remainder!=0 ? (alignment - remainder) / sizeof(T) : 0;
+        remainder = remainder != 0 ? (alignment - remainder) / sizeof(T) : 0;
 
-        // std::cout << "For size " << n << ", reminder = " << remainder << std::endl;
-        // std::cout << "sizeof(T) = " << sizeof(T) << std::endl;
+        // std::cout << "For size " << n << ", reminder = " << remainder <<
+        // std::endl; std::cout << "sizeof(T) = " << sizeof(T) << std::endl;
         return remainder;
     }
 
     // allocate memory with alignment specified as a template parameter
     // returns nullptr on failure
-    T* aligned_malloc(std::size_t size) {
+    T *aligned_malloc(std::size_t size) {
         auto alignment = get_alignment();
         // if alignment is disabled, use the standard malloc
         if (alignment <= 0) {
-            return reinterpret_cast<T*>(malloc(size*sizeof(T)));
+            return reinterpret_cast<T *>(malloc(size * sizeof(T)));
         }
         // check if the requested size is a multiple of the alignment
         assert(get_alignment_padding(size) == 0);
         // check if the alignment is >= min_alignment for this data type T
         assert(alignment >= min_alignment());
-        // check if the alignment is a power of 2 and a multiple of sizeof(void*).
+        // check if the alignment is a power of 2 and a multiple of
+        // sizeof(void*).
         assert(math_utils::is_power_of_2(alignment));
         // "Memory alignment must be a power of 2.");
         // This is required for the posix_memalign function.
-        assert(alignment % sizeof(void*) == 0);
+        assert(alignment % sizeof(void *) == 0);
         // "Memory alignment must be a multiple of sizeof(void*)");
         void *ptr;
-        if (posix_memalign(&ptr, alignment, size*sizeof(T)) == 0) {
-            return reinterpret_cast<T*>(ptr);
+        if (posix_memalign(&ptr, alignment, size * sizeof(T)) == 0) {
+            return reinterpret_cast<T *>(ptr);
         }
         return nullptr;
     }
@@ -94,7 +95,15 @@ class aligned_allocator {
     pointer allocate(size_type cnt,
                      typename std::allocator<void>::const_pointer = 0) {
         if (cnt > 0) {
-            pointer ptr = aligned_malloc(cnt);
+            pointer ptr;
+            if (!cosma::get_unified_memory()) {
+                ptr = aligned_malloc(cnt);
+	    }
+#if defined(COSMA_USE_UNIFIED_MEMORY)
+            else {
+                hipMalloc(&ptr, cnt * sizeof(T));
+	    }
+#endif
             return ptr;
         }
         return nullptr;
@@ -102,7 +111,12 @@ class aligned_allocator {
 
     void deallocate(pointer p, size_type cnt) {
         if (p) {
-            std::free(p);
+            if (!cosma::get_unified_memory())
+                std::free(p);
+#if defined(COSMA_USE_UNIFIED_MEMORY)
+            else
+                hipFree(p);
+#endif
         }
     }
 
@@ -110,9 +124,7 @@ class aligned_allocator {
         return std::numeric_limits<size_type>::max() / sizeof(T);
     }
 
-    void construct(pointer p, const T &t) {
-        new (p) T(t);
-    }
+    void construct(pointer p, const T &t) { new (p) T(t); }
 
     void destroy(pointer p) {
         if (p) {
diff --git a/src/cosma/context.cpp b/src/cosma/context.cpp
index 3a3f027..5e6462c 100644
--- a/src/cosma/context.cpp
+++ b/src/cosma/context.cpp
@@ -9,7 +9,7 @@
 namespace cosma {
 #ifdef COSMA_HAVE_GPU
 template <typename Scalar>
-gpu::mm_handle<Scalar>* cosma_context<Scalar>::get_gpu_context() {
+gpu::mm_handle<Scalar> *cosma_context<Scalar>::get_gpu_context() {
     return gpu_ctx_.get();
 }
 #endif
@@ -21,26 +21,29 @@ cosma_context<Scalar>::cosma_context() {
     overlap_comm_and_comp = get_overlap_comm_and_comp();
     pin_host_buffers = get_memory_pinning();
 #ifdef COSMA_HAVE_GPU
-    gpu_ctx_ = gpu::make_context<Scalar>(gpu_streams(),
-                                         gpu_max_tile_m(),
-                                         gpu_max_tile_n(),
-                                         gpu_max_tile_k());
+    gpu_ctx_ = gpu::make_context<Scalar>(
+        gpu_streams(), gpu_max_tile_m(), gpu_max_tile_n(), gpu_max_tile_k());
 #endif
 }
 
 template <typename Scalar>
-cosma_context<Scalar>::cosma_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k) {
-    cpu_memory_limit = (long long) cpu_mem_limit;
+cosma_context<Scalar>::cosma_context(size_t cpu_mem_limit,
+                                     int streams,
+                                     int tile_m,
+                                     int tile_n,
+                                     int tile_k) {
+    cpu_memory_limit = (long long)cpu_mem_limit;
     adapt_to_scalapack_strategy = get_adapt_strategy();
     overlap_comm_and_comp = get_overlap_comm_and_comp();
     pin_host_buffers = get_memory_pinning();
     memory_pool_.amortization = get_memory_pool_amortization();
     // do not reserve nor resize the memory pool
     // let this just serve as the upper bound when creating a strategy
-    // because otherwise, it might reserve/resize to much more than the problem requires
-    // memory_pool_.resize(cpu_mem_limit);
+    // because otherwise, it might reserve/resize to much more than the problem
+    // requires memory_pool_.resize(cpu_mem_limit);
 #ifdef COSMA_HAVE_GPU
     gpu_ctx_ = gpu::make_context<Scalar>(streams, tile_m, tile_n, tile_k);
+    use_unified_memory_ = cosma::get_unified_memory();
 #else
     std::cout << "Ignoring parameters in make_context. These parameters only "
                  "used in the CPU version."
@@ -59,7 +62,7 @@ cosma_context<Scalar>::~cosma_context() {
 }
 
 template <typename Scalar>
-memory_pool<Scalar>& cosma_context<Scalar>::get_memory_pool() {
+memory_pool<Scalar> &cosma_context<Scalar>::get_memory_pool() {
     return memory_pool_;
 }
 
@@ -69,14 +72,15 @@ long long cosma_context<Scalar>::get_cpu_memory_limit() {
 }
 
 template <typename Scalar>
-cosma::communicator* cosma_context<Scalar>::get_cosma_comm() {
+cosma::communicator *cosma_context<Scalar>::get_cosma_comm() {
     return prev_cosma_comm.get();
 }
 
 template <typename Scalar>
 void cosma_context<Scalar>::register_state(MPI_Comm comm,
                                            const Strategy strategy) {
-    if (comm == MPI_COMM_NULL) return;
+    if (comm == MPI_COMM_NULL)
+        return;
 
     int same_comm = 0;
 
@@ -90,22 +94,22 @@ void cosma_context<Scalar>::register_state(MPI_Comm comm,
         MPI_Comm prev_comm = prev_cosma_comm->full_comm();
         int comm_compare;
         MPI_Comm_compare(prev_comm, comm, &comm_compare);
-        same_comm = comm_compare == MPI_CONGRUENT ||
-                    comm_compare == MPI_IDENT;
+        same_comm = comm_compare == MPI_CONGRUENT || comm_compare == MPI_IDENT;
 
-  bool same_strategy = strategy == prev_strategy;
+        bool same_strategy = strategy == prev_strategy;
 
         // if same_comm and same strategy -> reuse the communicators
         if (!same_comm || !same_strategy) {
             prev_strategy = strategy;
 
             PE(preprocessing_communicators);
-            prev_cosma_comm = std::make_unique<cosma::communicator>(strategy, comm);
+            prev_cosma_comm =
+                std::make_unique<cosma::communicator>(strategy, comm);
             PL();
 
-      memory_pool_.unpin_all();
-      memory_pool_.already_pinned = false;
-      memory_pool_.resized = false;
+            memory_pool_.unpin_all();
+            memory_pool_.already_pinned = false;
+            memory_pool_.resized = false;
         }
     }
 
@@ -113,15 +117,8 @@ void cosma_context<Scalar>::register_state(MPI_Comm comm,
     // if (prev_cosma_comm->is_idle()) return;
 
 #ifdef COSMA_HAVE_GPU
-    if (
-            !prev_cosma_comm->is_idle()
-                &&
-            !memory_pool_.resized
-                &&
-            same_comm
-                &&
-            strategy == prev_strategy
-        ) {
+    if (!prev_cosma_comm->is_idle() && !memory_pool_.resized && same_comm &&
+        strategy == prev_strategy) {
         memory_pool_.already_pinned = true;
     }
 #endif
@@ -139,8 +136,13 @@ context<Scalar> make_context() {
 }
 
 template <typename Scalar>
-context<Scalar> make_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k) {
-    return std::make_unique<cosma_context<Scalar>>(cpu_mem_limit, streams, tile_m, tile_n, tile_k);
+context<Scalar> make_context(size_t cpu_mem_limit,
+                             int streams,
+                             int tile_m,
+                             int tile_n,
+                             int tile_k) {
+    return std::make_unique<cosma_context<Scalar>>(
+        cpu_mem_limit, streams, tile_m, tile_n, tile_k);
 }
 
 // Meyer's singleton, thread-safe in C++11, but not in C++03.
@@ -171,29 +173,29 @@ template context<zfloat> make_context();
 template context<zdouble> make_context();
 
 template context<float> make_context(size_t cpu_mem_limit,
-                                            int streams,
-                                            int tile_m,
-                                            int tile_n,
-                                            int tile_k);
+                                     int streams,
+                                     int tile_m,
+                                     int tile_n,
+                                     int tile_k);
 template context<double> make_context(size_t cpu_mem_limit,
-                                             int streams,
-                                             int tile_m,
-                                             int tile_n,
-                                             int tile_k);
+                                      int streams,
+                                      int tile_m,
+                                      int tile_n,
+                                      int tile_k);
 template context<zfloat> make_context(size_t cpu_mem_limit,
-                                             int streams,
-                                             int tile_m,
-                                             int tile_n,
-                                             int tile_k);
+                                      int streams,
+                                      int tile_m,
+                                      int tile_n,
+                                      int tile_k);
 template context<zdouble> make_context(size_t cpu_mem_limit,
-                                              int streams,
-                                              int tile_m,
-                                              int tile_n,
-                                              int tile_k);
+                                       int streams,
+                                       int tile_m,
+                                       int tile_n,
+                                       int tile_k);
 
 // template instantiation for get_context_instance
 template global_context<float> get_context_instance();
 template global_context<double> get_context_instance();
 template global_context<zfloat> get_context_instance();
 template global_context<zdouble> get_context_instance();
-}
+} // namespace cosma
diff --git a/src/cosma/context.hpp b/src/cosma/context.hpp
index 8f756ec..e15c760 100644
--- a/src/cosma/context.hpp
+++ b/src/cosma/context.hpp
@@ -1,14 +1,14 @@
 #pragma once
-#include <iostream>
-#include <memory>
 #include <cosma/memory_pool.hpp>
 #include <cosma/strategy.hpp>
+#include <iostream>
+#include <memory>
 
 #include <mpi.h>
 
 #ifdef COSMA_HAVE_GPU
-#include <Tiled-MM/tiled_mm.hpp>
 #include <Tiled-MM/device_stream.hpp>
+#include <Tiled-MM/tiled_mm.hpp>
 #endif
 
 namespace cosma {
@@ -18,25 +18,30 @@ class communicator;
 
 template <typename Scalar>
 class cosma_context {
-public:
+  public:
     cosma_context();
-    cosma_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k);
+    cosma_context(size_t cpu_mem_limit,
+                  int streams,
+                  int tile_m,
+                  int tile_n,
+                  int tile_k);
     ~cosma_context();
 
-    void register_state(MPI_Comm comm, 
-                        const Strategy strategy);
+    void register_state(MPI_Comm comm, const Strategy strategy);
 
-    memory_pool<Scalar>& get_memory_pool();
+    memory_pool<Scalar> &get_memory_pool();
 #ifdef COSMA_HAVE_GPU
-    gpu::mm_handle<Scalar>* get_gpu_context();
+    gpu::mm_handle<Scalar> *get_gpu_context();
 #endif
 
-    cosma::communicator* get_cosma_comm();
+    cosma::communicator *get_cosma_comm();
 
     long long get_cpu_memory_limit();
 
     void turn_on_output();
 
+    bool unified_memory();
+
     bool adapt_to_scalapack_strategy = true;
 
     bool overlap_comm_and_comp = false;
@@ -47,7 +52,7 @@ class cosma_context {
     gpu::device_stream gpu_stream;
 #endif
 
-private:
+  private:
     long long cpu_memory_limit = std::numeric_limits<long long>::max();
     memory_pool<Scalar> memory_pool_;
 #ifdef COSMA_HAVE_GPU
@@ -55,12 +60,13 @@ class cosma_context {
     // gpu::mm_handle<Scalar> gpu_ctx_;
 #endif
     bool output = false;
+    bool use_unified_memory_ = false;
     Strategy prev_strategy;
     std::unique_ptr<cosma::communicator> prev_cosma_comm;
 };
 
 template <typename Scalar>
-using global_context = cosma_context<Scalar>*;
+using global_context = cosma_context<Scalar> *;
 
 template <typename Scalar>
 using context = std::unique_ptr<cosma_context<Scalar>>;
@@ -69,7 +75,11 @@ template <typename Scalar>
 context<Scalar> make_context();
 
 template <typename Scalar>
-context<Scalar> make_context(size_t cpu_mem_limit, int streams, int tile_m, int tile_n, int tile_k);
+context<Scalar> make_context(size_t cpu_mem_limit,
+                             int streams,
+                             int tile_m,
+                             int tile_n,
+                             int tile_k);
 
 // Meyer's singleton, thread-safe in C++11, but not in C++03.
 // The thread-safety is guaranteed by the standard in C++11:
diff --git a/src/cosma/environment_variables.cpp b/src/cosma/environment_variables.cpp
index 2277b84..aee1fb6 100644
--- a/src/cosma/environment_variables.cpp
+++ b/src/cosma/environment_variables.cpp
@@ -1,29 +1,27 @@
-#include <cosma/environment_variables.hpp>
 #include <algorithm>
+#include <cosma/environment_variables.hpp>
 
-bool cosma::env_var_defined(const char* var_name) {
-    char* var = getenv (var_name);
+bool cosma::env_var_defined(const char *var_name) {
+    char *var = getenv(var_name);
     return var != nullptr;
 }
 
 bool cosma::get_bool_env_var(std::string name, bool default_value) {
-    char* var;
+    char *var;
     var = getenv(name.c_str());
     bool value = default_value;
     if (var != nullptr) {
         std::string s(var);
-        std::transform(s.begin(), s.end(), s.begin(), 
-            [&](char c) {
-                return std::toupper(c);
-            }
-        );
+        std::transform(s.begin(), s.end(), s.begin(), [&](char c) {
+            return std::toupper(c);
+        });
         value = (s == "ON");
     }
     return value;
 }
 
 int cosma::get_int_env_var(std::string name, int default_value) {
-    char* var;
+    char *var;
     var = getenv(name.c_str());
     int value = default_value;
     if (var != nullptr)
@@ -32,7 +30,7 @@ int cosma::get_int_env_var(std::string name, int default_value) {
 }
 
 float cosma::get_float_env_var(std::string name, float default_value) {
-    char* var;
+    char *var;
     var = getenv(name.c_str());
     float value = default_value;
     if (var != nullptr)
@@ -41,7 +39,7 @@ float cosma::get_float_env_var(std::string name, float default_value) {
 }
 
 double cosma::get_double_env_var(std::string name, double default_value) {
-    char* var;
+    char *var;
     var = getenv(name.c_str());
     double value = default_value;
     if (var != nullptr)
@@ -50,12 +48,12 @@ double cosma::get_double_env_var(std::string name, double default_value) {
 }
 
 std::size_t cosma::get_ull_env_var(std::string name, size_t default_value) {
-    char* var;
+    char *var;
     var = getenv(name.c_str());
     size_t value = default_value;
     if (var != nullptr)
         value = std::stoull(std::string(var));
-    return std::size_t (value);
+    return std::size_t(value);
 }
 
 int cosma::gpu_streams() {
@@ -84,8 +82,7 @@ bool cosma::get_adapt_strategy() {
 }
 
 bool cosma::get_overlap_comm_and_comp() {
-    return get_bool_env_var(env_var_names::overlap,
-                            env_var_defaults::overlap);
+    return get_bool_env_var(env_var_names::overlap, env_var_defaults::overlap);
 }
 
 bool cosma::get_memory_pinning() {
@@ -93,9 +90,14 @@ bool cosma::get_memory_pinning() {
                             env_var_defaults::memory_pinning_enabled);
 }
 
+bool cosma::get_unified_memory() {
+    return get_bool_env_var(env_var_names::cosma_gpu_unified_memory,
+                            env_var_defaults::unified_memory);
+}
+
 double cosma::get_memory_pool_amortization() {
     return get_double_env_var(env_var_names::memory_pool_amortization,
-                           env_var_defaults::memory_pool_amortization);
+                              env_var_defaults::memory_pool_amortization);
 }
 
 int cosma::get_min_local_dimension() {
@@ -117,7 +119,7 @@ int cosma::get_cosma_cpu_memory_alignment() {
 // and converts the limit to #elements that each rank is allowed to use
 template <typename T>
 long long cosma::get_cpu_max_memory() {
-    char* var;
+    char *var;
     var = getenv(env_var_names::cpu_max_memory.c_str());
     long long value = env_var_defaults::cpu_max_memory;
     long long megabytes = env_var_defaults::cpu_max_memory;
@@ -135,4 +137,3 @@ template long long cosma::get_cpu_max_memory<float>();
 template long long cosma::get_cpu_max_memory<double>();
 template long long cosma::get_cpu_max_memory<std::complex<float>>();
 template long long cosma::get_cpu_max_memory<std::complex<double>>();
-
diff --git a/src/cosma/environment_variables.hpp b/src/cosma/environment_variables.hpp
index e486407..78561b7 100644
--- a/src/cosma/environment_variables.hpp
+++ b/src/cosma/environment_variables.hpp
@@ -1,100 +1,103 @@
 #pragma once
-#include <string>
-#include <limits>
 #include <complex>
-#include <stdlib.h>
 #include <limits>
+#include <stdlib.h>
+#include <string>
 
 namespace cosma {
 
 // names of supported environment variables
 namespace env_var_names {
-    // number of GPU streams to be used per rank
-    const std::string gpu_n_streams = "COSMA_GPU_STREAMS";
-    // max sizes of GPU tiles (in #elements)
-    // MxN corresponds to matrix C and K to the shared dimension
-    const std::string gpu_tile_m = "COSMA_GPU_MAX_TILE_M";
-    const std::string gpu_tile_n = "COSMA_GPU_MAX_TILE_N";
-    const std::string gpu_tile_k = "COSMA_GPU_MAX_TILE_K";
-    // if ON, COSMA will try to natively use scalapack layout
-    // without transformation. Only used in the pxgemm wrapper.
-    const std::string adapt_strategy = "COSMA_ADAPT_STRATEGY";
-    // if ON, COSMA will try to overlap communication and computation
-    const std::string overlap = "COSMA_OVERLAP_COMM_AND_COMP";
-    // specifies the maximum available CPU memory per rank in MB
-    const std::string cpu_max_memory = "COSMA_CPU_MAX_MEMORY";
-    // if true, local host matrices will be pinned 
-    // (only used when GPU backend enabled)
-    // which increases the efficiency
-    const std::string memory_pinning_enabled = "COSMA_GPU_MEMORY_PINNING";
-    // The scaling factor used for the memory-pool allocation size.(cpu-only).
-    // If amortization = 1.2, then the memory allocator
-    // will request 1.2x the requested size (thus, 20% more than needed).
-    // Higher values better amortize the cost of memory buffers resizing
-    // which can occur when the algorithm is invoked for different matrix sizes.
-    // However, higher amortization values also mean that 
-    // potentially more memory is allocated than used which can be
-    // a problem when the memory resource is tight.
-    // There is just a single memory pool in COSMA and all the required 
-    // memory is taken from this memory pool only.
-    const std::string memory_pool_amortization = "COSMA_MEMORY_POOL_AMORTIZATION";
-    // minimum local matrix size -- if P is too large, so that after
-    // splitting the local matrix size get lower than this, 
-    // then P will be reduced so that the problem size
-    // never gets smaller than specified by this variable
-    const std::string min_local_dimension = "COSMA_MIN_LOCAL_DIMENSION";
-    // if any dimension is smaller than this threshold, it will be dispatched to SCALAPACK
-    // since it's too "thin" for COSMA in that case
-    const std::string cosma_dim_threshold = "COSMA_DIM_THRESHOLD";
-    // number of bytes to which all host buffers are aligned
-    const std::string cosma_cpu_memory_alignment = "COSMA_CPU_MEMORY_ALIGNMENT";
-};
+// number of GPU streams to be used per rank
+const std::string gpu_n_streams = "COSMA_GPU_STREAMS";
+// max sizes of GPU tiles (in #elements)
+// MxN corresponds to matrix C and K to the shared dimension
+const std::string gpu_tile_m = "COSMA_GPU_MAX_TILE_M";
+const std::string gpu_tile_n = "COSMA_GPU_MAX_TILE_N";
+const std::string gpu_tile_k = "COSMA_GPU_MAX_TILE_K";
+// if ON, COSMA will try to natively use scalapack layout
+// without transformation. Only used in the pxgemm wrapper.
+const std::string adapt_strategy = "COSMA_ADAPT_STRATEGY";
+// if ON, COSMA will try to overlap communication and computation
+const std::string overlap = "COSMA_OVERLAP_COMM_AND_COMP";
+// specifies the maximum available CPU memory per rank in MB
+const std::string cpu_max_memory = "COSMA_CPU_MAX_MEMORY";
+// if true, local host matrices will be pinned
+// (only used when GPU backend enabled)
+// which increases the efficiency
+const std::string memory_pinning_enabled = "COSMA_GPU_MEMORY_PINNING";
+// The scaling factor used for the memory-pool allocation size.(cpu-only).
+// If amortization = 1.2, then the memory allocator
+// will request 1.2x the requested size (thus, 20% more than needed).
+// Higher values better amortize the cost of memory buffers resizing
+// which can occur when the algorithm is invoked for different matrix sizes.
+// However, higher amortization values also mean that
+// potentially more memory is allocated than used which can be
+// a problem when the memory resource is tight.
+// There is just a single memory pool in COSMA and all the required
+// memory is taken from this memory pool only.
+const std::string memory_pool_amortization = "COSMA_MEMORY_POOL_AMORTIZATION";
+// minimum local matrix size -- if P is too large, so that after
+// splitting the local matrix size get lower than this,
+// then P will be reduced so that the problem size
+// never gets smaller than specified by this variable
+const std::string min_local_dimension = "COSMA_MIN_LOCAL_DIMENSION";
+// if any dimension is smaller than this threshold, it will be dispatched to
+// SCALAPACK since it's too "thin" for COSMA in that case
+const std::string cosma_dim_threshold = "COSMA_DIM_THRESHOLD";
+// number of bytes to which all host buffers are aligned
+const std::string cosma_cpu_memory_alignment = "COSMA_CPU_MEMORY_ALIGNMENT";
+// IF ON, use unified memory
+const std::string cosma_gpu_unified_memory = "COSMA_GPU_UNIFIED_MEMORY";
+}; // namespace env_var_names
 
 // default values of supported environment variables
 namespace env_var_defaults {
-    // number of GPU streams to be used per rank
-    const int gpu_n_streams = 2;
-    // max sizes of GPU tiles (in #elements)
-    // MxN corresponds to matrix C and K to the shared dimension
-    const int gpu_tile_m = 5000;
-    const int gpu_tile_n = 5000;
-    const int gpu_tile_k = 5000;
-    // if ON, COSMA will try to natively use scalapack layout
-    // without transformation. Only used in the pxgemm wrapper.
-    const bool adapt_strategy = true;
-    // if ON, COSMA will try to overlap communication and computation
-    const bool overlap = false;
-    // specifies the maximum available CPU memory per rank in MB
-    const long long cpu_max_memory = std::numeric_limits<long long>::max(); // inf
-    // if true, local host matrices will be pinned 
-    // (only used when GPU backend enabled)
-    // which increases the efficiency
-    const bool memory_pinning_enabled = true;
-    // The scaling factor used for the memory-pool allocation size.(cpu-only).
-    // If amortization = 1.2, then the memory allocator
-    // will request 1.2x the requested size (thus, 20% more than needed).
-    // Higher values better amortize the cost of memory buffers resizing
-    // which can occur when the algorithm is invoked for different matrix sizes.
-    // However, higher amortization values also mean that 
-    // potentially more memory is allocated than used which can be
-    // a problem when the memory resource is tight.
-    // There is just a single memory pool in COSMA and all the required 
-    // memory is taken from this memory pool only.
-    const double memory_pool_amortization = 1.2;
-    // minimum local matrix size -- if P is too large, so that after
-    // splitting the local matrix size get lower than this, 
-    // then P will be reduced so that the problem size
-    // never gets smaller than specified by this variable
-    const int min_local_dimension = 200;
-    // if any dimension is smaller than this threshold, it will be dispatched to SCALAPACK
-    // since it's too "thin" for COSMA in that case
-    const int cosma_dim_threshold = 0;
-    // cpu memory alignment (currently disabled)
-    const int cosma_cpu_memory_alignment = 0; // 256;
-};
+// number of GPU streams to be used per rank
+const int gpu_n_streams = 2;
+// max sizes of GPU tiles (in #elements)
+// MxN corresponds to matrix C and K to the shared dimension
+const int gpu_tile_m = 5000;
+const int gpu_tile_n = 5000;
+const int gpu_tile_k = 5000;
+// if ON, COSMA will try to natively use scalapack layout
+// without transformation. Only used in the pxgemm wrapper.
+const bool adapt_strategy = true;
+// if ON, COSMA will try to overlap communication and computation
+const bool overlap = false;
+// specifies the maximum available CPU memory per rank in MB
+const long long cpu_max_memory = std::numeric_limits<long long>::max(); // inf
+// if true, local host matrices will be pinned
+// (only used when GPU backend enabled)
+// which increases the efficiency
+const bool memory_pinning_enabled = true;
+// The scaling factor used for the memory-pool allocation size.(cpu-only).
+// If amortization = 1.2, then the memory allocator
+// will request 1.2x the requested size (thus, 20% more than needed).
+// Higher values better amortize the cost of memory buffers resizing
+// which can occur when the algorithm is invoked for different matrix sizes.
+// However, higher amortization values also mean that
+// potentially more memory is allocated than used which can be
+// a problem when the memory resource is tight.
+// There is just a single memory pool in COSMA and all the required
+// memory is taken from this memory pool only.
+const double memory_pool_amortization = 1.2;
+// minimum local matrix size -- if P is too large, so that after
+// splitting the local matrix size get lower than this,
+// then P will be reduced so that the problem size
+// never gets smaller than specified by this variable
+const int min_local_dimension = 200;
+// if any dimension is smaller than this threshold, it will be dispatched to
+// SCALAPACK since it's too "thin" for COSMA in that case
+const int cosma_dim_threshold = 0;
+// cpu memory alignment (currently disabled)
+const int cosma_cpu_memory_alignment = 0; // 256;
+// gpu unified memory mechanism
+const bool unified_memory = false;
+}; // namespace env_var_defaults
 
 // checks if the specified environment variable is defined
-bool env_var_defined(const char* var_name);
+bool env_var_defined(const char *var_name);
 
 // checks if the environment variable with given name
 // is set to ON or OFF. If the variable is not defined,
@@ -145,13 +148,13 @@ bool get_overlap_comm_and_comp();
 // will request 1.2x the requested size (thus, 20% more than needed).
 // Higher values better amortize the cost of memory buffers resizing
 // which can occur when the algorithm is invoked for different matrix sizes.
-// However, higher amortization values also mean that 
+// However, higher amortization values also mean that
 // potentially more memory is allocated than used which can be
 // a problem when the memory resource is tight.
 double get_memory_pool_amortization();
 
 // reads the environment variable corresponding to
-// the memory limit in MB per rank, converts the limit 
+// the memory limit in MB per rank, converts the limit
 // to #elements that each rank is allowed to use.
 // returns the default value if the variable is undefined
 template <typename T>
@@ -178,4 +181,6 @@ int get_cosma_dim_threshold();
 // number of bytes to which all the buffers should be aligned
 int get_cosma_cpu_memory_alignment();
 
-}
+// check if we use unified memory or not
+bool get_unified_memory();
+} // namespace cosma
diff --git a/src/cosma/local_multiply.cpp b/src/cosma/local_multiply.cpp
index d0bbed5..780e347 100644
--- a/src/cosma/local_multiply.cpp
+++ b/src/cosma/local_multiply.cpp
@@ -6,6 +6,10 @@
 #ifdef COSMA_HAVE_GPU
 #include <Tiled-MM/tiled_mm.hpp>
 #include <Tiled-MM/util.hpp>
+
+#ifdef COSMA_USE_UNIFIED_MEMORY
+#include <Tiled-MM/gpu_blas_api.hpp>
+#endif
 #endif
 
 #if defined(COSMA_WITH_BLAS) || defined(COSMA_WITH_MKL_BLAS)
@@ -23,6 +27,143 @@ namespace cosma {
 using clock_t = std::chrono::high_resolution_clock;
 using ms_t = std::chrono::milliseconds;
 
+#ifdef COSMA_USE_UNIFIED_MEMORY
+using zfloat = std::complex<float>;
+using zdouble = std::complex<double>;
+
+int get_first(char trans, int m, int n) { return trans == 'N' ? m : n; }
+
+int get_second(char trans, int m, int n) { return trans == 'N' ? n : m; }
+
+gpu::blas_api::OperationType get_blas_operation(char trans) {
+    gpu::blas_api::OperationType op =
+        trans == 'T'
+            ? gpu::blas_api::operation::Transpose
+            : (trans == 'C' ? gpu::blas_api::operation::ConjugateTranspose
+                            : gpu::blas_api::operation::None);
+    return op;
+}
+
+gpu::blas_api::StatusType cublas_gemm_wrapper(gpu::blas_api::HandleType handle,
+                                              char trans_a,
+                                              char trans_b,
+                                              int m,
+                                              int n,
+                                              int k,
+                                              const float *alpha,
+                                              const float *a,
+                                              const float *b,
+                                              const float *beta,
+                                              float *c,
+                                              int lld_c) {
+    gpu::blas_api::OperationType op_a = get_blas_operation(trans_a);
+    gpu::blas_api::OperationType op_b = get_blas_operation(trans_b);
+
+    int ld_a = get_first(trans_a, m, k);
+    int ld_b = get_first(trans_b, k, n);
+
+    return gpu::blas_api::sgemm(
+        handle, op_a, op_b, m, n, k, alpha, a, ld_a, b, ld_b, beta, c, lld_c);
+}
+
+gpu::blas_api::StatusType cublas_gemm_wrapper(gpu::blas_api::HandleType handle,
+                                              char trans_a,
+                                              char trans_b,
+                                              int m,
+                                              int n,
+                                              int k,
+                                              const double *alpha,
+                                              const double *a,
+                                              const double *b,
+                                              const double *beta,
+                                              double *c,
+                                              int lld_c) {
+    gpu::blas_api::OperationType op_a = get_blas_operation(trans_a);
+    gpu::blas_api::OperationType op_b = get_blas_operation(trans_b);
+
+    int ld_a = get_first(trans_a, m, k);
+    int ld_b = get_first(trans_b, k, n);
+
+    return gpu::blas_api::dgemm(
+        handle, op_a, op_b, m, n, k, alpha, a, ld_a, b, ld_b, beta, c, lld_c);
+}
+
+// Note: Converting from std::complex to cuComplex and cuDoubleComple
+//       works because they are binary compatible.
+//
+//       http://icl.cs.utk.edu/magma/forum/viewtopic.php?f=2&t=902
+//
+gpu::blas_api::StatusType cublas_gemm_wrapper(gpu::blas_api::HandleType handle,
+                                              char trans_a,
+                                              char trans_b,
+                                              int m,
+                                              int n,
+                                              int k,
+                                              const zfloat *alpha,
+                                              const zfloat *a,
+                                              const zfloat *b,
+                                              const zfloat *beta,
+                                              zfloat *c,
+                                              int lld_c) {
+    gpu::blas_api::OperationType op_a = get_blas_operation(trans_a);
+    gpu::blas_api::OperationType op_b = get_blas_operation(trans_b);
+
+    int ld_a = get_first(trans_a, m, k);
+    int ld_b = get_first(trans_b, k, n);
+
+    return gpu::blas_api::cgemm(
+        handle,
+        op_a,
+        op_b,
+        m,
+        n,
+        k,
+        reinterpret_cast<const gpu::blas_api::ComplexFloatType *>(alpha),
+        reinterpret_cast<const gpu::blas_api::ComplexFloatType *>(a),
+        ld_a,
+        reinterpret_cast<const gpu::blas_api::ComplexFloatType *>(b),
+        ld_b,
+        reinterpret_cast<const gpu::blas_api::ComplexFloatType *>(beta),
+        reinterpret_cast<gpu::blas_api::ComplexFloatType *>(c),
+        lld_c);
+}
+
+gpu::blas_api::StatusType cublas_gemm_wrapper(gpu::blas_api::HandleType handle,
+                                              char trans_a,
+                                              char trans_b,
+                                              int m,
+                                              int n,
+                                              int k,
+                                              const zdouble *alpha,
+                                              const zdouble *a,
+                                              const zdouble *b,
+                                              const zdouble *beta,
+                                              zdouble *c,
+                                              int lld_c) {
+    gpu::blas_api::OperationType op_a = get_blas_operation(trans_a);
+    gpu::blas_api::OperationType op_b = get_blas_operation(trans_b);
+
+    int ld_a = get_first(trans_a, m, k);
+    int ld_b = get_first(trans_b, k, n);
+
+    return gpu::blas_api::zgemm(
+        handle,
+        op_a,
+        op_b,
+        m,
+        n,
+        k,
+        reinterpret_cast<const gpu::blas_api::ComplexDoubleType *>(alpha),
+        reinterpret_cast<const gpu::blas_api::ComplexDoubleType *>(a),
+        ld_a,
+        reinterpret_cast<const gpu::blas_api::ComplexDoubleType *>(b),
+        ld_b,
+        reinterpret_cast<const gpu::blas_api::ComplexDoubleType *>(beta),
+        reinterpret_cast<gpu::blas_api::ComplexDoubleType *>(c),
+        lld_c);
+}
+#endif
+
 template <typename Scalar>
 void print_matrix(int m, int n, Scalar *A, char label) {
     std::cout << "Matrix " << label << std::endl;
@@ -76,7 +217,7 @@ clock_t::time_point debug_gemm_end(Scalar *matrixA,
 
 #ifdef COSMA_HAVE_GPU
 template <typename Scalar>
-void local_multiply(gpu::mm_handle<Scalar>* gpu_ctx,
+void local_multiply(gpu::mm_handle<Scalar> *gpu_ctx,
                     Scalar *matrixA,
                     Scalar *matrixB,
                     Scalar *matrixC,
@@ -93,47 +234,62 @@ void local_multiply(gpu::mm_handle<Scalar>* gpu_ctx,
     if (rank == 0) {
         // print_matrix(m, k, matrixA, 'A');
         // print_matrix(k, n, matrixB, 'B');
-        // std::cout << "m = " << m << ", n = " << n << ", k = " << k << std::endl;
+        // std::cout << "m = " << m << ", n = " << n << ", k = " << k <<
+    std::endl;
     }
     */
     int ld_a = m;
     int ld_b = k;
     int ld_c = m;
 
-    gpu::gemm(*gpu_ctx, 'N', 'N', m, n, k, alpha, matrixA, ld_a, matrixB, ld_b, beta, matrixC, ld_c, pin_host_buffers, copy_c_back);
+    gpu::gemm(*gpu_ctx,
+              'N',
+              'N',
+              m,
+              n,
+              k,
+              alpha,
+              matrixA,
+              ld_a,
+              matrixB,
+              ld_b,
+              beta,
+              matrixC,
+              ld_c,
+              pin_host_buffers,
+              copy_c_back);
+
     /*
     if (rank == 0) {
-        gpu::copy_to_host(gpu_ctx->get_full_device_buffer_c().data(), matrixC, m * n);
-        print_matrix(m, n, matrixC, 'C');
-        std::cout << "alpha = " << alpha << ", beta = " << beta << std::endl;
+        gpu::copy_to_host(gpu_ctx->get_full_device_buffer_c().data(), matrixC, m
+    * n); print_matrix(m, n, matrixC, 'C'); std::cout << "alpha = " << alpha <<
+    ", beta = " << beta << std::endl;
     }
     */
-
 }
 #endif
 
 template <typename Scalar>
-Scalar& get_element(Scalar* mat, int m, int n, int i, int j) {
+Scalar &get_element(Scalar *mat, int m, int n, int i, int j) {
     return mat[j * m + i];
 }
 
 template <typename Scalar>
-void local_multiply_cpu(
-                    Scalar *matrixA,
-                    Scalar *matrixB,
-                    Scalar *matrixC,
-                    int m,
-                    int n,
-                    int k,
-                    Scalar alpha,
-                    Scalar beta) {
+void local_multiply_cpu(Scalar *matrixA,
+                        Scalar *matrixB,
+                        Scalar *matrixC,
+                        int m,
+                        int n,
+                        int k,
+                        Scalar alpha,
+                        Scalar beta) {
     for (int mi = 0; mi < m; ++mi) {
         for (int ni = 0; ni < n; ++ni) {
-            Scalar& Cvalue = get_element(matrixC, m, n, mi, ni);
+            Scalar &Cvalue = get_element(matrixC, m, n, mi, ni);
             Cvalue *= beta;
             for (int ki = 0; ki < k; ++ki) {
-                Scalar& Avalue = get_element(matrixA, m, k, mi, ki);
-                Scalar& Bvalue = get_element(matrixB, k, n, ki, ni);
+                Scalar &Avalue = get_element(matrixA, m, k, mi, ki);
+                Scalar &Bvalue = get_element(matrixB, k, n, ki, ni);
                 Cvalue += alpha * Avalue * Bvalue;
             }
         }
@@ -141,7 +297,7 @@ void local_multiply_cpu(
 }
 
 template <typename Scalar>
-void local_multiply(cosma_context<Scalar>* ctx,
+void local_multiply(cosma_context<Scalar> *ctx,
                     Scalar *matrixA,
                     Scalar *matrixB,
                     Scalar *matrixC,
@@ -157,22 +313,58 @@ void local_multiply(cosma_context<Scalar>* ctx,
 #endif
 
 #ifdef COSMA_HAVE_GPU
-    PE(multiply_computation_pinning);
-    if (ctx->pin_host_buffers) {
-        ctx->get_memory_pool().pin(matrixA, m * k);
-        ctx->get_memory_pool().pin(matrixB, k * n);
-        // if (copy_c_back || std::abs(beta) > 0) {
-        ctx->get_memory_pool().pin(matrixC, m * n);
-        // }
+#ifdef COSMA_USE_UNIFIED_MEMORY
+    if (ctx.unified_memory()) {
+        PE(multiply_computation_gemm);
+        auto status = cublas_gemm_wrapper(
+            ctx->get_gpu_context()->get_gpu_context().get_blas_handle(0),
+            'N',
+            'N',
+            m,
+            n,
+            k,
+            &alpha,
+            matrixA,
+            matrixB,
+            &beta,
+            matrixC,
+            m);
+
+        gpu::check_blas_status(status);
+        // we need explicit synchronization over the stream to trigger the copy
+        // back to CPU memory
+        hipStreamSynchronize(
+            ctx->get_gpu_context()->get_gpu_context().get_stream(0));
+        PL();
+    } else {
+#endif // COSMA_USE_UNIFIED_MEMORY
+        PE(multiply_computation_pinning);
+        if (ctx->pin_host_buffers) {
+            ctx->get_memory_pool().pin(matrixA, m * k);
+            ctx->get_memory_pool().pin(matrixB, k * n);
+            // if (copy_c_back || std::abs(beta) > 0) {
+            ctx->get_memory_pool().pin(matrixC, m * n);
+            // }
+        }
+        PL();
+
+        PE(multiply_computation_gemm);
+        local_multiply(ctx->get_gpu_context(),
+                       matrixA,
+                       matrixB,
+                       matrixC,
+                       m,
+                       n,
+                       k,
+                       alpha,
+                       beta,
+                       false,
+                       copy_c_back);
+        PL();
+#ifdef COSMA_USE_UNIFIED_MEMORY
     }
-    PL();
+#endif
 
-    PE(multiply_computation_gemm);
-    local_multiply(ctx->get_gpu_context(),
-                   matrixA, matrixB, matrixC,
-                   m, n, k, alpha, beta,
-                   false, copy_c_back);
-    PL();
 #else
     PE(multiply_computation_gemm);
     gemm(m, n, k, alpha, matrixA, m, matrixB, k, beta, matrixC, m);
@@ -182,8 +374,9 @@ void local_multiply(cosma_context<Scalar>* ctx,
 #ifdef DEBUG
     auto t_end =
         debug_gemm_end(matrixA, matrixB, matrixC, m, n, k, alpha, beta);
-    std::cout << "time(" << m << ", " << n << ", " << k
-              << ") = " << std::chrono::duration_cast<ms_t>(t_end - t_start).count() << std::endl;
+    std::cout << "time(" << m << ", " << n << ", " << k << ") = "
+              << std::chrono::duration_cast<ms_t>(t_end - t_start).count()
+              << std::endl;
 #endif
 }
 
@@ -198,14 +391,19 @@ void local_multiply(Scalar *matrixA,
                     Scalar beta,
                     bool copy_c_back) {
     local_multiply(get_context_instance<Scalar>(),
-                   matrixA, matrixB, matrixC,
-                   m, n, k,
-                   alpha, beta,
+                   matrixA,
+                   matrixB,
+                   matrixC,
+                   m,
+                   n,
+                   k,
+                   alpha,
+                   beta,
                    copy_c_back);
 }
 
 template <typename Scalar>
-void local_multiply(context<Scalar>& ctx,
+void local_multiply(context<Scalar> &ctx,
                     Scalar *matrixA,
                     Scalar *matrixB,
                     Scalar *matrixC,
@@ -215,7 +413,16 @@ void local_multiply(context<Scalar>& ctx,
                     Scalar alpha,
                     Scalar beta,
                     bool copy_c_back) {
-    local_multiply(ctx.get(), matrixA, matrixB, matrixC, m, n, k, alpha, beta, copy_c_back);
+    local_multiply(ctx.get(),
+                   matrixA,
+                   matrixB,
+                   matrixC,
+                   m,
+                   n,
+                   k,
+                   alpha,
+                   beta,
+                   copy_c_back);
 }
 
 // explicit template instantiation using context
@@ -266,47 +473,43 @@ local_multiply<std::complex<float>>(cosma_context<std::complex<float>> *ctx,
                                     bool copy_c_back);
 
 // explicit template instantiation using context - no pinning
-template void local_multiply_cpu<double>(
-                                     double *matrixA,
-                                     double *matrixB,
-                                     double *matrixC,
-                                     int m,
-                                     int n,
-                                     int k,
-                                     double alpha,
-                                     double beta);
-
-template void local_multiply_cpu<float>(
-                                    float *matrixA,
-                                    float *matrixB,
-                                    float *matrixC,
-                                    int m,
-                                    int n,
-                                    int k,
-                                    float alpha,
-                                    float beta);
+template void local_multiply_cpu<double>(double *matrixA,
+                                         double *matrixB,
+                                         double *matrixC,
+                                         int m,
+                                         int n,
+                                         int k,
+                                         double alpha,
+                                         double beta);
+
+template void local_multiply_cpu<float>(float *matrixA,
+                                        float *matrixB,
+                                        float *matrixC,
+                                        int m,
+                                        int n,
+                                        int k,
+                                        float alpha,
+                                        float beta);
 
 template void
-local_multiply_cpu<std::complex<double>>(
-                                     std::complex<double> *matrixA,
-                                     std::complex<double> *matrixB,
-                                     std::complex<double> *matrixC,
-                                     int m,
-                                     int n,
-                                     int k,
-                                     std::complex<double> alpha,
-                                     std::complex<double> beta);
+local_multiply_cpu<std::complex<double>>(std::complex<double> *matrixA,
+                                         std::complex<double> *matrixB,
+                                         std::complex<double> *matrixC,
+                                         int m,
+                                         int n,
+                                         int k,
+                                         std::complex<double> alpha,
+                                         std::complex<double> beta);
 
 template void
-local_multiply_cpu<std::complex<float>>(
-                                    std::complex<float> *matrixA,
-                                    std::complex<float> *matrixB,
-                                    std::complex<float> *matrixC,
-                                    int m,
-                                    int n,
-                                    int k,
-                                    std::complex<float> alpha,
-                                    std::complex<float> beta);
+local_multiply_cpu<std::complex<float>>(std::complex<float> *matrixA,
+                                        std::complex<float> *matrixB,
+                                        std::complex<float> *matrixC,
+                                        int m,
+                                        int n,
+                                        int k,
+                                        std::complex<float> alpha,
+                                        std::complex<float> beta);
 
 // explicit template instantiation using context with unique_ptr context
 template void local_multiply<double>(context<double> &ctx,
@@ -387,16 +590,15 @@ local_multiply<std::complex<double>>(std::complex<double> *matrixA,
                                      std::complex<double> beta,
                                      bool copy_c_back);
 
-template void
-local_multiply<std::complex<float>>(std::complex<float> *matrixA,
-                                    std::complex<float> *matrixB,
-                                    std::complex<float> *matrixC,
-                                    int m,
-                                    int n,
-                                    int k,
-                                    std::complex<float> alpha,
-                                    std::complex<float> beta,
-                                    bool copy_c_back);
+template void local_multiply<std::complex<float>>(std::complex<float> *matrixA,
+                                                  std::complex<float> *matrixB,
+                                                  std::complex<float> *matrixC,
+                                                  int m,
+                                                  int n,
+                                                  int k,
+                                                  std::complex<float> alpha,
+                                                  std::complex<float> beta,
+                                                  bool copy_c_back);
 
 #ifdef COSMA_HAVE_GPU
 // explicit template instantiation using gpu context
diff --git a/src/cosma/local_multiply.hpp b/src/cosma/local_multiply.hpp
index b4daa9c..f3b15fe 100644
--- a/src/cosma/local_multiply.hpp
+++ b/src/cosma/local_multiply.hpp
@@ -4,7 +4,7 @@
 namespace cosma {
 
 template <typename Scalar>
-void local_multiply(cosma_context<Scalar>* ctx,
+void local_multiply(cosma_context<Scalar> *ctx,
                     Scalar *a,
                     Scalar *b,
                     Scalar *c,
@@ -16,18 +16,17 @@ void local_multiply(cosma_context<Scalar>* ctx,
                     bool copy_c_back);
 
 template <typename Scalar>
-void local_multiply_cpu(
-                    Scalar *a,
-                    Scalar *b,
-                    Scalar *c,
-                    int m,
-                    int n,
-                    int k,
-                    Scalar alpha,
-                    Scalar beta);
+void local_multiply_cpu(Scalar *a,
+                        Scalar *b,
+                        Scalar *c,
+                        int m,
+                        int n,
+                        int k,
+                        Scalar alpha,
+                        Scalar beta);
 
 template <typename scalar>
-void local_multiply(context<scalar>& ctx,
+void local_multiply(context<scalar> &ctx,
                     scalar *a,
                     scalar *b,
                     scalar *c,
@@ -51,7 +50,7 @@ void local_multiply(scalar *a,
 
 #ifdef COSMA_HAVE_GPU
 template <typename scalar>
-void local_multiply(gpu::mm_handle<scalar>* gpu_ctx,
+void local_multiply(gpu::mm_handle<scalar> *gpu_ctx,
                     scalar *a,
                     scalar *b,
                     scalar *c,