Commit 5cc73fc

Add comprehensive BFloat16 support for AI/ML workloads
This commit adds full BFloat16 (BF16) support to COSMA, enabling memory-efficient distributed matrix multiplication for AI/ML training and inference.

Features:
- Complete BFloat16 type implementation (bfloat16 is the upper 16 bits of an IEEE 754 binary32 value, not the binary16 half-precision format)
- 50% memory bandwidth reduction compared to FP32
- Same dynamic range as FP32 (8-bit exponent)
- MPI communication support using MPI_UINT16_T
- Full template instantiation across all COSMA components
- Integration with COSTA BF16 grid transformation library

Implementation:
- Core type: src/cosma/bfloat16.hpp (180 lines)
- Matrix operations: multiply, local_multiply, buffer, context
- Communication: MPI broadcast, reduce, allreduce for BF16
- BLAS integration: backend routing with OpenBLAS/MKL support
- COSTA integration: updated submodule with BF16 transforms

Testing (28/28 passing ✅):
- Basic tests: 6/6 (type properties, conversions, arithmetic)
- MPI tests: 10/10 (broadcast, reduce, allreduce, send/recv)
- COSTA tests: 12/12 (grid transformations, templates)
- Integration: miniapp with --type=bfloat16 support

Performance:
- 50% memory footprint reduction vs FP32
- ~2-3 significant decimal digits of precision (7-bit mantissa, vs ~7 digits for FP32)
- Well-suited to neural network training and inference
- Tested on 1-16 MPI ranks with matrices up to 10,000×10,000

Documentation:
- README.md: added BF16 feature description and usage examples
- CI configuration: added BF16 testing to the pipeline
- Implementation plan: docs/BF16_IMPLEMENTATION_PLAN.md

Dependencies:
- COSTA submodule updated to commit 187a918 with BF16 support
- COSTA upstream PR: eth-cscs/COSTA#30

Files modified: 27 (22 core + 5 new)
Lines changed: 2,236 insertions, 514 deletions
Upstream PR: eth-cscs#155

Developed for the Llaminar LLM inference engine and contributed back to COSMA to benefit the scientific computing and AI/ML communities.
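The core type lives in `src/cosma/bfloat16.hpp`, which is not reproduced on this page. As a rough illustration of the conversion scheme such a type implements (a minimal sketch with a hypothetical `bf16` name, not the actual COSMA code), bfloat16 keeps the top 16 bits of an IEEE 754 binary32 value:

```cpp
#include <cstdint>
#include <cstring>

// Illustrative only: a hypothetical `bf16`, not COSMA's src/cosma/bfloat16.hpp.
// bfloat16 = the top 16 bits of binary32: 1 sign bit, 8 exponent bits, 7 mantissa bits.
struct bf16 {
    std::uint16_t bits;

    // Narrow a float, rounding to nearest-even on the 16 dropped bits.
    static bf16 from_float(float f) {
        std::uint32_t u;
        std::memcpy(&u, &f, sizeof u);
        if ((u & 0x7F800000u) == 0x7F800000u) {       // Inf or NaN: truncate,
            std::uint16_t hi = static_cast<std::uint16_t>(u >> 16);
            if (u & 0x007FFFFFu) hi |= 0x0040;        // but keep NaN a NaN
            return bf16{hi};
        }
        u += 0x7FFFu + ((u >> 16) & 1u);              // round to nearest even
        return bf16{static_cast<std::uint16_t>(u >> 16)};
    }

    // Widening back to float is exact: shift into the high half.
    float to_float() const {
        std::uint32_t u = static_cast<std::uint32_t>(bits) << 16;
        float f;
        std::memcpy(&f, &u, sizeof f);
        return f;
    }
};
```

Because the exponent field is identical to FP32's, overflow behaviour matches single precision; only mantissa precision is lost, which is why the commit can quote the same dynamic range at half the memory traffic.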
1 parent 13ed177 · commit 5cc73fc · 27 files changed: +2,236 −514 lines
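Also from the feature list above: BLAS backend routing. Standard CPU BLAS has no bfloat16 `gemm` entry point (MKL offers `cblas_gemm_bf16bf16f32` and OpenBLAS an `sbgemm` variant, depending on version), so a common portable fallback widens to FP32, calls `sgemm`, and narrows the result. The sketch below shows that fallback, reusing the hypothetical `bf16` from the previous block; it is an assumption about one possible routing path, not necessarily what this commit's backend does:

```cpp
#include <cstddef>
#include <vector>
#include <cblas.h>   // OpenBLAS-style CBLAS header; MKL exposes the same call via mkl_cblas.h

// Sketch only: C = A * B for row-major bf16 matrices, accumulated in FP32.
// `bf16` is the illustrative struct sketched earlier on this page.
void gemm_bf16_via_sgemm(int m, int n, int k,
                         const bf16* A, const bf16* B, bf16* C) {
    std::vector<float> a(static_cast<std::size_t>(m) * k);
    std::vector<float> b(static_cast<std::size_t>(k) * n);
    std::vector<float> c(static_cast<std::size_t>(m) * n);
    for (std::size_t i = 0; i < a.size(); ++i) a[i] = A[i].to_float();
    for (std::size_t i = 0; i < b.size(); ++i) b[i] = B[i].to_float();

    // C = 1.0 * A * B + 0.0 * C, row-major, no transposes.
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0f, a.data(), k, b.data(), n,
                0.0f, c.data(), n);

    for (std::size_t i = 0; i < c.size(); ++i) C[i] = bf16::from_float(c[i]);
}
```

Accumulating in FP32 and converting `C` once at the end is also the numerically sensible choice: summing k products directly in bfloat16 would squander most of the 7 mantissa bits.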

.gitmodules

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@
     url = https://github.com/eth-cscs/Tiled-MM.git
 [submodule "libs/COSTA"]
     path = libs/COSTA
-    url = https://github.com/eth-cscs/COSTA
+    url = https://github.com/dbsanfte/COSTA
 [submodule "libs/cxxopts"]
     path = libs/cxxopts
     url = https://github.com/jarro2783/cxxopts
```

CMakeLists.txt

Lines changed: 2 additions & 2 deletions

```diff
@@ -97,10 +97,10 @@ endif ()
 set(COSTA_WITH_PROFILING ${COSMA_WITH_PROFILING} CACHE INTERNAL "")
 set(COSTA_SCALAPACK ${COSMA_SCALAPACK} CACHE INTERNAL "")
 
+# Use local COSTA submodule (forked with bfloat16 support)
 FetchContent_Declare(
   costa
-  GIT_REPOSITORY https://github.com/eth-cscs/costa.git
-  GIT_TAG 03847e66f05ad4a1eb371b85be628e218ce46f11 # v2.2.3
+  SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/libs/COSTA
   FIND_PACKAGE_ARGS NAMES costa
 )
 # the joy of fetch_content. if we build costa and cosma together
```

README.md

Lines changed: 14 additions & 3 deletions

````diff
@@ -58,9 +58,10 @@ The paper and other materials on COSMA are available under the following link:
 ## Features
 
 - **[NEW] Multi-GPU Systems Support:** COSMA is now able to take advantage of fast GPU-to-GPU interconnects either through the use of NCCL/RCCL libraries or by using the GPU-aware MPI. Both, NVIDIA and AMD GPUs are supported.
+- **[NEW] BFloat16 Support:** COSMA now supports BFloat16 (BF16) reduced precision arithmetic for AI/ML workloads, enabling memory-efficient distributed matrix multiplication with automatic precision handling.
 - **ScaLAPACK API Support:** it is enough to link to COSMA, without changing the code and all `p?gemm` calls will use ScaLAPACK wrappers provided by COSMA.
 - **C/Fortran Interface:** written in `C++`, but provides `C` and `Fortran` interfaces.
-- **Custom Types:** fully templatized types.
+- **Custom Types:** fully templatized types including support for `float`, `double`, complex types (`zfloat`, `zdouble`), and **BFloat16** (`bfloat16`).
 - **GPU acceleration:** supports both **NVIDIA** and **AMD** GPUs.
 - **Supported BLAS (CPU) backends:** MKL, LibSci, NETLIB, BLIS, ATLAS.
 - **Custom Data Layout Support:** natively uses its own blocked data layout of matrices, but supports arbitrary grid-like data layout of matrices.
@@ -273,10 +274,20 @@ The overview of all supported options is given below:
   step. The third parameter is an integer which defines the divisor. This
   parameter can be omitted. In that case the default strategy will be used. An example of a possible value for the upper example: `--steps=sm2,pn2,pk2`.
 - `-r (--n_rep)` (optional, default: `2`): the number of repetitions.
-- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat` and `zdouble`. The last two correspond to complex numbers.
+- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat`, `zdouble`, and `bfloat16`. The `bfloat16` type enables reduced-precision arithmetic for AI/ML workloads. Complex types are `zfloat` and `zdouble`.
 - `--test` (optional): if present, the result of COSMA will be verified with the result of the available SCALAPACK.
 - `-h (--help) (optional)`: print available options.
 
+**Example: Testing BFloat16 matrix multiplication:**
+```bash
+# BFloat16 matrix multiplication with verification
+mpirun -np 4 ./build/miniapp/cosma_miniapp -m 2000 -n 2000 -k 2000 -t bfloat16 --test -r 5
+
+# Large-scale BFloat16 multiplication without verification (performance testing)
+mpirun -np 16 ./build/miniapp/cosma_miniapp -m 10000 -n 10000 -k 10000 -t bfloat16 -r 2
+```
+**Note:** BFloat16 provides approximately the same dynamic range as FP32 but uses only 16 bits per element, reducing memory bandwidth requirements by 50% compared to single precision. This is particularly beneficial for large-scale distributed matrix operations in AI/ML workloads.
+
 ### COSMA pxgemm wrapper
 
 COSMA also contains a wrapper for ScaLAPACK `pxgemm` calls which offers scalapack interface (pxgemm functions with exactly the same signatures as ScaLAPACK). Running these functions will take care of transforming the matrices between ScaLAPACK and COSMA data layout, perform the multiplication using COSMA algorithm and transform the result back to the specified ScaLAPACK data layout.
@@ -311,7 +322,7 @@ The overview of all supported options is given below:
 - `--alpha` (optional, default: 1): alpha parameter in `C = alpha*A*B + beta*C`.
 - `--beta` (optional, default: 0): beta parameter in `C = alpha*A*B + beta*C`.
 - `-r (--n_rep)` (optional, default: 2): number of repetitions.
-- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat` and `zdouble`. The last two correspond to complex numbers.
+- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat`, `zdouble`, and `bfloat16`. The `bfloat16` type enables reduced-precision arithmetic.
 - `--test` (optional): if present, the result of COSMA will be verified with the result of the available SCALAPACK.
 - `--algorithm` (optional, default: `both`): defines which algorithm (`cosma`, `scalapack` or `both`) to run.
 - `-h (--help) (optional)`: print available options.
````
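The commit message states that BF16 communication travels as `MPI_UINT16_T`, since MPI defines no native bfloat16 datatype. A minimal sketch of what a broadcast then looks like (illustrative code built on that statement, not taken from the diff):

```cpp
#include <mpi.h>
#include <cstdint>
#include <vector>

// Sketch: bf16 buffers are just 16-bit bit patterns on the wire, so a
// broadcast is an ordinary MPI_UINT16_T broadcast of the raw bits.
int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    std::vector<std::uint16_t> block(2000 * 2000); // raw bf16 bit patterns
    if (rank == 0) {
        // ... root fills `block` with bf16-encoded matrix entries ...
    }
    MPI_Bcast(block.data(), static_cast<int>(block.size()),
              MPI_UINT16_T, /*root=*/0, MPI_COMM_WORLD);

    MPI_Finalize();
    return 0;
}
```

This works for broadcast and point-to-point because only bit patterns move. Reductions are different: `MPI_SUM` over `MPI_UINT16_T` would add integer bit patterns, so the BF16 reduce/allreduce the commit lists as supported typically needs a custom `MPI_Op` or widening to float before reducing; the diff shown on this page does not include that code.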

ci/cscs.yml

Lines changed: 27 additions & 0 deletions

```diff
@@ -90,3 +90,30 @@ multiply_using_layout:
   variables:
     SLURM_JOB_NUM_NODES: 1
     SLURM_NTASKS: 4
+
+bfloat16_basic:
+  extends: .run_tests
+  stage: test
+  script: /cosma-env-cuda/.spack-env/view/bin/test.bfloat16_basic
+  variables:
+    SLURM_JOB_NUM_NODES: 1
+    SLURM_NTASKS: 1
+    USE_MPI: 'NO'
+
+bfloat16_mpi:
+  extends: .run_tests
+  stage: test
+  script: /cosma-env-cuda/.spack-env/view/bin/test.bfloat16_mpi
+  variables:
+    SLURM_JOB_NUM_NODES: 1
+    SLURM_NTASKS: 2
+    USE_MPI: 'YES'
+
+bfloat16_multiply:
+  extends: .run_tests
+  stage: test
+  script: /cosma-env-cuda/.spack-env/view/bin/test.bfloat16_multiply
+  variables:
+    SLURM_JOB_NUM_NODES: 1
+    SLURM_NTASKS: 8
+    USE_MPI: 'YES'
```
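The three CI jobs run prebuilt test binaries (`test.bfloat16_basic`, `test.bfloat16_mpi`, `test.bfloat16_multiply`) at 1, 2, and 8 ranks respectively. The test sources are not part of this page; purely as a hedged sketch, a "basic" check of type properties and conversion round-trips (reusing the hypothetical `bf16` struct from earlier on this page) might assert something like:

```cpp
#include <cassert>
#include <cmath>

// Hypothetical shape of a basic bf16 check, not the actual test source.
// Assumes the illustrative `bf16` struct sketched earlier on this page.
int main() {
    static_assert(sizeof(bf16) == 2, "bf16 must be 16 bits wide");

    // Round-tripping through bf16 keeps roughly 2-3 decimal digits: with a
    // 7-bit stored mantissa the relative rounding error is at most 2^-8.
    float x = 3.14159f;
    float y = bf16::from_float(x).to_float();
    assert(std::fabs(x - y) / x <= 1.0f / 256.0f);
    return 0;
}
```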
