From c1a3cd7acba859d9df200e557bd3454dc93c1abf Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Fri, 11 Oct 2024 15:15:21 +0100
Subject: [PATCH 01/38] rebasing

 src/ | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/ b/src/
index 2d046e3..c61df37 100644
--- a/src/
+++ b/src/
@@ -1,7 +1,6 @@
 #include "../include/main.hh"
 int iters = 10;
-int startDim = 1;
 int upperLimit = 128;
 bool doCpu = CPU_ENABLED;
@@ -141,6 +140,32 @@ void getParameters(int argc, char* argv[]) {
       doCpu = false;
     } else if (!strcmp(argv[i], "--no_gpu")) {
       doGpu = false;
+    } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) {
+      // Selects which kernels to run from a free-form list such as
+      // "sgemm,sp-dgemm". Every kernel is disabled first and only those named
+      // in the list are re-enabled.
+      if (++i >= argc) {
+        // The flag was the last argument, so there is no list to read.
+        std::cout << "ERROR - Invalid kernel list" << std::endl;
+        exit(1);
+      }
+      sgemm = dgemm = sp_sgemm = sp_dgemm = false;
+      const std::string kernelList = argv[i];
+      // Visit every occurrence of `name`: one directly preceded by "sp-" names
+      // the sparse kernel, any other occurrence names the dense kernel.
+      // Checking all occurrences (not only the first) means that a list such
+      // as "sp-sgemm,sgemm" enables both kernels regardless of their order.
+      auto selectKernels = [&kernelList](const char* name, bool& dense,
+                                         bool& sparse) {
+        for (std::string::size_type pos = kernelList.find(name);
+             pos != std::string::npos; pos = kernelList.find(name, pos + 1)) {
+          if (pos >= 3 && kernelList.compare(pos - 3, 3, "sp-") == 0) {
+            sparse = true;
+          } else {
+            dense = true;
+          }
+        }
+      };
+      selectKernels("sgemm", sgemm, sp_sgemm);
+      selectKernels("dgemm", dgemm, sp_dgemm);
+      if (!sgemm && !dgemm && !sp_sgemm && !sp_dgemm) {
+        std::cout << "ERROR - no implemented kernels in list" << std::endl;
+        exit(1);
+      }
     } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) {
       if (++i >= argc) {
         std::cout << "ERROR - Invalid output directory" << std::endl;

From 21366b4359101379b640faf814173620f0635e4d Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Fri, 11 Oct 2024 15:22:26 +0100
Subject: [PATCH 02/38] rebasing

 DefaultCPU/sp_gemm.hh          |  55 ++++++
 DefaultGPU/sp_gemm.hh          |  54 ++++++
 cuBLAS/sp_gemm.hh              | 295 +++++++++++++++++++++++++++++++++
 include/doGemm.hh              |  94 +++++++++--
 include/kernels/CPU/sp_gemm.hh | 110 ++++++++++++
 include/kernels/GPU/sp_gemm.hh |  27 +++
 src/                    |   4 +
 7 files changed, 626 insertions(+), 13 deletions(-)
 create mode 100644 DefaultCPU/sp_gemm.hh
 create mode 100644 DefaultGPU/sp_gemm.hh
 create mode 100644 cuBLAS/sp_gemm.hh
 create mode 100644 include/kernels/CPU/sp_gemm.hh
 create mode 100644 include/kernels/GPU/sp_gemm.hh

diff --git a/DefaultCPU/sp_gemm.hh b/DefaultCPU/sp_gemm.hh
new file mode 100644
index 0000000..d7ecb37
--- /dev/null
+++ b/DefaultCPU/sp_gemm.hh
@@ -0,0 +1,55 @@
+#pragma once
+#if defined CPU_DEFAULT
+#include "../include/kernels/CPU/sp_gemm.hh"
+#include "../include/utilities.hh"
+namespace cpu {
+/** Fallback CPU implementation of the sparse GEMM benchmark kernel, compiled
+ * only when CPU_DEFAULT is defined. The operands are multiplied as dense
+ * column-major arrays; no sparse storage format is used here. */
+template <typename T>
+class sp_gemm_cpu : public sp_gemm<T> {
+ public:
+  using sp_gemm<T>::sp_gemm;
+  using sp_gemm<T>::callConsume;
+  using sp_gemm<T>::m_;
+  using sp_gemm<T>::n_;
+  using sp_gemm<T>::k_;
+  using sp_gemm<T>::A_;
+  using sp_gemm<T>::B_;
+  using sp_gemm<T>::C_;
+
+ private:
+  /** Perform the GEMM kernel.
+   * A naive implementation of a column-major GEMM. Alpha and Beta are always
+   * 1 and 0 respectively, so the operation is C[M,N] = A[M,K] * B[K,N].
+   * callConsume() is required to ensure that the compiler does not optimise
+   * away this function. */
+  void callGemm() override {
+    for (int x = 0; x < m_; x++) {
+      for (int y = 0; y < n_; y++) {
+        // Dot product of row x of A with column y of B.
+        T acc = 0.0;
+        for (int z = 0; z < k_; z++) {
+          acc += A_[z * m_ + x] * B_[y * k_ + z];
+        }
+        C_[y * m_ + x] = acc;
+      }
+    }
+    // Ensure compiler doesn't optimise away the work being done
+    callConsume();
+  }
+  /** Perform any required steps before calling the GEMM kernel that should
+   * be timed. Nothing is needed for the naive implementation. */
+  void preLoopRequirements() override {}
+  /** Perform any required steps after calling the GEMM kernel that should
+   * be timed. Nothing is needed for the naive implementation. */
+  void postLoopRequirements() override {}
+};
+}  // namespace cpu
+#endif
diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh
new file mode 100644
index 0000000..92d157c
--- /dev/null
+++ b/DefaultGPU/sp_gemm.hh
@@ -0,0 +1,54 @@
+#pragma once
+#if defined GPU_DEFAULT
+#include <cmath>
+#include "../include/kernels/GPU/sp_gemm.hh"
+#include "../include/utilities.hh"
+namespace gpu {
+/** Placeholder GPU implementation of the sparse GEMM benchmark kernel,
+ * compiled only when GPU_DEFAULT is defined. Every member is a no-op so that
+ * the benchmark still builds when no GPU BLAS library is selected. */
+template <typename T>
+class sp_gemm_gpu : public sp_gemm<T> {
+ public:
+  using sp_gemm<T>::sp_gemm;
+  /** Stand-in for the timed run: no kernel is executed.
+   * Returns a fixed placeholder result {INFINITY, INFINITY, 0.0}.
+   * NOTE(review): presumably the fields are runtime, checksum and gflops, in
+   * that order - confirm against the definition of time_checksum_gflop. */
+  time_checksum_gflop compute() {
+    // Override function in base `kernel` class as DefaultGPU should do nothing.
+    return {INFINITY, INFINITY, 0.0};
+  }
+  /** Initialise the required data structures. */
+  void initialise(gpuOffloadType offload, int m, int n, int k) override {
+    // Default GPU implementation - do nothing.
+  }
+
+ private:
+  /** Make a call to the BLAS Library Kernel. */
+  void callGemm() override {
+    // Default GPU implementation - do nothing.
+  }
+  /** Perform any required steps before calling the GEMM kernel that should
+   * be timed. */
+  void preLoopRequirements() override {
+    // Default GPU implementation - do nothing.
+  }
+  /** Perform any required steps after calling the GEMM kernel that should
+   * be timed. */
+  void postLoopRequirements() override {
+    // Default GPU implementation - do nothing.
+  }
+  /** Do any necessary cleanup (free pointers, close library handles, etc.)
+   * after Kernel has been called. */
+  void postCallKernelCleanup() override {
+    // Default GPU implementation - do nothing.
+  }
+};
+}  // namespace gpu
+#endif
\ No newline at end of file
diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
new file mode 100644
index 0000000..3a9cff0
--- /dev/null
+++ b/cuBLAS/sp_gemm.hh
@@ -0,0 +1,295 @@
+#pragma once
+#ifdef GPU_CUBLAS
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include "../include/kernels/GPU/gemm.hh"
+#include "../include/utilities.hh"
+#include "common.hh"
+namespace gpu {
+/** A class for GEMM GPU BLAS kernels. */
+template <typename T>
+class sp_gemm_gpu : public gemm<T> {
+ public:
+  using gemm<T>::gemm;
+  using gemm<T>::m_;
+  using gemm<T>::n_;
+  using gemm<T>::k_;
+  using gemm<T>::A_;
+  using gemm<T>::B_;
+  using gemm<T>::C_;
+  using gemm<T>::offload_;
+  /** Initialise the required data structures: store the problem size, create
+   * the cuBLAS handle and the three CUDA streams, allocate A, B and C and fill
+   * A and B with pseudo-random values seeded by SEED.
+   * `offload` refers to the data offload type:
+   *  - Once:    Move data from host to device before all iterations & move from
+   *             device to host after all iterations
+   *  - Always:  Move data from host to device and device to host each iteration
+   *  - Unified: Initialise data as unified memory; no data movement semantics
+   *             required
+   * Prints the cuBLAS status and exits if the handle cannot be created. */
+  void initialise(gpuOffloadType offload, int m, int n, int k) override {
+    offload_ = offload;
+    m_ = m;
+    n_ = n;
+    k_ = k;
+    // Create a handle for CUBLAS. The status is checked in the same way as the
+    // GEMM calls so that a failed creation is reported here rather than
+    // surfacing later as an error on an invalid handle.
+    const cublasStatus_t createStat = cublasCreate(&handle_);
+    if (createStat != CUBLAS_STATUS_SUCCESS) {
+      std::cout << "cuBLAS error:" << createStat << std::endl;
+      exit(1);
+    }
+    // Get device identifier
+    cudaCheckError(cudaGetDevice(&gpuDevice_));
+    // Initialise 3 streams to asynchronously move data between host and device
+    cudaCheckError(cudaStreamCreate(&s1_));
+    cudaCheckError(cudaStreamCreate(&s2_));
+    cudaCheckError(cudaStreamCreate(&s3_));
+    // Sizes in bytes of A (m x k), B (k x n) and C (m x n)
+    const auto bytesA = sizeof(T) * m_ * k_;
+    const auto bytesB = sizeof(T) * k_ * n_;
+    const auto bytesC = sizeof(T) * m_ * n_;
+    if (offload_ == gpuOffloadType::unified) {
+      cudaCheckError(cudaMallocManaged(&A_, bytesA));
+      cudaCheckError(cudaMallocManaged(&B_, bytesB));
+      cudaCheckError(cudaMallocManaged(&C_, bytesC));
+    } else {
+      // Allocate matrices on host
+      A_ = static_cast<T*>(malloc(bytesA));
+      B_ = static_cast<T*>(malloc(bytesB));
+      C_ = static_cast<T*>(malloc(bytesC));
+      // Allocate matrices on device
+      cudaCheckError(cudaMalloc((void**)&A_device_, bytesA));
+      cudaCheckError(cudaMalloc((void**)&B_device_, bytesB));
+      cudaCheckError(cudaMalloc((void**)&C_device_, bytesC));
+    }
+    // Initialise the host matrices. A is filled before B so that the sequence
+    // of rand() draws, and therefore the generated data, stays the same.
+    srand(SEED);
+    for (int y = 0; y < m_; y++) {
+      for (int x = 0; x < k_; x++) {
+        A_[y * k_ + x] = ((static_cast<T>(rand() % 10000) / 100.0) - 30.0);
+      }
+    }
+    for (int y = 0; y < k_; y++) {
+      for (int x = 0; x < n_; x++) {
+        B_[y * n_ + x] = ((static_cast<T>(rand() % 10000) / 100.0) - 30.0);
+      }
+    }
+  }
+ private:
+  /** Perform the per-offload-type work required before the GEMM kernel calls:
+   * copy (once) or prefetch (unified) A, B and C to the device, each matrix on
+   * its own stream. */
+  void preLoopRequirements() override {
+    const auto bytesA = sizeof(T) * m_ * k_;
+    const auto bytesB = sizeof(T) * k_ * n_;
+    const auto bytesC = sizeof(T) * m_ * n_;
+    if (offload_ == gpuOffloadType::once) {
+      // Offload data from host to the device.
+      cudaCheckError(
+          cudaMemcpyAsync(A_device_, A_, bytesA, cudaMemcpyHostToDevice, s1_));
+      cudaCheckError(
+          cudaMemcpyAsync(B_device_, B_, bytesB, cudaMemcpyHostToDevice, s2_));
+      cudaCheckError(
+          cudaMemcpyAsync(C_device_, C_, bytesC, cudaMemcpyHostToDevice, s3_));
+    } else if (offload_ == gpuOffloadType::unified) {
+      // Prefetch memory to device
+      cudaCheckError(cudaMemPrefetchAsync(A_, bytesA, gpuDevice_, s1_));
+      cudaCheckError(cudaMemPrefetchAsync(B_, bytesB, gpuDevice_, s2_));
+      cudaCheckError(cudaMemPrefetchAsync(C_, bytesC, gpuDevice_, s3_));
+    }
+    // gpuOffloadType::always offloads data each iteration - no requirements.
+  }
+  /** Launch a single cuBLAS GEMM with no transposes on the given operands,
+   * using cublasSgemm when T is float and cublasDgemm when T is double.
+   * `a`, `b` and `c` must be pointers the device can access (device or unified
+   * memory). Prints the cuBLAS status and exits if the call fails. */
+  void launchGemm(const T* a, const T* b, T* c) {
+    // Leading dimensions of the column-major operands (at least 1).
+    const int lda = std::max(1, m_);
+    const int ldb = std::max(1, k_);
+    const int ldc = std::max(1, m_);
+    cublasStatus_t stat = CUBLAS_STATUS_SUCCESS;
+    if constexpr (std::is_same_v<T, float>) {
+      stat = cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha,
+                         a, lda, b, ldb, &beta, c, ldc);
+    } else if constexpr (std::is_same_v<T, double>) {
+      stat = cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha,
+                         a, lda, b, ldb, &beta, c, ldc);
+    }
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+      std::cout << "cuBLAS error:" << stat << std::endl;
+      exit(1);
+    }
+  }
+  /** Make a call to the BLAS Library Kernel.
+   *  - Always:  copy A, B and C to the device, run the GEMM, copy all three
+   *             back and wait for the device to finish
+   *  - Once:    run the GEMM on the device copies only
+   *  - Unified: run the GEMM directly on the unified-memory pointers */
+  void callGemm() override {
+    switch (offload_) {
+      case gpuOffloadType::always: {
+        // Offload data from host to the device.
+        cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_,
+                                       cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_,
+                                       cudaMemcpyHostToDevice, s2_));
+        cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_,
+                                       cudaMemcpyHostToDevice, s3_));
+        // Call cuBLAS GEMM kernel
+        launchGemm(A_device_, B_device_, C_device_);
+        // Offload data from device to host
+        cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_,
+                                       cudaMemcpyDeviceToHost, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_,
+                                       cudaMemcpyDeviceToHost, s2_));
+        cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_,
+                                       cudaMemcpyDeviceToHost, s3_));
+        // Ensure device has finished all work.
+        cudaCheckError(cudaDeviceSynchronize());
+        break;
+      }
+      case gpuOffloadType::once: {
+        // Call cuBLAS GEMM kernel
+        launchGemm(A_device_, B_device_, C_device_);
+        break;
+      }
+      case gpuOffloadType::unified: {
+        // Call cuBLAS GEMM kernel
+        launchGemm(A_, B_, C_);
+        break;
+      }
+    }
+  }
+  /** Perform the per-offload-type work required after the GEMM kernel calls:
+   * copy (once) or prefetch (unified) A, B and C back to the host, each matrix
+   * on its own stream, then wait for the device to finish. */
+  void postLoopRequirements() override {
+    const auto bytesA = sizeof(T) * m_ * k_;
+    const auto bytesB = sizeof(T) * k_ * n_;
+    const auto bytesC = sizeof(T) * m_ * n_;
+    if (offload_ == gpuOffloadType::once) {
+      // Offload data from device to host
+      cudaCheckError(
+          cudaMemcpyAsync(A_, A_device_, bytesA, cudaMemcpyDeviceToHost, s1_));
+      cudaCheckError(
+          cudaMemcpyAsync(B_, B_device_, bytesB, cudaMemcpyDeviceToHost, s2_));
+      cudaCheckError(
+          cudaMemcpyAsync(C_, C_device_, bytesC, cudaMemcpyDeviceToHost, s3_));
+      // Ensure device has finished all work.
+      cudaCheckError(cudaDeviceSynchronize());
+    } else if (offload_ == gpuOffloadType::unified) {
+      // Ensure all data resides on host once work has completed
+      cudaCheckError(cudaMemPrefetchAsync(A_, bytesA, cudaCpuDeviceId, s1_));
+      cudaCheckError(cudaMemPrefetchAsync(B_, bytesB, cudaCpuDeviceId, s2_));
+      cudaCheckError(cudaMemPrefetchAsync(C_, bytesC, cudaCpuDeviceId, s3_));
+      // Ensure device has finished all work.
+      cudaCheckError(cudaDeviceSynchronize());
+    }
+    // gpuOffloadType::always offloads data each iteration - no requirements.
+  }
+  /** Release everything acquired by initialise(): the cuBLAS handle, the three
+   * CUDA streams and the matrix allocations for the current offload type. */
+  void postCallKernelCleanup() override {
+    // Destroy the handle
+    cublasDestroy(handle_);
+    // Destroy streams after use
+    cudaCheckError(cudaStreamDestroy(s1_));
+    cudaCheckError(cudaStreamDestroy(s2_));
+    cudaCheckError(cudaStreamDestroy(s3_));
+    if (offload_ != gpuOffloadType::unified) {
+      // Free the memory held on host and device
+      free(A_);
+      free(B_);
+      free(C_);
+      cudaFree(A_device_);
+      cudaFree(B_device_);
+      cudaFree(C_device_);
+    } else {
+      // Unified allocations are released through the CUDA runtime
+      cudaFree(A_);
+      cudaFree(B_);
+      cudaFree(C_);
+    }
+  }
+  /** Handle used when calling cuBLAS. */
+  cublasHandle_t handle_;
+  /** CUDA Stream 1 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s1_;
+  /** CUDA Stream 2 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s2_;
+  /** CUDA Stream 3 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s3_;
+  /** The ID of the target GPU Device. */
+  int gpuDevice_;
+  /** Input matrix A, held on the device. */
+  T* A_device_;
+  /** Input matrix B, held on the device. */
+  T* B_device_;
+  /** Input matrix C, held on the device. */
+  T* C_device_;
+  /** The constant value Alpha. */
+  const T alpha = ALPHA;
+  /** The constant value Beta. */
+  const T beta = BETA;
+};
+}  // namespace gpu
+#endif
\ No newline at end of file
diff --git a/include/doGemm.hh b/include/doGemm.hh
index c1aa742..4a7c564 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -20,6 +20,7 @@
 #if defined GPU_CUBLAS
 #include "../cuBLAS/gemm.hh"
+#include "../cuBLAS/sp_gemm.hh"
 #elif defined GPU_ONEMKL
 #include "../oneMKL/GPU/gemm.hh"
 #elif defined GPU_ROCBLAS
@@ -42,11 +43,13 @@ class doGemm {
-        gemmCpu_(iterations_)
+        gemmCpu_(iterations_),
+        spGemmCpu_(iterations_)
-        gemmGpu_(iterations_)
+        gemmGpu_(iterations_),
+        spGemmGpu_(iterations_)
     static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) &&
@@ -68,7 +71,7 @@ class doGemm {
     for (int dim = startDimention_; dim <= upperLimit_; dim++) {
       // M = dim, N = dim, K = dim;
-      callKernels(csvFile, dim, dim, dim);
+      callDenseKernels(csvFile, dim, dim, dim);
     // Close file
@@ -94,7 +97,7 @@ class doGemm {
     int M = 16 * K;
     int N = 16 * K;
     while (M <= upperLimit_) {
-      callKernels(csvFile, M, N, K);
+      callDenseKernels(csvFile, M, N, K);
       M += 16;
       N += 16;
@@ -121,7 +124,7 @@ class doGemm {
     if (upperLimit_ >= 32) {
       for (int dim = startDimention_; dim <= upperLimit_; dim++) {
         // M = dim, N = dim, K = 32;
-        callKernels(csvFile, dim, dim, 32);
+        callDenseKernels(csvFile, dim, dim, 32);
     // Close file
@@ -147,7 +150,7 @@ class doGemm {
     N = startDimention_;
     K = 16 * M;
     while (K <= upperLimit_) {
-      callKernels(csvFile, M, N, K);
+      callDenseKernels(csvFile, M, N, K);
       K += 16;
@@ -174,7 +177,7 @@ class doGemm {
     if (upperLimit_ >= 32) {
       for (int dim = startDimention_; dim <= upperLimit_; dim++) {
         // M = 32, N = 32, K = dim;
-        callKernels(csvFile, 32, 32, dim);
+        callDenseKernels(csvFile, 32, 32, dim);
     // Close file
@@ -200,7 +203,7 @@ class doGemm {
     N = startDimention_;
     M = 16 * K;
     while (M <= upperLimit_) {
-      callKernels(csvFile, M, N, K);
+      callDenseKernels(csvFile, M, N, K);
       M += 16;
@@ -227,7 +230,7 @@ class doGemm {
     if (upperLimit_ >= 32) {
       for (int dim = startDimention_; dim <= upperLimit_; dim++) {
         // M = dim, N = 32, K = 32;
-        callKernels(csvFile, dim, 32, 32);
+        callDenseKernels(csvFile, dim, 32, 32);
     // Close file
@@ -253,7 +256,7 @@ class doGemm {
     K = startDimention_;
     N = 16 * K;
     while (N <= upperLimit_) {
-      callKernels(csvFile, M, N, K);
+      callDenseKernels(csvFile, M, N, K);
       N += 16;
@@ -280,7 +283,7 @@ class doGemm {
     if (upperLimit_ >= 32) {
       for (int dim = startDimention_; dim <= upperLimit_; dim++) {
         // M = 32, N = dim, K = 32;
-        callKernels(csvFile, 32, dim, 32);
+        callDenseKernels(csvFile, 32, dim, 32);
     // Close file
@@ -291,12 +294,27 @@ class doGemm {
       printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)");
+    // Square sparse matrix - sparse matrix multiplication
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() +
+                          "_sparse_square.csv");
+    if (upperLimit_ >= 32) {
+      for (int dim = 1; dim <= upperLimit_; dim++) {
+        const int N = dim;
+        callSparseKernels(csvFile, N, 0.99);
+      }
+    }
+    // Close file
+    csvFile.close();
   /** Call the appropriate CPU and GPU GEMM kernels. */
-  void callKernels(std::ofstream& csvFile, const int M, const int N,
-                   const int K) {
+  void callDenseKernels(std::ofstream& csvFile, const int M, const int N,
+                        const int K) {
     const double probSize = calcKib(M, N, K);
     const uint64_t flops = calcFlops(M, N, K);
     std::string kernelName = getKernelName();
@@ -488,6 +506,52 @@ class doGemm {
+  /** Call the CPU and GPU sparse GEMM kernels for an N x N problem and write
+   * one CSV line per variant (cpu, gpu_offloadOnce, gpu_offloadAlways,
+   * gpu_unified).
+   * `sparsity` is forwarded to the CPU kernel's initialise(); the GPU kernel
+   * is initialised with the dimensions only.
+   * NOTE(review): problem size and FLOPs are computed with the same helpers as
+   * the dense kernels - confirm that is the intended metric for sparse data. */
+  void callSparseKernels(std::ofstream& csvFile, const int N, const float
+  sparsity) {
+    const double probSize = calcKib(N, N, N);
+    const uint64_t flops = calcFlops(N, N, N);
+    std::string kernelName = getKernelName();
+
+    // Perform the CPU kernel
+    spGemmCpu_.initialise(N, sparsity);
+    time_checksum_gflop cpuResult = spGemmCpu_.compute();
+    cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
+
+    // Perform the GPU kernels. Each variant must be timed on spGemmGpu_, the
+    // object that was just initialised, not on the dense gemmGpu_ kernel.
+    // - ONCE : Offload to/from GPU once before all iterations and once
+    // after
+    spGemmGpu_.initialise(gpuOffloadType::once, N, N, N);
+    time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
+    gpuResult_once.gflops =
+        calcGflops(flops, iterations_, gpuResult_once.runtime);
+
+    // - ALWAYS: Offload to/from GPU every iteration
+    spGemmGpu_.initialise(gpuOffloadType::always, N, N, N);
+    time_checksum_gflop gpuResult_always = spGemmGpu_.compute();
+    gpuResult_always.gflops =
+        calcGflops(flops, iterations_, gpuResult_always.runtime);
+
+    // - UNIFIED : data passed from host to device (and device to host) as
+    //             needed
+    spGemmGpu_.initialise(gpuOffloadType::unified, N, N, N);
+    time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
+    gpuResult_unified.gflops =
+        calcGflops(flops, iterations_, gpuResult_unified.runtime);
+
+    // ToDo -- non-default GPU operations
+
+    // Write lines to CSV file
+    writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
+                   cpuResult.runtime, cpuResult.gflops);
+    writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
+                   iterations_, gpuResult_once.runtime, gpuResult_once.gflops);
+    writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
+                   iterations_, gpuResult_always.runtime,
+                   gpuResult_always.gflops);
+    writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
+                   iterations_, gpuResult_unified.runtime,
+                   gpuResult_unified.gflops);
+  }
   /** A function for calculating FLOPs performed by a GEMM.
    * C = alpha*AB + beta*C */
   constexpr uint64_t calcFlops(const int M, const int N, const int K) const {
@@ -623,11 +687,15 @@ class doGemm {
   cpu::gemm_cpu<T> gemmCpu_;
+	cpu::sp_gemm_cpu<T> spGemmCpu_;
   /** The GEMM GPU kernel. */
   gpu::gemm_gpu<T> gemmGpu_;
+	gpu::sp_gemm_gpu<T> spGemmGpu_;
   /** The point at which offloading to GPU (offload once) becomes worthwhile. */
   cpuGpu_offloadThreshold cpuGpu_once_;
diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
new file mode 100644
index 0000000..3de5ea5
--- /dev/null
+++ b/include/kernels/CPU/sp_gemm.hh
@@ -0,0 +1,110 @@
+#pragma once
+#include "../gemm.hh"
+#include <chrono>
+#include <cmath>
+#include <cstdlib>
+#include <random>
+namespace cpu {
+/** Base class for the CPU sparse GEMM kernels. Builds square input matrices
+ * with a requested sparsity; the matrices are stored as dense arrays. */
+template <typename T>
+class sp_gemm : public ::gemm<T> {
+ public:
+  using ::gemm<T>::gemm;
+  using ::gemm<T>::m_;
+  using ::gemm<T>::n_;
+  using ::gemm<T>::k_;
+  using ::gemm<T>::A_;
+  using ::gemm<T>::B_;
+  using ::gemm<T>::C_;
+
+  /** Initialise the required data structures.
+   *  - n:        dimension of the square matrices A, B and C
+   *  - sparsity: target fraction of zero entries in A and B
+   *  - binary:   when true every non-zero entry is 1.0, otherwise it is a
+   *              pseudo-random real number
+   * A and B are zeroed and then populated with
+   * 1 + floor(n * n * (1 - sparsity)) non-zeros each (capped at n * n) using
+   * the recursive rMat() placement. C is allocated but left uninitialised. */
+  virtual void initialise(int n, double sparsity, bool binary = false) {
+    // The matrices are square: set every dimension, since the kernels loop
+    // over m_ and k_ as well as n_.
+    m_ = n;
+    n_ = n;
+    k_ = n;
+
+    const size_t cells = static_cast<size_t>(n) * static_cast<size_t>(n);
+    A_ = (T*)malloc(sizeof(T) * cells);
+    B_ = (T*)malloc(sizeof(T) * cells);
+    C_ = (T*)malloc(sizeof(T) * cells);
+
+    // Set initial values to 0
+    for (size_t i = 0; i < cells; i++) {
+      A_[i] = 0.0;
+      B_[i] = 0.0;
+    }
+
+    // Random number generator objects for use in descent
+    std::default_random_engine gen;
+    gen.seed(std::chrono::system_clock::now().time_since_epoch().count());
+    std::uniform_real_distribution<double> dist(0.0, 1.0);
+
+    // Work out number of edges needed to achieve target sparsity. The count
+    // is capped at the number of cells: asking for more non-zeros than there
+    // are cells (e.g. sparsity == 0) would otherwise make the retry loops
+    // below spin forever once the matrix is full.
+    double target = static_cast<double>(cells) * (1.0 - sparsity);
+    if (target < 0.0) {
+      target = 0.0;
+    }
+    size_t edges = 1 + static_cast<size_t>(target);
+    if (edges > cells) {
+      edges = cells;
+    }
+
+    // Initialise the matrices
+    // Using a=0.45 and b=c=0.22 as default probabilities
+    for (size_t i = 0; i < edges; i++) {
+      // rMat() returns false when it lands on an occupied cell, so retry
+      // until a new non-zero has actually been placed.
+      while (!rMat(A_, n, 0, n - 1, 0, n - 1, 0.45, 0.22, 0.22, &gen, dist,
+                   binary)) {
+      }
+      while (!rMat(B_, n, 0, n - 1, 0, n - 1, 0.45, 0.22, 0.22, &gen, dist,
+                   binary)) {
+      }
+    }
+  }
+
+ private:
+  /** Try to place one non-zero inside the sub-matrix [x1, x2] x [y1, y2] of
+   * the n x n matrix M. The region is halved recursively, choosing a quadrant
+   * with probabilities a, b, c and 1 - (a + b + c), until a single cell is
+   * left.
+   * Returns true if a value was written, or false (M unchanged) if the chosen
+   * cell already held a value with magnitude above 0.1.
+   * When `bin` is true the value written is 1.0; otherwise it is drawn with
+   * rand() from the range [-50.0, 50.0). */
+  bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b,
+            float c, std::default_random_engine* gen,
+            std::uniform_real_distribution<double> dist, bool bin) {
+    // If a 1x1 submatrix, then add an edge and return out
+    if (x1 >= x2 && y1 >= y2) {
+      T& cell = M[(y1 * n) + x1];
+      // std::fabs keeps the comparison in floating point; an unqualified
+      // abs() may resolve to the integer overload and truncate the value.
+      if (std::fabs(cell) > 0.1) {
+        return false;
+      }
+      // Add 1.0 if this is a binary graph, and a random real number otherwise
+      cell = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0);
+      return true;
+    }
+
+    // Divide up the matrix
+    const int xMidPoint = x1 + (x2 - x1) / 2;
+    const int yMidPoint = y1 + (y2 - y1) / 2;
+    // Lower bounds of the upper halves. When the region is already 1 wide or
+    // 1 high the midpoint is reused to avoid going out of bounds.
+    const int xUpper = (xMidPoint < x2) ? xMidPoint + 1 : xMidPoint;
+    const int yUpper = (yMidPoint < y2) ? yMidPoint + 1 : yMidPoint;
+
+    // ToDo -- add some noise to a, b and c between iterations
+
+    // Work out which quarter to recurse into
+    const float randomNum = dist(*gen);
+    if (randomNum < a) {
+      return rMat(M, n, x1, xMidPoint, y1, yMidPoint, a, b, c, gen, dist, bin);
+    }
+    if (randomNum < (a + b)) {
+      return rMat(M, n, xUpper, x2, y1, yMidPoint, a, b, c, gen, dist, bin);
+    }
+    if (randomNum < (a + b + c)) {
+      return rMat(M, n, x1, xMidPoint, yUpper, y2, a, b, c, gen, dist, bin);
+    }
+    return rMat(M, n, xUpper, x2, yUpper, y2, a, b, c, gen, dist, bin);
+  }
+
+  /** Do any necessary cleanup (free pointers, close library handles, etc.)
+   * after Kernel has been called. */
+  void postCallKernelCleanup() {
+    free(A_);
+    free(B_);
+    free(C_);
+  }
+};
+}  // namespace cpu
\ No newline at end of file
diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh
new file mode 100644
index 0000000..684c166
--- /dev/null
+++ b/include/kernels/GPU/sp_gemm.hh
@@ -0,0 +1,27 @@
+#pragma once
+#include "../gemm.hh"
+namespace gpu {
+/** Abstract base class for the GPU sparse GEMM kernels. Extends ::gemm<T>
+ * with the data-offload mode and an initialise() hook that takes it. */
+template <typename T>
+class sp_gemm : public ::gemm<T> {
+ public:
+  using ::gemm<T>::gemm;
+
+  /** Initialise the required data structures for an m x n x k problem.
+   * `offload` selects how data moves between host and device:
+   *  - Once:    moved host to device before all iterations and device to
+   *             host after all iterations
+   *  - Always:  moved host to device and device to host every iteration
+   *  - Unified: allocated as unified memory; no explicit data movement is
+   *             required */
+  virtual void initialise(gpuOffloadType offload, int m, int n, int k) = 0;
+
+ protected:
+  /** The offload mode in use: transfer to/from the GPU every iteration, or
+   * only before and after all iterations. Defaults to `always`. */
+  gpuOffloadType offload_ = gpuOffloadType::always;
+};
+}  // namespace gpu
\ No newline at end of file
diff --git a/src/ b/src/
index c61df37..38e2b5a 100644
--- a/src/
+++ b/src/
@@ -2,6 +2,10 @@
 int iters = 10;
 int upperLimit = 128;
+bool sgemm = true;
+bool dgemm = true;
+bool sp_sgemm = true;
+bool sp_dgemm = true;
 bool doCpu = CPU_ENABLED;
 bool doGpu = GPU_ENABLED;

From f2ed11f5325e2e063d0f92e07d09b13db6b356d7 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Wed, 13 Mar 2024 13:43:05 +0000
Subject: [PATCH 03/38] Implementing cuSPARSE kernel

 cuBLAS/sp_gemm.hh | 208 +++++++++++++++++++++++++---------------------
 1 file changed, 111 insertions(+), 97 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 3a9cff0..67d030c 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -1,7 +1,7 @@
 #pragma once
 #ifdef GPU_CUBLAS
-#include <cublas_v2.h>
+#include "cusparse.h"
 #include <cuda_runtime.h>
 #include "../include/kernels/GPU/gemm.hh"
@@ -14,9 +14,7 @@ template <typename T>
 class sp_gemm_gpu : public gemm<T> {
   using gemm<T>::gemm;
-  using gemm<T>::m_;
   using gemm<T>::n_;
-  using gemm<T>::k_;
   using gemm<T>::A_;
   using gemm<T>::B_;
   using gemm<T>::C_;
@@ -29,15 +27,28 @@ class sp_gemm_gpu : public gemm<T> {
    *  - Always:  Move data from host to device and device to host each iteration
    *  - Unified: Initialise data as unified memory; no data movement semantics
    *             required */
-  void initialise(gpuOffloadType offload, int m, int n, int k) override {
+  void initialise(gpuOffloadType offload, int n, float sparsity) override {
     offload_ = offload;
-    m_ = m;
+		// Create a handle for cuSPARSE
+    cusparseCreate(&handle_);
     n_ = n;
-    k_ = k;
-    // Create a handle for CUBLAS
-    cublasCreate(&handle_);
+		// Create descriptors for matrices A->C
+		cusparseMatDescr_t descrA, descrB, descrC;
+		cusparseCreateMatDescr(&descrA);
+		cusparseCreateMatDescr(&descrB);
+		cusparseCreateMatDescr(&descrC);
+		cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
+		cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
+		cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
+		cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
+		cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
+		cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
     // Get device identifier
@@ -47,38 +58,96 @@ class sp_gemm_gpu : public gemm<T> {
+		// Work out number of edges needed to achieve target sparsity
+		int edges = 1 + (int) (n_ * n_ * (1 - sparsity));
     if (offload_ == gpuOffloadType::unified) {
-      cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * k_));
-      cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_));
-      cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_));
+      cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_));
+      cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_));
+      cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * n_ * n_));
+			cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_));
     } else {
       // Allocate matrices on host
-      A_ = (T*)malloc(sizeof(T) * m_ * k_);
-      B_ = (T*)malloc(sizeof(T) * k_ * n_);
-      C_ = (T*)malloc(sizeof(T) * m_ * n_);
+			A_ = (T*)malloc(sizeof(T) * n_ * n_);
+			B_ = (T*)malloc(sizeof(T) * n_ * n_);
+			C_ = (T*)malloc(sizeof(T) * n_ * n_);
       // Allocate matrices on device
-      cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * k_));
-      cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * k_ * n_));
-      cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * m_ * n_));
+      cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * n_ * n_));
+      cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_));
+      cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_));
+			// Alloce non-zero vector for A
+			cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_));
-    // Initialise the host matricies
-    srand(SEED);
-    for (int y = 0; y < m_; y++) {
-      for (int x = 0; x < k_; x++) {
-        A_[y * k_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0);
-      }
-    }
-    for (int y = 0; y < k_; y++) {
-      for (int x = 0; x < n_; x++) {
-        B_[y * n_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0);
-      }
-    }
+		// Initialise the host matricies
+		// cusparseSpGEMM() works on CSR format only.  This helpfully makes our
+		// sparse matrix format decision for us!
+		// ToDo -- do the RMAT instantiation of A_ and B_.  Need to think about
+		//  how this can be done in the context of CSR.
+		// Initialise the matrices
+		// Using a=0.45 and b=c=0.22 as default probabilities
+		for (int i = 0; i < edges; i++) {
+			while (!rMat(A_, n, 0, n - 1, 0, n - 1,
+			             0.45, 0.22, 0.22,
+			             &gen, dist, false)) {}
+			while (!rMat(B_, n, 0, n - 1, 0, n - 1,
+			             0.45, 0.22, 0.22,
+			             &gen, dist, false)) {}
+		}
+		bool rMat(T* M, int n, int x1, int x2, int y1, int y2,
+					        float a, float b, float c, std::default_random_engine* gen,
+					        std::uniform_real_distribution<double> dist, bool bin) {
+					// If a 1x1 submatrix, then add an edge and return out
+					if (x1 >= x2 && y1 >= y2) {
+						if (abs(M[(y1 * n) + x1]) > 0.1) {
+							return false;
+						} else {
+							// Add 1.0 if this is a binary graph, and a random real number otherwise
+							M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
+											100.0) - 50.0);
+							return true;
+						}
+					} else {
+						// Divide up the matrix
+						int xMidPoint = x1 + floor((x2 - x1) / 2);
+						int yMidPoint = y1 + floor((y2 - y1) / 2);
+						// ToDo -- add some noise to these values between iterations
+						float newA = a;
+						float newB = b;
+						float newC = c;
+						// Work out which quarter to recurse into
+						// There are some ugly ternary operators here to avoid going out of bounds in the edge case
+						// that we are already at 1 width or 1 height
+						float randomNum = dist(*gen);
+						if (randomNum < a) {
+							return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
+							            newA, newB, newC, gen, dist, bin);
+						} else if (randomNum < (a + b)) {
+							return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
+							            newA, newB, newC, gen, dist, bin);
+						} else if (randomNum < (a + b + c)) {
+							return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
+							            newA, newB, newC, gen, dist, bin);
+						} else {
+							return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
+							            ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC,
+							            gen, dist, bin);
+						}
+					}
+					return true;
+				}
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
+	// ToDo -- update this to apply to CSR format
   void preLoopRequirements() override {
     switch (offload_) {
       case gpuOffloadType::always: {
@@ -119,79 +188,20 @@ class sp_gemm_gpu : public gemm<T> {
                                        cudaMemcpyHostToDevice, s2_));
         cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_,
                                        cudaMemcpyHostToDevice, s3_));
-        // Call cuBLAS GEMM kernel
-        if constexpr (std::is_same_v<T, float>) {
-          cublasStatus_t stat =
-              cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha,
-                          A_device_, std::max(1, m_), B_device_,
-                          std::max(1, k_), &beta, C_device_, std::max(1, m_));
-          if (stat != CUBLAS_STATUS_SUCCESS) {
-            std::cout << "cuBLAS error:" << stat << std::endl;
-            exit(1);
-          }
-        } else if constexpr (std::is_same_v<T, double>) {
-          cublasStatus_t stat =
-              cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha,
-                          A_device_, std::max(1, m_), B_device_,
-                          std::max(1, k_), &beta, C_device_, std::max(1, m_));
-          if (stat != CUBLAS_STATUS_SUCCESS) {
-            std::cout << "cuBLAS error:" << stat << std::endl;
-            exit(1);
-          }
-        }
-        // Offload data from device to host
-        cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_,
-                                       cudaMemcpyDeviceToHost, s1_));
-        cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_,
-                                       cudaMemcpyDeviceToHost, s2_));
-        cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_,
-                                       cudaMemcpyDeviceToHost, s3_));
-        // Ensure device has finished all work.
-        cudaCheckError(cudaDeviceSynchronize());
+        // Call cuSPARSE SpGEMM kernel
+				// ToDo -- implement
       case gpuOffloadType::once: {
-        // Call cuBLAS GEMM kernel
-        if constexpr (std::is_same_v<T, float>) {
-          cublasStatus_t stat =
-              cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha,
-                          A_device_, std::max(1, m_), B_device_,
-                          std::max(1, k_), &beta, C_device_, std::max(1, m_));
-          if (stat != CUBLAS_STATUS_SUCCESS) {
-            std::cout << "cuBLAS error:" << stat << std::endl;
-            exit(1);
-          }
-        } else if constexpr (std::is_same_v<T, double>) {
-          cublasStatus_t stat =
-              cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha,
-                          A_device_, std::max(1, m_), B_device_,
-                          std::max(1, k_), &beta, C_device_, std::max(1, m_));
-          if (stat != CUBLAS_STATUS_SUCCESS) {
-            std::cout << "cuBLAS error:" << stat << std::endl;
-            exit(1);
-          }
-        }
+        // Call cuSPRASE SpGEMM kernel
+				// ToDo -- implement
       case gpuOffloadType::unified: {
-        // Call cuBLAS GEMM kernel
-        if constexpr (std::is_same_v<T, float>) {
-          cublasStatus_t stat = cublasSgemm(
-              handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_,
-              std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_));
-          if (stat != CUBLAS_STATUS_SUCCESS) {
-            std::cout << "cuBLAS error:" << stat << std::endl;
-            exit(1);
-          }
-        } else if constexpr (std::is_same_v<T, double>) {
-          cublasStatus_t stat = cublasDgemm(
-              handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_,
-              std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_));
-          if (stat != CUBLAS_STATUS_SUCCESS) {
-            std::cout << "cuBLAS error:" << stat << std::endl;
-            exit(1);
-          }
-        }
+        // Call cuSPARSE SpGEMM kernel
+				// ToDo -- implement
@@ -199,6 +209,7 @@ class sp_gemm_gpu : public gemm<T> {
   /** Perform any required steps after calling the GEMM kernel that should
    * be timed. */
+	// ToDo -- check that this all still works
   void postLoopRequirements() override {
     switch (offload_) {
       case gpuOffloadType::always: {
@@ -236,7 +247,7 @@ class sp_gemm_gpu : public gemm<T> {
    * after Kernel has been called. */
   void postCallKernelCleanup() override {
     // Destroy the handle
-    cublasDestroy(handle_);
+    cusparseDestroy(handle_);
     // Destroy streams after use
@@ -285,6 +296,9 @@ class sp_gemm_gpu : public gemm<T> {
   /** Input matrix C, held on the device. */
   T* C_device_;
+	/** Vector for number non-zeros, held on the device */
+	int* dANnzPerRow;
   /** The constant value Alpha. */
   const T alpha = ALPHA;

From c208246927e738615a94c0308e845cf42c198f98 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Wed, 13 Mar 2024 14:05:20 +0000
Subject: [PATCH 04/38] Trying to work out CSR malloc bug

 cuBLAS/sp_gemm.hh | 126 ++++++++++++++++++++++++++++------------------
 1 file changed, 76 insertions(+), 50 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 67d030c..3232293 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -66,7 +66,19 @@ class sp_gemm_gpu : public gemm<T> {
       cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_));
       cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_));
       cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * n_ * n_));
-			cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_));
+			cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges));
+			cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges));
+			cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * edges));
+			cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges));
+			cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges));
+			cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * edges));
+			cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * edges));
+			cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * edges));
+			cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * edges));
+//			cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_));
     } else {
       // Allocate matrices on host
 			A_ = (T*)malloc(sizeof(T) * n_ * n_);
@@ -78,7 +90,7 @@ class sp_gemm_gpu : public gemm<T> {
       cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_));
       cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_));
 			// Alloce non-zero vector for A
-			cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_));
+//			cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_));
 		// Initialise the host matricies
@@ -88,6 +100,11 @@ class sp_gemm_gpu : public gemm<T> {
 		//  how this can be done in the context of CSR.
 		// Initialise the matrices
+		// Set initial values to 0
+		for (int i = 0; i < (n_ * n_); i++) {
+			A_[i] = 0.0;
+			B_[i] = 0.0;
+		}
 		// Using a=0.45 and b=c=0.22 as default probabilities
 		for (int i = 0; i < edges; i++) {
 			while (!rMat(A_, n, 0, n - 1, 0, n - 1,
@@ -97,57 +114,17 @@ class sp_gemm_gpu : public gemm<T> {
 			             0.45, 0.22, 0.22,
 			             &gen, dist, false)) {}
+//		for (int i = 0; i < (n_ * n_); i++) {
+//			C_[i] = 0.0;
+//		}
-		bool rMat(T* M, int n, int x1, int x2, int y1, int y2,
-					        float a, float b, float c, std::default_random_engine* gen,
-					        std::uniform_real_distribution<double> dist, bool bin) {
-					// If a 1x1 submatrix, then add an edge and return out
-					if (x1 >= x2 && y1 >= y2) {
-						if (abs(M[(y1 * n) + x1]) > 0.1) {
-							return false;
-						} else {
-							// Add 1.0 if this is a binary graph, and a random real number otherwise
-							M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
-											100.0) - 50.0);
-							return true;
-						}
-					} else {
-						// Divide up the matrix
-						int xMidPoint = x1 + floor((x2 - x1) / 2);
-						int yMidPoint = y1 + floor((y2 - y1) / 2);
-						// ToDo -- add some noise to these values between iterations
-						float newA = a;
-						float newB = b;
-						float newC = c;
-						// Work out which quarter to recurse into
-						// There are some ugly ternary operators here to avoid going out of bounds in the edge case
-						// that we are already at 1 width or 1 height
-						float randomNum = dist(*gen);
-						if (randomNum < a) {
-							return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
-							            newA, newB, newC, gen, dist, bin);
-						} else if (randomNum < (a + b)) {
-							return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
-							            newA, newB, newC, gen, dist, bin);
-						} else if (randomNum < (a + b + c)) {
-							return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
-							            newA, newB, newC, gen, dist, bin);
-						} else {
-							return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
-							            ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC,
-							            gen, dist, bin);
-						}
-					}
-					return true;
-				}
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
-	// ToDo -- update this to apply to CSR format
   void preLoopRequirements() override {
     switch (offload_) {
       case gpuOffloadType::always: {
@@ -188,8 +165,8 @@ class sp_gemm_gpu : public gemm<T> {
                                        cudaMemcpyHostToDevice, s2_));
         cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_,
                                        cudaMemcpyHostToDevice, s3_));
-        // Call cuSPARSE SpGEMM kernel
-				// ToDo -- implement
       case gpuOffloadType::once: {
@@ -269,6 +246,51 @@ class sp_gemm_gpu : public gemm<T> {
+	bool rMat(T* M, int n, int x1, int x2, int y1, int y2,
+					        float a, float b, float c, std::default_random_engine* gen,
+					        std::uniform_real_distribution<double> dist, bool bin) {
+			// If a 1x1 submatrix, then add an edge and return out
+			if (x1 >= x2 && y1 >= y2) {
+				if (abs(M[(y1 * n) + x1]) > 0.1) {
+					return false;
+				} else {
+					// Add 1.0 if this is a binary graph, and a random real number otherwise
+					M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
+									100.0) - 50.0);
+					return true;
+				}
+			} else {
+				// Divide up the matrix
+				int xMidPoint = x1 + floor((x2 - x1) / 2);
+				int yMidPoint = y1 + floor((y2 - y1) / 2);
+				// ToDo -- add some noise to these values between iterations
+				float newA = a;
+				float newB = b;
+				float newC = c;
+				// Work out which quarter to recurse into
+				// There are some ugly ternary operators here to avoid going out of bounds in the edge case
+				// that we are already at 1 width or 1 height
+				float randomNum = dist(*gen);
+				if (randomNum < a) {
+					return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
+					            newA, newB, newC, gen, dist, bin);
+				} else if (randomNum < (a + b)) {
+					return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
+					            newA, newB, newC, gen, dist, bin);
+				} else if (randomNum < (a + b + c)) {
+					return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
+					            newA, newB, newC, gen, dist, bin);
+				} else {
+					return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
+					            ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC,
+					            gen, dist, bin);
+				}
+			}
+			return true;
+		}
   /** Handle used when calling cuBLAS. */
   cublasHandle_t handle_;
@@ -297,7 +319,11 @@ class sp_gemm_gpu : public gemm<T> {
   T* C_device_;
 	/** Vector for number non-zeros, held on the device */
-	int* dANnzPerRow;
+//	int* dANnzPerRow;
+	/** CSR format vectors for matrices A, B and C on the device */
+	T* A_val_, B_val_, C_val_;
+	int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_;
   /** The constant value Alpha. */
   const T alpha = ALPHA;

From de14a5682aae00ab582f87a396eaf3da5b66b99f Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Wed, 13 Mar 2024 14:07:46 +0000
Subject: [PATCH 05/38] Trying to work out CSR malloc bug

 cuBLAS/sp_gemm.hh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 3232293..0765adb 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -96,8 +96,6 @@ class sp_gemm_gpu : public gemm<T> {
 		// Initialise the host matricies
 		// cusparseSpGEMM() works on CSR format only.  This helpfully makes our
 		// sparse matrix format decision for us!
-		// ToDo -- do the RMAT instantiation of A_ and B_.  Need to think about
-		//  how this can be done in the context of CSR.
 		// Initialise the matrices
 		// Set initial values to 0

From 49cddf02f8a50571d2eaa5b653bdf8fb49198d91 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Tue, 19 Mar 2024 13:05:58 +0000
Subject: [PATCH 06/38] cuSPARSE unified memory implementation

 cuBLAS/sp_gemm.hh | 433 ++++++++++++++++++++++++++--------------------
 1 file changed, 250 insertions(+), 183 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 0765adb..68e3b84 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -3,6 +3,7 @@
 #ifdef GPU_CUBLAS
 #include "cusparse.h"
 #include <cuda_runtime.h>
+#include <type_traits>
 #include "../include/kernels/GPU/gemm.hh"
 #include "../include/utilities.hh"
@@ -20,6 +21,8 @@ class sp_gemm_gpu : public gemm<T> {
   using gemm<T>::C_;
   using gemm<T>::offload_;
+	// ToDo -- just unified implemented so far.  Fill in Always and Once later
   /** Initialise the required data structures.
    * `offload` refers to the data offload type:
    *  - Once:    Move data from host to device before all iterations & move from
@@ -33,10 +36,10 @@ class sp_gemm_gpu : public gemm<T> {
 		// Create a handle for cuSPARSE
-    n_ = n;
+		cudaDataType_ = (std::is_same_v<T, float>) ? CUDA_R_32F :
+						CUDA_R_64F;
-		// Create descriptors for matrices A->C
-		cusparseMatDescr_t descrA, descrB, descrC;
+    n_ = n;
@@ -61,37 +64,30 @@ class sp_gemm_gpu : public gemm<T> {
 		// Work out number of edges needed to achieve target sparsity
 		int edges = 1 + (int) (n_ * n_ * (1 - sparsity));
+		A_nnz_ = B_nnz_ = edges
+		// ToDo -- for all of this mallocing, bear in mind that row will probably
+		//  have fewer than 'edges' values (thats the whole point).  May need to
+		//  reorganise
+    cudaCheckError(cudaMallocManaged(A_num_rows_, sizeof(int)));
+		cudaCheckError(cudaMallocManaged(A_num_cols_, sizeof(int)));
+		cudaCheckError(cudaMallocManaged(A_nnz_, sizeof(int)));
+		cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges));
+		cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges));
+		cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1)));
+		cudaCheckError(cudaMallocManaged(B_num_rows_, sizeof(int)));
+		cudaCheckError(cudaMallocManaged(B_num_cols_, sizeof(int)));
+		cudaCheckError(cudaMallocManaged(B_nnz_, sizeof(int)));
+		cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges));
+		cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges));
+		cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1)));
+		C_val_ = NULL;
+		C_col_ = NULL;
+		C_row_ = NULL;
-    if (offload_ == gpuOffloadType::unified) {
-      cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_));
-      cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_));
-      cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * n_ * n_));
-			cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges));
-			cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges));
-			cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * edges));
-			cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges));
-			cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges));
-			cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * edges));
-			cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * edges));
-			cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * edges));
-			cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * edges));
-//			cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_));
-    } else {
-      // Allocate matrices on host
-			A_ = (T*)malloc(sizeof(T) * n_ * n_);
-			B_ = (T*)malloc(sizeof(T) * n_ * n_);
-			C_ = (T*)malloc(sizeof(T) * n_ * n_);
-      // Allocate matrices on device
-      cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * n_ * n_));
-      cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_));
-      cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_));
-			// Alloce non-zero vector for A
-//			cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_));
-    }
 		// Initialise the host matricies
 		// cusparseSpGEMM() works on CSR format only.  This helpfully makes our
@@ -113,109 +109,160 @@ class sp_gemm_gpu : public gemm<T> {
 			             &gen, dist, false)) {}
-//		for (int i = 0; i < (n_ * n_); i++) {
-//			C_[i] = 0.0;
-//		}
+		toCSR(A_, n, n, edges, A_val_, A_col_, A_row_);
+		toCSR(B_, n, n, edges, B_val_, B_col_, B_row_);
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
   void preLoopRequirements() override {
-    switch (offload_) {
-      case gpuOffloadType::always: {
-        // Offload data each iteration - no requirements
-        break;
-      }
-      case gpuOffloadType::once: {
-        // Offload data from host to the device.
-        cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_,
-                                       cudaMemcpyHostToDevice, s1_));
-        cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_,
-                                       cudaMemcpyHostToDevice, s2_));
-        cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_,
-                                       cudaMemcpyHostToDevice, s3_));
-        break;
-      }
-      case gpuOffloadType::unified: {
-        // Prefetch memory to device
-        cudaCheckError(
-            cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, gpuDevice_, s1_));
-        cudaCheckError(
-            cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, gpuDevice_, s2_));
-        cudaCheckError(
-            cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, gpuDevice_, s3_));
-        break;
-      }
-    }
+    // Prefetch memory to device
+		cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), gpuDevice_,
+																				s1_));
+		cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), gpuDevice_,
+																				s1_));
+		cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), gpuDevice_,
+																				s1_));
+		cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, gpuDevice_,
+																				s1_));
+		cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges,
+																				gpuDevice_, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1),
+																				gpuDevice_, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), gpuDevice_,
+																				s2_));
+		cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), gpuDevice_,
+																				s2_));
+		cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), gpuDevice_,
+																				s2_));
+		cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, gpuDevice_,
+																				s2_));
+		cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges,
+																				gpuDevice_, s2_));
+		cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1),
+																				gpuDevice_, s2_));
+//		cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_,
+//																				s3_));
+//		cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_,
+//																				s3_));
+//		cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_,
+//																				s3_));
+//		cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_,
+//																				s3_));
+//		cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges,
+//																				gpuDevice_, s3_));
+//		cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges,
+//																				gpuDevice_, s3_));
+		// Create the CSR matrices on the device
+		cusparseCreateCsr(descrA_, n_, n_, A_nnz_, A_row_, A_col_, A_val_,
+											CUSPARSE_INDEX_BASE_ZERO, cudaDateType_);
+		cusparseCreateCsr(descrB_, n_, n_, B_nnz_, B_row_, B_col_, B_val_,
+											CUSPARSE_INDEX_BASE_ZERO, cudaDateType_);
+		cusparseCreateCsr(descrC_, n_, n_, 0, NULL, NULL, NULL,
+											CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
+		cusparseSpGEMM_createDescr(&spgemmDesc_);
   /** Make a call to the BLAS Library Kernel. */
   void callGemm() override {
-    switch (offload_) {
-      case gpuOffloadType::always: {
-        // Offload data from host to the device.
-        cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_,
-                                       cudaMemcpyHostToDevice, s1_));
-        cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_,
-                                       cudaMemcpyHostToDevice, s2_));
-        cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_,
-                                       cudaMemcpyHostToDevice, s3_));
-        break;
-      }
-      case gpuOffloadType::once: {
-        // Call cuSPRASE SpGEMM kernel
-				// ToDo -- implement
-        break;
-      }
-      case gpuOffloadType::unified: {
-        // Call cuSPARSE SpGEMM kernel
-				// ToDo -- implement
-        break;
-      }
-    }
-  }
+    cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+																	 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+																	 descrA_, descrB_, &beta, descrC_,
+																	 CUSPARSE_SPGEMM_DEFAULT, cudaDataType_,
+																	 spgemmDesc_, buffer_size1_, NULL);
+		cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_));
+    cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+																	 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+																	 descrA_, descrB_, &beta, descrC_,
+																	 CUSPARSE_SPGEMM_DEFAULT, cudaDataType_,
+																	 spgemmDesc_, buffer_size1_, buffer1_);
+		cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+													 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
+													 descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT,
+													 cudaDataType_, spgemmDesc_, buffer_size2_, NULL);
+		cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2));
+		if (cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+													 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
+													 descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT,
+													 cudaDataType_, spgemmDesc_, buffer_size2_, buffer2_)
+			std::cout << "Insufficient resources" << std::endl;
+			exit(1);
+		}
+		int rows, cols, nnz;
+		cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz_);
+		C_nnz_ = nnz;
+		cudaCheckError(cudaMallocManaged(C_val_), sizeof(T) * nnz);
+		cudaCheckError(cudaMallocManaged(C_col_), sizeof(int) * nnz);
+		cudaCheckError(cudaMallocManaged(C_row_), sizeof(int) * (n_ + 1));
+		cusparseCstSetPointers(descrC_, *C_row, *C_colind, *C_val);
+												CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
+												descrB_, &beta, descrC_, CUDA_R_32F,
+												CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_);
+	}
   /** Perform any required steps after calling the GEMM kernel that should
    * be timed. */
-	// ToDo -- check that this all still works
   void postLoopRequirements() override {
-    switch (offload_) {
-      case gpuOffloadType::always: {
-        // Offload data each iteration - no requirements
-        break;
-      }
-      case gpuOffloadType::once: {
-        // Offload data from device to host
-        cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_,
-                                       cudaMemcpyDeviceToHost, s1_));
-        cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_,
-                                       cudaMemcpyDeviceToHost, s2_));
-        cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_,
-                                       cudaMemcpyDeviceToHost, s3_));
-        // Ensure device has finished all work.
-        cudaCheckError(cudaDeviceSynchronize());
-        break;
-      }
-      case gpuOffloadType::unified: {
-        // Ensure all data resides on host once work has completed
-        cudaCheckError(cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_,
-                                            cudaCpuDeviceId, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_,
-                                            cudaCpuDeviceId, s2_));
-        cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_,
-                                            cudaCpuDeviceId, s3_));
-        // Ensure device has finished all work.
-        cudaCheckError(cudaDeviceSynchronize());
-        break;
-      }
-    }
+    // Ensure all data resides on host once work has completed
+		cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int),
+																				cudaCpuDeviceId_, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int),
+																				cudaCpuDeviceId_, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int),
+																				cudaCpuDeviceId_, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges,
+																				cudaCpuDeviceId_, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges,
+																				cudaCpuDeviceId_, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1),
+																				cudaCpuDeviceId_, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int),
+																				cudaCpuDeviceId_, s2_));
+		cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int),
+																				cudaCpuDeviceId_, s2_));
+		cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int),
+																				cudaCpuDeviceId_, s2_));
+		cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges,
+																				cudaCpuDeviceId_, s2_));
+		cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges,
+																				cudaCpuDeviceId_, s2_));
+		cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1),
+																				cudaCpuDeviceId_, s2_));
+		cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int),
+																				cudaCpuDeviceId_, s3_));
+		cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int),
+																				cudaCpuDeviceId_, s3_));
+		cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int),
+																				cudaCpuDeviceId_, s3_));
+		cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * C_nnz_,
+																				cudaCpuDeviceId_, s3_));
+		cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * C_nnz_,
+																				cudaCpuDeviceId_, s3_));
+		cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1),
+																				cudaCpuDeviceId_, s3_));
+    // Ensure device has finished all work.
+    cudaCheckError(cudaDeviceSynchronize());
   /** Do any necessary cleanup (free pointers, close library handles, etc.)
@@ -229,65 +276,76 @@ class sp_gemm_gpu : public gemm<T> {
-    if (offload_ == gpuOffloadType::unified) {
-      cudaFree(A_);
-      cudaFree(B_);
-      cudaFree(C_);
-    } else {
-      // Free the memory held on host and device
-      free(A_);
-      free(B_);
-      free(C_);
-      cudaFree(A_device_);
-      cudaFree(B_device_);
-      cudaFree(C_device_);
-    }
+    cudaFree(A_);
+    cudaFree(B_);
+    cudaFree(C_);
 	bool rMat(T* M, int n, int x1, int x2, int y1, int y2,
 					        float a, float b, float c, std::default_random_engine* gen,
 					        std::uniform_real_distribution<double> dist, bool bin) {
-			// If a 1x1 submatrix, then add an edge and return out
-			if (x1 >= x2 && y1 >= y2) {
-				if (abs(M[(y1 * n) + x1]) > 0.1) {
-					return false;
-				} else {
-					// Add 1.0 if this is a binary graph, and a random real number otherwise
-					M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
-									100.0) - 50.0);
-					return true;
-				}
+		// If a 1x1 submatrix, then add an edge and return out
+		if (x1 >= x2 && y1 >= y2) {
+			if (abs(M[(y1 * n) + x1]) > 0.1) {
+				return false;
+			} else {
+				// Add 1.0 if this is a binary graph, and a random real number otherwise
+				M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
+								100.0) - 50.0);
+				return true;
+			}
+		} else {
+			// Divide up the matrix
+			int xMidPoint = x1 + floor((x2 - x1) / 2);
+			int yMidPoint = y1 + floor((y2 - y1) / 2);
+			// ToDo -- add some noise to these values between iterations
+			float newA = a;
+			float newB = b;
+			float newC = c;
+			// Work out which quarter to recurse into
+			// There are some ugly ternary operators here to avoid going out of bounds in the edge case
+			// that we are already at 1 width or 1 height
+			float randomNum = dist(*gen);
+			if (randomNum < a) {
+				return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
+				            newA, newB, newC, gen, dist, bin);
+			} else if (randomNum < (a + b)) {
+				return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
+				            newA, newB, newC, gen, dist, bin);
+			} else if (randomNum < (a + b + c)) {
+				return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
+				            newA, newB, newC, gen, dist, bin);
 			} else {
-				// Divide up the matrix
-				int xMidPoint = x1 + floor((x2 - x1) / 2);
-				int yMidPoint = y1 + floor((y2 - y1) / 2);
-				// ToDo -- add some noise to these values between iterations
-				float newA = a;
-				float newB = b;
-				float newC = c;
-				// Work out which quarter to recurse into
-				// There are some ugly ternary operators here to avoid going out of bounds in the edge case
-				// that we are already at 1 width or 1 height
-				float randomNum = dist(*gen);
-				if (randomNum < a) {
-					return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
-					            newA, newB, newC, gen, dist, bin);
-				} else if (randomNum < (a + b)) {
-					return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
-					            newA, newB, newC, gen, dist, bin);
-				} else if (randomNum < (a + b + c)) {
-					return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
-					            newA, newB, newC, gen, dist, bin);
-				} else {
-					return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
-					            ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC,
-					            gen, dist, bin);
+				return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
+				            ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC,
+				            gen, dist, bin);
+			}
+		}
+		return true;
+	}
+	void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index,
+						 int* row_ptr) {
+		int nnz_encountered = 0;
+		int prev_row_ptr = 0;
+		for (int row = 0; row < n_row; row++) {
+			if (nnz_encountered >= nnz) break;
+			row_ptr[row] = prev_row_ptr;
+			int nnz_row = 0;
+			for (int col = 0; col < n_col; col++) {
+				if (nnz_encountered >= nnz) break;
+				if (dense[(row * n_col) + col] != 0.0) {
+					nnz_row++;
+					col_index[nnz_encountered] = col;
+					vals[nnz_encountered] = dense[(row * n_col) + col];
+					nnz_encountered++;
-			return true;
+			prev_row_ptr += nnz_row;
+	}
   /** Handle used when calling cuBLAS. */
   cublasHandle_t handle_;
@@ -307,27 +365,36 @@ class sp_gemm_gpu : public gemm<T> {
   /** The ID of the target GPU Device. */
   int gpuDevice_;
-  /** Input matrix A, held on the device. */
-  T* A_device_;
-  /** Input matrix B, held on the device. */
-  T* B_device_;
-  /** Input matrix C, held on the device. */
-  T* C_device_;
-	/** Vector for number non-zeros, held on the device */
-//	int* dANnzPerRow;
-	/** CSR format vectors for matrices A, B and C on the device */
+	/** CSR format vectors for matrices A, B and C on the host */
+	int A_nnz_, B_nnz_, C_nnz_;
 	T* A_val_, B_val_, C_val_;
 	int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_;
+  /** CSR format vectors for matrices A, B and C on the device. */
+	int A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_,
+	B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_;
+	T* A_val_dev_, B_val_dev_, C_val_dev_;
+	int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_;
   /** The constant value Alpha. */
   const T alpha = ALPHA;
   /** The constant value Beta. */
   const T beta = BETA;
+	// Create descriptors for matrices A->C
+	cusparseMatDescr_t descrA_, descrB_, descrC_;
+	// index type depends on kernel being run
+	cusparseIndexType_t cudaDataType_;
+	cusparceSpGEMMDescr_t spgemmDesc_;
+	size_t buffer_size1_ = 0;
+	size_t buffer_size2_ = 0;
+  void* buffer1_ = NULL;
+	void* buffer2_ = NULL;
 }  // namespace gpu
\ No newline at end of file

From 37ce8b4c32b7b04caae5a4dbc697b21086447c9f Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Thu, 21 Mar 2024 13:08:49 +0000
Subject: [PATCH 07/38] Now compiles

 DefaultGPU/sp_gemm.hh          |   2 +-
 Makefile                       |   2 +-
 cuBLAS/sp_gemm.hh              | 228 +++++++++++++++------------------
 include/doGemm.hh              |   7 +-
 include/kernels/GPU/sp_gemm.hh |   2 +-
 5 files changed, 112 insertions(+), 129 deletions(-)

diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh
index 92d157c..2a9f478 100644
--- a/DefaultGPU/sp_gemm.hh
+++ b/DefaultGPU/sp_gemm.hh
@@ -22,7 +22,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Initialise the required data structures. */
-  void initialise(gpuOffloadType offload, int m, int n, int k) override {
+  void initialise(gpuOffloadType offload, int n, float sparsity) override {
     // Default GPU implementation - do nothing.
diff --git a/Makefile b/Makefile
index 5dd2fc5..bff0add 100644
--- a/Makefile
+++ b/Makefile
@@ -177,7 +177,7 @@ $(info $(TAB)$(TAB)Add `CXXFLAGS=-L<NVHPC_DIR>/.../math_libs/lib64 -L<NVHPC_DIR>
 $(info $(TAB)$(TAB)Add `CXXFLAGS=-I<NVHPC_DIR>/.../math_libs/include -I<NVHPC_DIR>/.../cuda/include` to make command)
 $(info $(TAB)$(TAB)Add `CXXFLAGS=-Wl,-rpath,<NVHPC_DIR>/.../math_libs/lib64 -Wl,-rpath,<NVHPC_DIR>/.../cuda/lib64` to make command)
 $(info )
-override CXXFLAGS += -lcublas -lcudart
+override CXXFLAGS += -lcublas -lcudart -lcusparse
 HEADER_FILES += $(wildcard cuBLAS/*.hh)
diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 68e3b84..c0bfb8e 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -2,24 +2,27 @@
 #ifdef GPU_CUBLAS
 #include "cusparse.h"
+#include <cublas_v2.h>
 #include <cuda_runtime.h>
 #include <type_traits>
+#include <random>
+#include <iostream>
-#include "../include/kernels/GPU/gemm.hh"
+#include "../include/kernels/GPU/sp_gemm.hh"
 #include "../include/utilities.hh"
 #include "common.hh"
 namespace gpu {
 /** A class for GEMM GPU BLAS kernels. */
 template <typename T>
-class sp_gemm_gpu : public gemm<T> {
+class sp_gemm_gpu : public sp_gemm<T> {
-  using gemm<T>::gemm;
-  using gemm<T>::n_;
-  using gemm<T>::A_;
-  using gemm<T>::B_;
-  using gemm<T>::C_;
-  using gemm<T>::offload_;
+  using sp_gemm<T>::sp_gemm;
+  using sp_gemm<T>::n_;
+  using sp_gemm<T>::A_;
+  using sp_gemm<T>::B_;
+  using sp_gemm<T>::C_;
+  using sp_gemm<T>::offload_;
 	// ToDo -- just unified implemented so far.  Fill in Always and Once later
@@ -31,63 +34,50 @@ class sp_gemm_gpu : public gemm<T> {
    *  - Unified: Initialise data as unified memory; no data movement semantics
    *             required */
   void initialise(gpuOffloadType offload, int n, float sparsity) override {
+    std::cout << "Initialising" << std::endl;
     offload_ = offload;
 		// Create a handle for cuSPARSE
+    std::cout << "Handle created" << std::endl;
-		cudaDataType_ = (std::is_same_v<T, float>) ? CUDA_R_32F :
-						CUDA_R_64F;
+		if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F;
+    else if (std::is_same_v<T, double>) cudaDataType_ = CUDA_R_64F;
+    else {
+      std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
+      exit(1);
+    }
     n_ = n;
-		cusparseCreateMatDescr(&descrA);
-		cusparseCreateMatDescr(&descrB);
-		cusparseCreateMatDescr(&descrC);
-		cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
-		cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
-		cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
-		cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
-		cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
-		cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
     // Get device identifier
+    std::cout << "GPU device got" << std::endl;
     // Initialise 3 streams to asynchronously move data between host and device
+    std::cout << "Streams created" << std::endl;
 		// Work out number of edges needed to achieve target sparsity
 		int edges = 1 + (int) (n_ * n_ * (1 - sparsity));
-		A_nnz_ = B_nnz_ = edges
+		(*A_nnz_) = (*B_nnz_) = edges;
 		// ToDo -- for all of this mallocing, bear in mind that row will probably
 		//  have fewer than 'edges' values (thats the whole point).  May need to
 		//  reorganise
-    cudaCheckError(cudaMallocManaged(A_num_rows_, sizeof(int)));
-		cudaCheckError(cudaMallocManaged(A_num_cols_, sizeof(int)));
-		cudaCheckError(cudaMallocManaged(A_nnz_, sizeof(int)));
 		cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges));
 		cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges));
 		cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1)));
+    std::cout << "A CSR vectors malloced" << std::endl;
-		cudaCheckError(cudaMallocManaged(B_num_rows_, sizeof(int)));
-		cudaCheckError(cudaMallocManaged(B_num_cols_, sizeof(int)));
-		cudaCheckError(cudaMallocManaged(B_nnz_, sizeof(int)));
 		cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges));
 		cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges));
 		cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1)));
-		C_val_ = NULL;
-		C_col_ = NULL;
-		C_row_ = NULL;
+    std::cout << "B CSR vectors malloced" << std::endl;
 		// Initialise the host matricies
 		// cusparseSpGEMM() works on CSR format only.  This helpfully makes our
@@ -99,6 +89,13 @@ class sp_gemm_gpu : public gemm<T> {
 			A_[i] = 0.0;
 			B_[i] = 0.0;
+    // Random number generator objects for use in descent
+    std::default_random_engine gen;
+    gen.seed(std::chrono::system_clock::now()
+                     .time_since_epoch().count());
+    std::uniform_real_distribution<double> dist(0.0, 1.0);
 		// Using a=0.45 and b=c=0.22 as default probabilities
 		for (int i = 0; i < edges; i++) {
 			while (!rMat(A_, n, 0, n - 1, 0, n - 1,
@@ -117,34 +114,20 @@ class sp_gemm_gpu : public gemm<T> {
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
   void preLoopRequirements() override {
     // Prefetch memory to device
-		cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), gpuDevice_,
-																				s1_));
-		cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), gpuDevice_,
-																				s1_));
-		cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), gpuDevice_,
-																				s1_));
-		cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, gpuDevice_,
-																				s1_));
-		cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges,
+		cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_),
+                                        gpuDevice_, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_),
 																				gpuDevice_, s1_));
 		cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1),
 																				gpuDevice_, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), gpuDevice_,
-																				s2_));
-		cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), gpuDevice_,
-																				s2_));
-		cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), gpuDevice_,
-																				s2_));
-		cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, gpuDevice_,
-																				s2_));
-		cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges,
+		cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_),
+                                        gpuDevice_, s2_));
+		cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_),
 																				gpuDevice_, s2_));
 		cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1),
 																				gpuDevice_, s2_));
@@ -163,13 +146,13 @@ class sp_gemm_gpu : public gemm<T> {
 //																				gpuDevice_, s3_));
 		// Create the CSR matrices on the device
-		cusparseCreateCsr(descrA_, n_, n_, A_nnz_, A_row_, A_col_, A_val_,
+		cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_,
-											CUSPARSE_INDEX_BASE_ZERO, cudaDateType_);
-		cusparseCreateCsr(descrB_, n_, n_, B_nnz_, B_row_, B_col_, B_val_,
+											CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
+		cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_,
-											CUSPARSE_INDEX_BASE_ZERO, cudaDateType_);
-		cusparseCreateCsr(descrC_, n_, n_, 0, NULL, NULL, NULL,
+											CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
+		cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL,
 											CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
@@ -181,38 +164,40 @@ class sp_gemm_gpu : public gemm<T> {
     cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
 																	 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
 																	 descrA_, descrB_, &beta, descrC_,
-																	 CUSPARSE_SPGEMM_DEFAULT, cudaDataType_,
-																	 spgemmDesc_, buffer_size1_, NULL);
+																	 cudaDataType_, CUSPARSE_SPGEMM_DEFAULT,
+																	 spgemmDesc_, &buffer_size1_, NULL);
 		cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_));
     cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
 																	 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
 																	 descrA_, descrB_, &beta, descrC_,
-																	 CUSPARSE_SPGEMM_DEFAULT, cudaDataType_,
-																	 spgemmDesc_, buffer_size1_, buffer1_);
-		cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+																	 cudaDataType_, CUSPARSE_SPGEMM_DEFAULT,
+																	 spgemmDesc_, &buffer_size1_, buffer1_);
+		cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
 													 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
-													 descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT,
-													 cudaDataType_, spgemmDesc_, buffer_size2_, NULL);
-		cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2));
+													 descrB_, &beta, descrC_, cudaDataType_,
+													 CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_,
+                           &buffer_size2_, NULL);
+		cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_));
-		if (cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+		if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
 													 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
-													 descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT,
-													 cudaDataType_, spgemmDesc_, buffer_size2_, buffer2_)
+													 descrB_, &beta, descrC_, cudaDataType_,
+                           CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_,
+                           &buffer_size2_, buffer2_)
 			std::cout << "Insufficient resources" << std::endl;
-		int rows, cols, nnz;
+		int64_t rows, cols, nnz;
-		cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz_);
-		C_nnz_ = nnz;
-		cudaCheckError(cudaMallocManaged(C_val_), sizeof(T) * nnz);
-		cudaCheckError(cudaMallocManaged(C_col_), sizeof(int) * nnz);
-		cudaCheckError(cudaMallocManaged(C_row_), sizeof(int) * (n_ + 1));
+		cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz);
+		(*C_nnz_) = nnz;
+		cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz));
+		cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz));
+		cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1)));
-		cusparseCstSetPointers(descrC_, *C_row, *C_colind, *C_val);
+		cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_);
 												CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
 												descrB_, &beta, descrC_, CUDA_R_32F,
@@ -223,44 +208,26 @@ class sp_gemm_gpu : public gemm<T> {
    * be timed. */
   void postLoopRequirements() override {
     // Ensure all data resides on host once work has completed
-		cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int),
-																				cudaCpuDeviceId_, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int),
-																				cudaCpuDeviceId_, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int),
-																				cudaCpuDeviceId_, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges,
-																				cudaCpuDeviceId_, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges,
-																				cudaCpuDeviceId_, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_),
+																				cudaCpuDeviceId, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_),
+																				cudaCpuDeviceId, s1_));
 		cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1),
-																				cudaCpuDeviceId_, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int),
-																				cudaCpuDeviceId_, s2_));
-		cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int),
-																				cudaCpuDeviceId_, s2_));
-		cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int),
-																				cudaCpuDeviceId_, s2_));
-		cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges,
-																				cudaCpuDeviceId_, s2_));
-		cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges,
-																				cudaCpuDeviceId_, s2_));
+																				cudaCpuDeviceId, s1_));
+		cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_),
+																				cudaCpuDeviceId, s2_));
+		cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_),
+																				cudaCpuDeviceId, s2_));
 		cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1),
-																				cudaCpuDeviceId_, s2_));
-		cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int),
-																				cudaCpuDeviceId_, s3_));
-		cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int),
-																				cudaCpuDeviceId_, s3_));
-		cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int),
-																				cudaCpuDeviceId_, s3_));
-		cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * C_nnz_,
-																				cudaCpuDeviceId_, s3_));
-		cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * C_nnz_,
-																				cudaCpuDeviceId_, s3_));
+																				cudaCpuDeviceId, s2_));
+		cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * (*C_nnz_),
+																				cudaCpuDeviceId, s3_));
+		cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * (*C_nnz_),
+																				cudaCpuDeviceId, s3_));
 		cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1),
-																				cudaCpuDeviceId_, s3_));
+																				cudaCpuDeviceId, s3_));
     // Ensure device has finished all work.
@@ -348,7 +315,7 @@ class sp_gemm_gpu : public gemm<T> {
   /** Handle used when calling cuBLAS. */
-  cublasHandle_t handle_;
+  cusparseHandle_t handle_;
   /** CUDA Stream 1 - used to asynchronously move data between host and device.
@@ -366,12 +333,29 @@ class sp_gemm_gpu : public gemm<T> {
   int gpuDevice_;
 	/** CSR format vectors for matrices A, B and C on the host */
-	int A_nnz_, B_nnz_, C_nnz_;
-	T* A_val_, B_val_, C_val_;
-	int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_;
+	T* A_val_;
+	int* A_col_;
+  int* A_row_;
+  int* A_num_rows_;
+  int* A_num_cols_;
+  int* A_nnz_;
+  T* B_val_;
+  int* B_col_;
+  int* B_row_;
+  int* B_num_rows_;
+  int* B_num_cols_;
+  int* B_nnz_;
+  T* C_val_;
+  int* C_col_;
+  int* C_row_;
+  int* C_num_rows_;
+  int* C_num_cols_;
+  int*C_nnz_;
   /** CSR format vectors for matrices A, B and C on the device. */
-	int A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_,
+	int* A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_,
 	B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_;
 	T* A_val_dev_, B_val_dev_, C_val_dev_;
 	int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_;
@@ -384,12 +368,12 @@ class sp_gemm_gpu : public gemm<T> {
 	// Create descriptors for matrices A->C
-	cusparseMatDescr_t descrA_, descrB_, descrC_;
+	cusparseSpMatDescr_t descrA_, descrB_, descrC_;
-	// index type depends on kernel being run
-	cusparseIndexType_t cudaDataType_;
+	// Data type depends on kernel being run
+	cudaDataType_t cudaDataType_;
-	cusparceSpGEMMDescr_t spgemmDesc_;
+	cusparseSpGEMMDescr_t spgemmDesc_;
 	size_t buffer_size1_ = 0;
 	size_t buffer_size2_ = 0;
diff --git a/include/doGemm.hh b/include/doGemm.hh
index 4a7c564..5565fb2 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -519,20 +519,19 @@ class doGemm {
 		// Perform the GPU kernels
 		// - ONCE : Offload to/from GPU once before all iterations and once
 		// after
-		spGemmGpu_.initialise(gpuOffloadType::once, N, N, N);
+		spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity);
 		time_checksum_gflop gpuResult_once = gemmGpu_.compute();
 		gpuResult_once.gflops =
 						calcGflops(flops, iterations_, gpuResult_once.runtime);
 		// - ALWAYS: Offload to/from GPU every iteration
-		spGemmGpu_.initialise(gpuOffloadType::always, N, N, N);
+		spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity);
 		time_checksum_gflop gpuResult_always = gemmGpu_.compute();
 		gpuResult_always.gflops =
 						calcGflops(flops, iterations_, gpuResult_always.runtime);
 		// - UNIFIED : data passed from host to device (and device to host) as
 		//             needed
-		spGemmGpu_.initialise(gpuOffloadType::unified, N, N, N);
+		spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
 		time_checksum_gflop gpuResult_unified = gemmGpu_.compute();
 		gpuResult_unified.gflops =
 						calcGflops(flops, iterations_, gpuResult_unified.runtime);
diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh
index 684c166..dbfba87 100644
--- a/include/kernels/GPU/sp_gemm.hh
+++ b/include/kernels/GPU/sp_gemm.hh
@@ -17,7 +17,7 @@ namespace gpu {
 				 *  - Always:  Move data from host to device and device to host each iteration
 				 *  - Unified: Initialise data as unified memory; no data movement semantics
 				 *             required */
-				virtual void initialise(gpuOffloadType offload, int m, int n, int k) = 0;
+				virtual void initialise(gpuOffloadType offload, int n, float sparsity) = 0;
 				/** Whether data should be offloaded to/from the GPU each iteration, or just

From 143c1c041d7da2afda07b27c5c3dbb8b273fab1c Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Mon, 25 Mar 2024 10:11:51 +0000
Subject: [PATCH 08/38] Now compiles with fewer runtime errors

 cuBLAS/sp_gemm.hh | 352 +++++++++++++++++++++++++++-------------------
 include/doGemm.hh |  42 +++---
 2 files changed, 227 insertions(+), 167 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index c0bfb8e..fa0e39d 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -37,12 +37,12 @@ class sp_gemm_gpu : public sp_gemm<T> {
     std::cout << "Initialising" << std::endl;
     offload_ = offload;
-		// Create a handle for cuSPARSE
+    // Create a handle for cuSPARSE
     std::cout << "Handle created" << std::endl;
-		if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F;
+    if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F;
     else if (std::is_same_v<T, double>) cudaDataType_ = CUDA_R_64F;
     else {
       std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
@@ -60,24 +60,38 @@ class sp_gemm_gpu : public sp_gemm<T> {
     std::cout << "Streams created" << std::endl;
+    if (offload_ == gpuOffloadType::unified) {
+      std::cout << "Into unified if statement" << std::endl;
+      A_num_rows_ = (int*)malloc(sizeof(int));
+      A_num_cols_ = (int*)malloc(sizeof(int));
+      A_nnz_ = (int*)malloc(sizeof(int));
+      B_num_rows_ = (int*)malloc(sizeof(int));
+      B_num_cols_ = (int*)malloc(sizeof(int));
+      B_nnz_ = (int*)malloc(sizeof(int));
+      C_num_rows_ = (int*)malloc(sizeof(int));
+      C_num_cols_ = (int*)malloc(sizeof(int));
+      C_nnz_ = (int*)malloc(sizeof(int));
+    }
-		// Work out number of edges needed to achieve target sparsity
-		int edges = 1 + (int) (n_ * n_ * (1 - sparsity));
-		(*A_nnz_) = (*B_nnz_) = edges;
-		// ToDo -- for all of this mallocing, bear in mind that row will probably
-		//  have fewer than 'edges' values (thats the whole point).  May need to
-		//  reorganise
+   // Work out number of edges needed to achieve target sparsity
+    int edges = 1 + (int) (n_ * n_ * (1 - sparsity));
+    (*A_nnz_) = (*B_nnz_) = edges;
-		cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges));
-		cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges));
-		cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1)));
-    std::cout << "A CSR vectors malloced" << std::endl;
+    if (offload_ == gpuOffloadType::unified) {
+      std::cout << "beginning mallocs" << std::endl;
+      cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * (*A_nnz_)));
+      std::cout << "A vals vectors malloced" << std::endl;
+      cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * (*A_nnz_)));
+      std::cout << "A cols vectors malloced" << std::endl;
+      cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1)));
+      std::cout << "A CSR vectors malloced" << std::endl;
-		cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges));
-		cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges));
-		cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1)));
-    std::cout << "B CSR vectors malloced" << std::endl;
+      cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * (*B_nnz_)));
+      cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * (*B_nnz_)));
+      cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1)));
+      std::cout << "B CSR vectors malloced" << std::endl;
+    }
 		// Initialise the host matricies
 		// cusparseSpGEMM() works on CSR format only.  This helpfully makes our
@@ -85,10 +99,12 @@ class sp_gemm_gpu : public sp_gemm<T> {
 		// Initialise the matrices
 		// Set initial values to 0
-		for (int i = 0; i < (n_ * n_); i++) {
-			A_[i] = 0.0;
-			B_[i] = 0.0;
-		}
+    A_ = (T*)malloc(sizeof(T) * n_ * n_);
+    B_ = (T*)malloc(sizeof(T) * n_ * n_);
+    for (int i = 0; i < (n_ * n_); i++) {
+      A_[i] = 0.0;
+      B_[i] = 0.0;
+    }
     // Random number generator objects for use in descent
     std::default_random_engine gen;
@@ -96,19 +112,20 @@ class sp_gemm_gpu : public sp_gemm<T> {
     std::uniform_real_distribution<double> dist(0.0, 1.0);
-		// Using a=0.45 and b=c=0.22 as default probabilities
-		for (int i = 0; i < edges; i++) {
-			while (!rMat(A_, n, 0, n - 1, 0, n - 1,
-			             0.45, 0.22, 0.22,
-			             &gen, dist, false)) {}
-			while (!rMat(B_, n, 0, n - 1, 0, n - 1,
-			             0.45, 0.22, 0.22,
-			             &gen, dist, false)) {}
-		}
-		toCSR(A_, n, n, edges, A_val_, A_col_, A_row_);
-		toCSR(B_, n, n, edges, B_val_, B_col_, B_row_);
+    // Using a=0.45 and b=c=0.22 as default probabilities
+    for (int i = 0; i < (*A_nnz_); i++) {
+      while (!rMat(A_, n, 0, n - 1, 0, n - 1,
+                   0.45, 0.22, 0.22,
+                   &gen, dist, false)) {}
+    }
+    for (int i = 0; i < (*B_nnz_); i++) {
+      while (!rMat(B_, n, 0, n - 1, 0, n - 1,
+                   0.45, 0.22, 0.22,
+                   &gen, dist, false)) {}
+    }
+    toCSR(A_, n, n, (*A_nnz_), A_val_, A_col_, A_row_);
+    toCSR(B_, n, n, (*B_nnz_), B_val_, B_col_, B_row_);
@@ -117,135 +134,178 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
   void preLoopRequirements() override {
-    // Prefetch memory to device
-		cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_),
-                                        gpuDevice_, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_),
-																				gpuDevice_, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1),
-																				gpuDevice_, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_),
-                                        gpuDevice_, s2_));
-		cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_),
-																				gpuDevice_, s2_));
-		cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1),
-																				gpuDevice_, s2_));
-//		cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_,
-//																				s3_));
-//		cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_,
-//																				s3_));
-//		cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_,
-//																				s3_));
-//		cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_,
-//																				s3_));
-//		cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges,
-//																				gpuDevice_, s3_));
-//		cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges,
-//																				gpuDevice_, s3_));
-		// Create the CSR matrices on the device
-		cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_,
-											CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
-		cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_,
-											CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
-		cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL,
-											CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
-		cusparseSpGEMM_createDescr(&spgemmDesc_);
+    switch(offload_) {
+      case gpuOffloadType::always: {
+        break;
+      }
+      case gpuOffloadType::once: {
+        break;
+      }
+      case gpuOffloadType::unified: {
+        // Prefetch memory to device
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_),
+                                            gpuDevice_, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_),
+                                            gpuDevice_, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1),
+                                            gpuDevice_, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_),
+                                            gpuDevice_, s2_));
+        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_),
+                                            gpuDevice_, s2_));
+        cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1),
+                                            gpuDevice_, s2_));
+    //
+    //		cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_,
+    //																				s3_));
+    //		cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_,
+    //																				s3_));
+    //		cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_,
+    //																				s3_));
+    //		cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_,
+    //																				s3_));
+    //		cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges,
+    //																				gpuDevice_, s3_));
+    //		cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges,
+    //																				gpuDevice_, s3_));
+        // Create the CSR matrices on the device
+        cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_,
+                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+                          CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
+        cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_,
+                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+                          CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
+        cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL,
+                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
+                          CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
+        cusparseSpGEMM_createDescr(&spgemmDesc_);
+        break;
+      }
+    }
   /** Make a call to the BLAS Library Kernel. */
   void callGemm() override {
-    cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
-																	 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
-																	 descrA_, descrB_, &beta, descrC_,
-																	 cudaDataType_, CUSPARSE_SPGEMM_DEFAULT,
-																	 spgemmDesc_, &buffer_size1_, NULL);
-		cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_));
-    cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
-																	 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
-																	 descrA_, descrB_, &beta, descrC_,
-																	 cudaDataType_, CUSPARSE_SPGEMM_DEFAULT,
-																	 spgemmDesc_, &buffer_size1_, buffer1_);
-		cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
-													 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
-													 descrB_, &beta, descrC_, cudaDataType_,
-													 CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_,
-                           &buffer_size2_, NULL);
-		cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_));
-		if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
-													 CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
-													 descrB_, &beta, descrC_, cudaDataType_,
-                           CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_,
-                           &buffer_size2_, buffer2_)
-			std::cout << "Insufficient resources" << std::endl;
-			exit(1);
-		}
-		int64_t rows, cols, nnz;
-		cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz);
-		(*C_nnz_) = nnz;
-		cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz));
-		cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz));
-		cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1)));
-		cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_);
-												CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
-												descrB_, &beta, descrC_, CUDA_R_32F,
-												CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_);
+    switch(offload_) {
+      case gpuOffloadType::always: {
+        break;
+      }
+      case gpuOffloadType::once: {
+        break;
+      }
+      case gpuOffloadType::unified: {
+        cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                       CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+                                       descrA_, descrB_, &beta, descrC_,
+                                       cudaDataType_, CUSPARSE_SPGEMM_DEFAULT,
+                                       spgemmDesc_, &buffer_size1_, NULL);
+        cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_));
+        cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                       CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+                                       descrA_, descrB_, &beta, descrC_,
+                                       cudaDataType_, CUSPARSE_SPGEMM_DEFAULT,
+                                       spgemmDesc_, &buffer_size1_, buffer1_);
+        cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
+                               descrB_, &beta, descrC_, cudaDataType_,
+                               CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_,
+                               &buffer_size2_, NULL);
+        cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_));
+        if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
+                               descrB_, &beta, descrC_, cudaDataType_,
+                               CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_,
+                               &buffer_size2_, buffer2_)
+          std::cout << "Insufficient resources" << std::endl;
+          exit(1);
+        }
+        int64_t rows, cols, nnz;
+        cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz);
+        (*C_nnz_) = nnz;
+        cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz));
+        cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz));
+        cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1)));
+        cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_);
+        cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                            CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
+                            descrB_, &beta, descrC_, CUDA_R_32F,
+                            CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_);
+        break;
+      }
+    }
   /** Perform any required steps after calling the GEMM kernel that should
    * be timed. */
   void postLoopRequirements() override {
-    // Ensure all data resides on host once work has completed
-		cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_),
-																				cudaCpuDeviceId, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_),
-																				cudaCpuDeviceId, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1),
-																				cudaCpuDeviceId, s1_));
-		cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_),
-																				cudaCpuDeviceId, s2_));
-		cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_),
-																				cudaCpuDeviceId, s2_));
-		cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1),
-																				cudaCpuDeviceId, s2_));
-		cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * (*C_nnz_),
-																				cudaCpuDeviceId, s3_));
-		cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * (*C_nnz_),
-																				cudaCpuDeviceId, s3_));
-		cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1),
-																				cudaCpuDeviceId, s3_));
-    // Ensure device has finished all work.
-    cudaCheckError(cudaDeviceSynchronize());
+    switch(offload_) {
+      case gpuOffloadType::always: {
+        break;
+      }
+      case gpuOffloadType::once: {
+        break;
+      }
+      case gpuOffloadType::unified: {
+        // Ensure all data resides on host once work has completed
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_),
+                                            cudaCpuDeviceId, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_),
+                                            cudaCpuDeviceId, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1),
+                                            cudaCpuDeviceId, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_),
+                                            cudaCpuDeviceId, s2_));
+        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_),
+                                            cudaCpuDeviceId, s2_));
+        cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1),
+                                            cudaCpuDeviceId, s2_));
+        cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * (*C_nnz_),
+                                            cudaCpuDeviceId, s3_));
+        cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * (*C_nnz_),
+                                            cudaCpuDeviceId, s3_));
+        cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1),
+                                            cudaCpuDeviceId, s3_));
+        // Ensure device has finished all work.
+        cudaCheckError(cudaDeviceSynchronize());
+        break;
+      }
+    }
   /** Do any necessary cleanup (free pointers, close library handles, etc.)
    * after Kernel has been called. */
   void postCallKernelCleanup() override {
-    // Destroy the handle
-    cusparseDestroy(handle_);
-    // Destroy streams after use
-    cudaCheckError(cudaStreamDestroy(s1_));
-    cudaCheckError(cudaStreamDestroy(s2_));
-    cudaCheckError(cudaStreamDestroy(s3_));
+    if (offload_ == gpuOffloadType::unified) {
+      // Destroy the handle
+      cusparseDestroy(handle_);
+      // Destroy streams after use
+      cudaCheckError(cudaStreamDestroy(s1_));
+      cudaCheckError(cudaStreamDestroy(s2_));
+      cudaCheckError(cudaStreamDestroy(s3_));
+    }
-    cudaFree(A_);
-    cudaFree(B_);
-    cudaFree(C_);
+    if (offload_ == gpuOffloadType::unified) {
+      cudaFree(A_val_);
+      cudaFree(A_col_);
+      cudaFree(A_row_);
+      cudaFree(B_val_);
+      cudaFree(B_col_);
+      cudaFree(B_row_);
+      cudaFree(C_val_);
+      cudaFree(C_col_);
+      cudaFree(C_row_);
+    }
 	bool rMat(T* M, int n, int x1, int x2, int y1, int y2,
diff --git a/include/doGemm.hh b/include/doGemm.hh
index 5565fb2..0e4dcc0 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -516,23 +516,23 @@ class doGemm {
 		time_checksum_gflop cpuResult = spGemmCpu_.compute();
 		cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
-		// Perform the GPU kernels
-		// - ONCE : Offload to/from GPU once before all iterations and once
-		// after
-		spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity);
-		time_checksum_gflop gpuResult_once = gemmGpu_.compute();
-		gpuResult_once.gflops =
-						calcGflops(flops, iterations_, gpuResult_once.runtime);
-		// - ALWAYS: Offload to/from GPU every iteration
-		spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity);
-		time_checksum_gflop gpuResult_always = gemmGpu_.compute();
-		gpuResult_always.gflops =
-						calcGflops(flops, iterations_, gpuResult_always.runtime);
-		// - UNIFIED : data passed from host to device (and device to host) as
-		//             needed
+//		// Perform the GPU kernels
+//		// - ONCE : Offload to/from GPU once before all iterations and once
+//		// after
+//		spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity);
+//		time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
+//		gpuResult_once.gflops =
+//						calcGflops(flops, iterations_, gpuResult_once.runtime);
+//		// - ALWAYS: Offload to/from GPU every iteration
+//		spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity);
+//		time_checksum_gflop gpuResult_always = spGemmGpu_.compute();
+//		gpuResult_always.gflops =
+//						calcGflops(flops, iterations_, gpuResult_always.runtime);
+//		// - UNIFIED : data passed from host to device (and device to host) as
+//		//             needed
 		spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
-		time_checksum_gflop gpuResult_unified = gemmGpu_.compute();
+		time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
 		gpuResult_unified.gflops =
 						calcGflops(flops, iterations_, gpuResult_unified.runtime);
@@ -541,11 +541,11 @@ class doGemm {
 		// Write lines to CSV file
 		writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
 		               cpuResult.runtime, cpuResult.gflops);
-		writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
-		               iterations_, gpuResult_once.runtime, gpuResult_once.gflops);
-		writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
-		               iterations_, gpuResult_always.runtime,
-		               gpuResult_always.gflops);
+//		writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
+//		               iterations_, gpuResult_once.runtime, gpuResult_once.gflops);
+//		writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
+//		               iterations_, gpuResult_always.runtime,
+//		               gpuResult_always.gflops);
 		writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
 		               iterations_, gpuResult_unified.runtime,

From bcd7ae88a01ec199951162c3fdba2d41817edff9 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Fri, 11 Oct 2024 15:23:02 +0100
Subject: [PATCH 09/38] rebasing

 cuBLAS/common.hh  |  13 ++
 cuBLAS/sp_gemm.hh | 576 ++++++++++++++++++++++++++++++++++------------
 include/doGemm.hh |  34 +--
 3 files changed, 458 insertions(+), 165 deletions(-)

diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh
index 78d0270..70d58fb 100644
--- a/cuBLAS/common.hh
+++ b/cuBLAS/common.hh
@@ -2,6 +2,9 @@
 #if defined GPU_CUBLAS
+#include "cusparse.h"
+/** Macro function to check if error occurred when calling cuBLAS. */
 /** Macro function to check if error occurred when calling CUDA. */
 #define cudaCheckError(f)                                                \
   do {                                                                   \
@@ -22,4 +25,14 @@
     }                                                                      \
   } while (false)
+#define cusparseCheckError(f)                                                 \
+  do {                                                                        \
+    cusparseStatus_t status = (f);                                            \
+    if (status != CUSPARSE_STATUS_SUCCESS) {                                  \
+      std::cout << "CUSPARSE error: " << __FILE__ << ":" << __LINE__ << ": "  \
+      << cusparseGetErrorString(status) << std::endl;                         \
+      exit(1);                                                                \
+    }                                                                         \
+  } while (false)
\ No newline at end of file
diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index fa0e39d..0879966 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -34,12 +34,9 @@ class sp_gemm_gpu : public sp_gemm<T> {
    *  - Unified: Initialise data as unified memory; no data movement semantics
    *             required */
   void initialise(gpuOffloadType offload, int n, float sparsity) override {
-    std::cout << "Initialising" << std::endl;
-    offload_ = offload;
+    std::cout << "_/_/_/_/ Initialising for problem size: " << n << std::endl;
-    // Create a handle for cuSPARSE
-    cusparseCreate(&handle_);
-    std::cout << "Handle created" << std::endl;
+    offload_ = offload;
     if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F;
@@ -52,45 +49,51 @@ class sp_gemm_gpu : public sp_gemm<T> {
     // Get device identifier
-    std::cout << "GPU device got" << std::endl;
     // Initialise 3 streams to asynchronously move data between host and device
-    std::cout << "Streams created" << std::endl;
-    if (offload_ == gpuOffloadType::unified) {
-      std::cout << "Into unified if statement" << std::endl;
-      A_num_rows_ = (int*)malloc(sizeof(int));
-      A_num_cols_ = (int*)malloc(sizeof(int));
-      A_nnz_ = (int*)malloc(sizeof(int));
-      B_num_rows_ = (int*)malloc(sizeof(int));
-      B_num_cols_ = (int*)malloc(sizeof(int));
-      B_nnz_ = (int*)malloc(sizeof(int));
-      C_num_rows_ = (int*)malloc(sizeof(int));
-      C_num_cols_ = (int*)malloc(sizeof(int));
-      C_nnz_ = (int*)malloc(sizeof(int));
-    }
    // Work out number of edges needed to achieve target sparsity
     int edges = 1 + (int) (n_ * n_ * (1 - sparsity));
-    (*A_nnz_) = (*B_nnz_) = edges;
+    A_nnz_ = B_nnz_ = edges;
     if (offload_ == gpuOffloadType::unified) {
-      std::cout << "beginning mallocs" << std::endl;
-      cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * (*A_nnz_)));
-      std::cout << "A vals vectors malloced" << std::endl;
-      cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * (*A_nnz_)));
-      std::cout << "A cols vectors malloced" << std::endl;
+      cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_));
+      cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_));
       cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1)));
-      std::cout << "A CSR vectors malloced" << std::endl;
-      cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * (*B_nnz_)));
-      cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * (*B_nnz_)));
+      cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * B_nnz_));
+      cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * B_nnz_));
       cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1)));
-      std::cout << "B CSR vectors malloced" << std::endl;
+      cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1)));
+      C_val_ = NULL;
+      C_col_ = NULL;
+    } else {
+      A_val_ = (T*)malloc(sizeof(T) * A_nnz_);
+      A_col_ = (int*)malloc(sizeof(int) * A_nnz_);
+      A_row_ = (int*)malloc(sizeof(int) * (n_ + 1));
+      B_val_ = (T*)malloc(sizeof(T) * B_nnz_);
+      B_col_ = (int*)malloc(sizeof(int) * B_nnz_);
+      B_row_ = (int*)malloc(sizeof(int) * (n_ + 1));
+      C_row_ = (int*)malloc(sizeof(int) * (n_ + 1));
+      cudaCheckError(cudaMalloc((void**)&A_val_dev_, sizeof(T) * A_nnz_));
+      cudaCheckError(cudaMalloc((void**)&A_col_dev_, sizeof(int) * A_nnz_));
+      cudaCheckError(cudaMalloc((void**)&A_row_dev_, sizeof(int) * (n_ + 1)));
+      cudaCheckError(cudaMalloc((void**)&B_val_dev_, sizeof(T) * B_nnz_));
+      cudaCheckError(cudaMalloc((void**)&B_col_dev_, sizeof(int) * B_nnz_));
+      cudaCheckError(cudaMalloc((void**)&B_row_dev_, sizeof(int) * (n_ + 1)));
+      cudaCheckError(cudaMalloc((void**)&C_row_dev_, sizeof(int) * (n_ + 1)));
 		// Initialise the host matricies
@@ -113,75 +116,116 @@ class sp_gemm_gpu : public sp_gemm<T> {
     std::uniform_real_distribution<double> dist(0.0, 1.0);
     // Using a=0.45 and b=c=0.22 as default probabilities
-    for (int i = 0; i < (*A_nnz_); i++) {
-      while (!rMat(A_, n, 0, n - 1, 0, n - 1,
+    for (int i = 0; i < A_nnz_; i++) {
+      while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1,
                    0.45, 0.22, 0.22,
                    &gen, dist, false)) {}
-    for (int i = 0; i < (*B_nnz_); i++) {
-      while (!rMat(B_, n, 0, n - 1, 0, n - 1,
+    for (int i = 0; i < B_nnz_; i++) {
+      while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1,
                    0.45, 0.22, 0.22,
                    &gen, dist, false)) {}
-    toCSR(A_, n, n, (*A_nnz_), A_val_, A_col_, A_row_);
-    toCSR(B_, n, n, (*B_nnz_), B_val_, B_col_, B_row_);
-  }
+    toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_);
+    toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_);
+//    std::cout << "_____Matrix A_____" << std::endl;
+//    printDenseMatrix(A_, n_, n_);
+//    std::cout << std::endl << std::endl;
+//    printCSR(A_val_, A_col_, A_row_, A_nnz_, n_, n_);
+//    std::cout << "_____Matrix B_____" << std::endl;
+//    printDenseMatrix(B_, n_, n_);
+//    std::cout << std::endl << std::endl;
+//    printCSR(B_val_, B_col_, B_row_, B_nnz_, n_, n_);
+    // Create a handle for cuSPARSE
+    cusparseCheckError(cusparseCreate(&handle_));
+  }
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
   void preLoopRequirements() override {
+    std::cout << "\t\tPreLoop" << std::endl;
+    cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_));
     switch(offload_) {
       case gpuOffloadType::always: {
+        // Make matrix descriptors
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+                                  A_col_dev_, A_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
+                                  B_col_dev_, B_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
+                                  rType_, cType_, indType_, cudaDataType_));
       case gpuOffloadType::once: {
+        cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) *
+                                       A_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) *
+                                       A_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_
+                                       + 1), cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) *
+                                       B_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) *
+                                       B_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_
+                                       + 1), cudaMemcpyHostToDevice, s1_));
+        // Create matrix descriptors
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+                                  A_col_dev_, A_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
+                                  B_col_dev_, B_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
+                                  rType_, cType_, indType_, cudaDataType_));
       case gpuOffloadType::unified: {
         // Prefetch memory to device
-        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_,
                                             gpuDevice_, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_,
                                             gpuDevice_, s1_));
         cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1),
                                             gpuDevice_, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_,
                                             gpuDevice_, s2_));
-        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_,
                                             gpuDevice_, s2_));
         cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1),
                                             gpuDevice_, s2_));
-    //
-    //		cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_,
-    //																				s3_));
-    //		cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_,
-    //																				s3_));
-    //		cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_,
-    //																				s3_));
-    //		cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_,
-    //																				s3_));
-    //		cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges,
-    //																				gpuDevice_, s3_));
-    //		cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges,
-    //																				gpuDevice_, s3_));
-        // Create the CSR matrices on the device
-        cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_,
-                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-                          CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
-        cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_,
-                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-                          CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
-        cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL,
-                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-                          CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
-        cusparseSpGEMM_createDescr(&spgemmDesc_);
+        // Make matrix descriptors
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_, A_col_,
+                                  A_val_, rType_, cType_, indType_,
+                                  cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_, B_col_,
+                                  B_val_, rType_, cType_, indType_,
+                                  cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_, NULL, NULL,
+                                  rType_, cType_, indType_, cudaDataType_));
@@ -189,55 +233,208 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Make a call to the BLAS Library Kernel. */
   void callGemm() override {
+    std::cout << "\t\tcallGemm" << std::endl;
     switch(offload_) {
       case gpuOffloadType::always: {
+        cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) *
+        A_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) *
+        A_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_
+                                       + 1), cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) *
+        B_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) *
+        B_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_
+                                       + 1), cudaMemcpyHostToDevice, s1_));
+        cusparseCheckError(
+                cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_,
+                                    descrB_, &beta, descrC_, cudaDataType_,
+                                    alg_, spgemmDesc_));
+        cusparseCheckError(
+                cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha,
+                                              descrA_, descrB_, &beta,
+                                              descrC_, cudaDataType_, alg_,
+                                              spgemmDesc_, &buffer_size1_,
+                                              NULL));
+        cudaCheckError(cudaMalloc((void**)&buffer1_, buffer_size1_));
+        cusparseCheckError(
+                cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha,
+                                              descrA_, descrB_, &beta,
+                                              descrC_, cudaDataType_, alg_,
+                                              spgemmDesc_, &buffer_size1_,
+                                              buffer1_));
+        cusparseCheckError(
+                cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_,
+                                       descrB_, &beta, descrC_, cudaDataType_,
+                                       alg_, spgemmDesc_, &buffer_size2_,
+                                       NULL));
+        cudaCheckError(cudaMalloc((void**)&buffer2_, buffer_size2_));
+        cusparseCheckError(
+                cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_,
+                                       descrB_, &beta, descrC_,
+                                       cudaDataType_, alg_, spgemmDesc_,
+                                       &buffer_size2_, buffer2_));
+        cusparseCheckError(
+                cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
+                                     &C_nnz_));
+        cusparseCheckError(
+                cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
+                                     &C_nnz_));
+        cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_));
+        cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_));
+        cusparseCheckError(
+                cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_,
+                                       C_val_dev_));
+        cusparseCheckError(
+                cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_,
+                                    descrB_, &beta, descrC_, cudaDataType_,
+                                    alg_, spgemmDesc_));
+        cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) *
+        A_nnz_, cudaMemcpyDeviceToHost, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) *
+        A_nnz_, cudaMemcpyDeviceToHost, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) *
+        (n_ + 1), cudaMemcpyDeviceToHost, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) *
+        B_nnz_, cudaMemcpyDeviceToHost, s2_));
+        cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) *
+        B_nnz_, cudaMemcpyDeviceToHost, s2_));
+        cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) *
+        (n_ + 1), cudaMemcpyDeviceToHost, s2_));
+        C_val_ = (T*)malloc(sizeof(T) * C_nnz_);
+        C_col_ = (int*)malloc(sizeof(int) * C_nnz_);
+        cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) *
+        C_nnz_, cudaMemcpyDeviceToHost, s3_));
+        cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) *
+        C_nnz_, cudaMemcpyDeviceToHost, s3_));
+        cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) *
+        (n_ + 1), cudaMemcpyDeviceToHost, s3_));
+        cudaCheckError(cudaDeviceSynchronize());
+        // Freeing memory
+        cudaCheckError(cudaFree(buffer1_));
+        cudaCheckError(cudaFree(buffer2_));
+        cudaCheckError(cudaFree(C_val_dev_));
+        cudaCheckError(cudaFree(C_col_dev_));
+        free(C_val_);
+        free(C_col_);
       case gpuOffloadType::once: {
+        cusparseCheckError(
+                cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_,
+                                    descrB_, &beta, descrC_, cudaDataType_,
+                                    alg_, spgemmDesc_));
+        cusparseCheckError(
+                cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha,
+                                              descrA_, descrB_, &beta,
+                                              descrC_, cudaDataType_, alg_,
+                                              spgemmDesc_, &buffer_size1_,
+                                              NULL));
+        cudaCheckError(cudaMalloc((void**)&buffer1_, buffer_size1_));
+        cusparseCheckError(
+                cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha,
+                                              descrA_, descrB_, &beta,
+                                              descrC_, cudaDataType_, alg_,
+                                              spgemmDesc_, &buffer_size1_,
+                                              buffer1_));
+        cusparseCheckError(
+                cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_,
+                                       descrB_, &beta, descrC_, cudaDataType_,
+                                       alg_, spgemmDesc_, &buffer_size2_,
+                                       NULL));
+        cudaCheckError(cudaMalloc((void**)&buffer2_, buffer_size2_));
+        cusparseCheckError(
+                cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_,
+                               descrB_, &beta, descrC_, cudaDataType_,
+                               alg_, spgemmDesc_, &buffer_size2_, buffer2_));
+        cusparseCheckError(
+                cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
+                                     &C_nnz_));
+        cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_));
+        cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_));
+        cusparseCheckError(
+                cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_,
+                                       C_val_dev_));
+        cusparseCheckError(
+                cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha,
+                                    descrA_, descrB_, &beta, descrC_,
+                                    cudaDataType_, alg_, spgemmDesc_));
+        // Freeing memory
+        cudaCheckError(cudaFree(buffer1_));
+        cudaCheckError(cudaFree(buffer2_));
       case gpuOffloadType::unified: {
-        cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                       CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
-                                       descrA_, descrB_, &beta, descrC_,
-                                       cudaDataType_, CUSPARSE_SPGEMM_DEFAULT,
-                                       spgemmDesc_, &buffer_size1_, NULL);
-        cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_));
-        cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                       CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
-                                       descrA_, descrB_, &beta, descrC_,
-                                       cudaDataType_, CUSPARSE_SPGEMM_DEFAULT,
-                                       spgemmDesc_, &buffer_size1_, buffer1_);
-        cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                               CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
+        cusparseCheckError(
+                cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha,
+                                              descrA_, descrB_, &beta,
+                                              descrC_, cudaDataType_,
+                                              alg_, spgemmDesc_, &buffer_size1_,
+                                              NULL));
+        cudaCheckError(cudaMallocManaged((void**)&buffer1_, buffer_size1_));
+        cusparseCheckError(
+                cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha,
+                                              descrA_, descrB_, &beta,
+                                              descrC_, cudaDataType_,
+                                              alg_, spgemmDesc_, &buffer_size1_,
+                                              buffer1_));
+        cusparseCheckError(
+                cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_,
+                                       descrB_, &beta, descrC_, cudaDataType_,
+                                       alg_, spgemmDesc_, &buffer_size2_,
+                                       NULL));
+        cudaCheckError(cudaMallocManaged((void**)&buffer2_, buffer_size2_));
+        cusparseCheckError(
+                cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_,
                                descrB_, &beta, descrC_, cudaDataType_,
-                               CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_,
-                               &buffer_size2_, NULL);
-        cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_));
+                               alg_, spgemmDesc_, &buffer_size2_, buffer2_));
-        if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                               CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
-                               descrB_, &beta, descrC_, cudaDataType_,
-                               CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_,
-                               &buffer_size2_, buffer2_)
-          std::cout << "Insufficient resources" << std::endl;
-          exit(1);
-        }
-        int64_t rows, cols, nnz;
-        cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz);
-        (*C_nnz_) = nnz;
-        cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz));
-        cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz));
-        cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1)));
-        cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_);
-        cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                            CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
-                            descrB_, &beta, descrC_, CUDA_R_32F,
-                            CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_);
+        cusparseCheckError(
+                cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
+                                     &C_nnz_));
+        cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_));
+        cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_));
+        cusparseCheckError(
+                cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_));
+        cusparseCheckError(
+                cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_,
+                                    descrB_, &beta, descrC_, cudaDataType_,
+                                    alg_, spgemmDesc_));
+        cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_,
+                                            cudaCpuDeviceId, s3_));
+        cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_,
+                                            cudaCpuDeviceId, s3_));
+        // Freeing memory
+        cudaCheckError(cudaFree(buffer1_));
+        cudaCheckError(cudaFree(buffer2_));
+        cudaCheckError(cudaFree(C_val_));
+        cudaCheckError(cudaFree(C_col_));
@@ -246,33 +443,63 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Perform any required steps after calling the GEMM kernel that should
    * be timed. */
   void postLoopRequirements() override {
+    std::cout << "\t\tPostLoop" << std::endl;
+    cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_));
+    // Destroying descriptors
+    cusparseCheckError(cusparseDestroySpMat(descrA_));
+    cusparseCheckError(cusparseDestroySpMat(descrB_));
+    cusparseCheckError(cusparseDestroySpMat(descrC_));
     switch(offload_) {
       case gpuOffloadType::always: {
       case gpuOffloadType::once: {
+        cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) *
+        A_nnz_, cudaMemcpyDeviceToHost, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) *
+        A_nnz_, cudaMemcpyDeviceToHost, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) *
+        (n_ + 1), cudaMemcpyDeviceToHost, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) *
+        B_nnz_, cudaMemcpyDeviceToHost, s2_));
+        cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) *
+        B_nnz_, cudaMemcpyDeviceToHost, s2_));
+        cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) *
+        (n_ + 1), cudaMemcpyDeviceToHost, s2_));
+        C_val_ = (T*)malloc(sizeof(T) * C_nnz_);
+        C_col_ = (int*)malloc(sizeof(int) * C_nnz_);
+        cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) *
+        C_nnz_, cudaMemcpyDeviceToHost, s3_));
+        cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) *
+        C_nnz_, cudaMemcpyDeviceToHost, s3_));
+        cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) *
+        (n_ + 1), cudaMemcpyDeviceToHost, s3_));
+        cudaCheckError(cudaDeviceSynchronize());
+        cudaCheckError(cudaFree(C_val_dev_));
+        cudaCheckError(cudaFree(C_col_dev_));
+        free(C_val_);
+        free(C_col_);
       case gpuOffloadType::unified: {
         // Ensure all data resides on host once work has completed
-        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_,
                                             cudaCpuDeviceId, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_,
                                             cudaCpuDeviceId, s1_));
         cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1),
                                             cudaCpuDeviceId, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_,
                                             cudaCpuDeviceId, s2_));
-        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_,
                                             cudaCpuDeviceId, s2_));
         cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1),
                                             cudaCpuDeviceId, s2_));
-        cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * (*C_nnz_),
-                                            cudaCpuDeviceId, s3_));
-        cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * (*C_nnz_),
-                                            cudaCpuDeviceId, s3_));
         cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1),
                                             cudaCpuDeviceId, s3_));
         // Ensure device has finished all work.
@@ -285,26 +512,39 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Do any necessary cleanup (free pointers, close library handles, etc.)
    * after Kernel has been called. */
   void postCallKernelCleanup() override {
-    if (offload_ == gpuOffloadType::unified) {
-      // Destroy the handle
-      cusparseDestroy(handle_);
+    std::cout << "\t\tPostCall" << std::endl << std::endl;
+    // Destroy the handle
+    cusparseCheckError(cusparseDestroy(handle_));
+    // Destroy streams after use
+    cudaCheckError(cudaStreamDestroy(s1_));
+    cudaCheckError(cudaStreamDestroy(s2_));
+    cudaCheckError(cudaStreamDestroy(s3_));
-      // Destroy streams after use
-      cudaCheckError(cudaStreamDestroy(s1_));
-      cudaCheckError(cudaStreamDestroy(s2_));
-      cudaCheckError(cudaStreamDestroy(s3_));
-    }
     if (offload_ == gpuOffloadType::unified) {
-      cudaFree(A_val_);
-      cudaFree(A_col_);
-      cudaFree(A_row_);
-      cudaFree(B_val_);
-      cudaFree(B_col_);
-      cudaFree(B_row_);
-      cudaFree(C_val_);
-      cudaFree(C_col_);
-      cudaFree(C_row_);
+      cudaCheckError(cudaFree(A_val_));
+      cudaCheckError(cudaFree(A_col_));
+      cudaCheckError(cudaFree(A_row_));
+      cudaCheckError(cudaFree(B_val_));
+      cudaCheckError(cudaFree(B_col_));
+      cudaCheckError(cudaFree(B_row_));
+      cudaCheckError(cudaFree(C_row_));
+    } else {
+      free(A_val_);
+      free(A_col_);
+      free(A_row_);
+      free(B_val_);
+      free(B_col_);
+      free(B_row_);
+      free(C_row_);
+      cudaCheckError(cudaFree(A_val_dev_));
+      cudaCheckError(cudaFree(A_col_dev_));
+      cudaCheckError(cudaFree(A_row_dev_));
+      cudaCheckError(cudaFree(B_val_dev_));
+      cudaCheckError(cudaFree(B_col_dev_));
+      cudaCheckError(cudaFree(B_row_dev_));
+      cudaCheckError(cudaFree(C_row_dev_));
@@ -356,13 +596,10 @@ class sp_gemm_gpu : public sp_gemm<T> {
 	void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index,
 						 int* row_ptr) {
 		int nnz_encountered = 0;
-		int prev_row_ptr = 0;
 		for (int row = 0; row < n_row; row++) {
-			if (nnz_encountered >= nnz) break;
-			row_ptr[row] = prev_row_ptr;
+			row_ptr[row] = nnz_encountered;
 			int nnz_row = 0;
 			for (int col = 0; col < n_col; col++) {
-				if (nnz_encountered >= nnz) break;
 				if (dense[(row * n_col) + col] != 0.0) {
 					col_index[nnz_encountered] = col;
@@ -370,10 +607,41 @@ class sp_gemm_gpu : public sp_gemm<T> {
-			prev_row_ptr += nnz_row;
+    row_ptr[n_row] = nnz_encountered;
+  // ToDo -- the two following functions are useful for debugging.  They are
+  //  kept in for that purpose, though they are not used by the benchmark
+  //  itself
+  void printDenseMatrix(T* M, int rows, int cols) {  // Debug: print M to stdout
+    for (int row = 0; row < rows; row++) {  // one output line per matrix row
+      std::cout << "| ";
+      for (int col = 0; col < cols; col++) {
+        std::cout << M[(row * cols) + col] << " | ";  // (row, col) at row * cols + col
+      }
+      std::cout << std::endl;
+    }
+  }
+  void printCSR(T* values, int* col_indices, int* row_pointers, int nnz,
+                int rows, int cols) {  // Debug: print CSR arrays; cols is unused
+    std::cout << "\tRow pointers__" << std::endl;
+    for (int p = 0; p < (rows + 1); p++) {  // prints rows + 1 row-pointer entries
+      std::cout << row_pointers[p] << ", ";
+    }
+    std::cout << std::endl << "\tColumn Indices__" << std::endl;
+    for (int i = 0; i < nnz; i++) {  // prints nnz column indices
+      std::cout << col_indices[i] << ", ";
+    }
+    std::cout << std::endl << "\tValues__" << std::endl;
+    for (int v = 0; v < nnz; v++) {  // prints nnz values
+      std::cout << values[v] << ", ";
+    }
+    std::cout << std::endl;
+  }
   /** Handle used when calling cuBLAS. */
   cusparseHandle_t handle_;
@@ -396,29 +664,34 @@ class sp_gemm_gpu : public sp_gemm<T> {
 	T* A_val_;
 	int* A_col_;
   int* A_row_;
-  int* A_num_rows_;
-  int* A_num_cols_;
-  int* A_nnz_;
+  int64_t A_num_rows_;
+  int64_t A_num_cols_;
+  int64_t A_nnz_;
   T* B_val_;
   int* B_col_;
   int* B_row_;
-  int* B_num_rows_;
-  int* B_num_cols_;
-  int* B_nnz_;
+  int64_t B_num_rows_;
+  int64_t B_num_cols_;
+  int64_t B_nnz_;
   T* C_val_;
   int* C_col_;
   int* C_row_;
-  int* C_num_rows_;
-  int* C_num_cols_;
-  int*C_nnz_;
+  int64_t C_num_rows_;
+  int64_t C_num_cols_;
+  int64_t C_nnz_;
   /** CSR format vectors for matrices A, B and C on the device. */
-	int* A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_,
-	B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_;
-	T* A_val_dev_, B_val_dev_, C_val_dev_;
-	int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_;
+	T* A_val_dev_;
+  T* B_val_dev_;
+  T* C_val_dev_;
+	int* A_col_dev_;
+  int* A_row_dev_;
+  int* B_col_dev_;
+  int* B_row_dev_;
+  int* C_col_dev_;
+  int* C_row_dev_;
   /** The constant value Alpha. */
   const T alpha = ALPHA;
@@ -439,6 +712,13 @@ class sp_gemm_gpu : public sp_gemm<T> {
 	size_t buffer_size2_ = 0;
   void* buffer1_ = NULL;
 	void* buffer2_ = NULL;
+  cusparseOperation_t opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cusparseOperation_t opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cusparseSpGEMMAlg_t alg_ = CUSPARSE_SPGEMM_DEFAULT;
+  cusparseIndexType_t rType_ = CUSPARSE_INDEX_32I;
+  cusparseIndexType_t cType_ = CUSPARSE_INDEX_32I;
+  cusparseIndexBase_t indType_ = CUSPARSE_INDEX_BASE_ZERO;
 }  // namespace gpu
\ No newline at end of file
diff --git a/include/doGemm.hh b/include/doGemm.hh
index 0e4dcc0..9a66329 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -517,20 +517,20 @@ class doGemm {
 		cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
 //		// Perform the GPU kernels
+    // - ALWAYS: Offload to/from GPU every iteration
+    spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity);
+    time_checksum_gflop gpuResult_always = spGemmGpu_.compute();
+    gpuResult_always.gflops =
+            calcGflops(flops, iterations_, gpuResult_always.runtime);
 //		// - ONCE : Offload to/from GPU once before all iterations and once
 //		// after
-//		spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity);
-//		time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
-//		gpuResult_once.gflops =
-//						calcGflops(flops, iterations_, gpuResult_once.runtime);
-//		// - ALWAYS: Offload to/from GPU every iteration
-//		spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity);
-//		time_checksum_gflop gpuResult_always = spGemmGpu_.compute();
-//		gpuResult_always.gflops =
-//						calcGflops(flops, iterations_, gpuResult_always.runtime);
-//		// - UNIFIED : data passed from host to device (and device to host) as
-//		//             needed
+		spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity);
+		time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
+		gpuResult_once.gflops =
+						calcGflops(flops, iterations_, gpuResult_once.runtime);
+		// - UNIFIED : data passed from host to device (and device to host) as
+		//             needed
 		spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
 		time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
 		gpuResult_unified.gflops =
@@ -541,11 +541,11 @@ class doGemm {
 		// Write lines to CSV file
 		writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
 		               cpuResult.runtime, cpuResult.gflops);
-//		writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
-//		               iterations_, gpuResult_once.runtime, gpuResult_once.gflops);
-//		writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
-//		               iterations_, gpuResult_always.runtime,
-//		               gpuResult_always.gflops);
+		writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
+		               iterations_, gpuResult_once.runtime, gpuResult_once.gflops);
+		writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
+		               iterations_, gpuResult_always.runtime,
+		               gpuResult_always.gflops);
 		writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
 		               iterations_, gpuResult_unified.runtime,

Subject: [PATCH 10/38] All implemented and running.  No checksum at the end

 cuBLAS/sp_gemm.hh | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 0879966..fbd08fd 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -325,10 +325,12 @@ class sp_gemm_gpu : public sp_gemm<T> {
         // Freeing memory
-        cudaCheckError(cudaFree(buffer1_));
-        cudaCheckError(cudaFree(buffer2_));
+        cudaCheckError(cudaFree(buffer1_));
+        cudaCheckError(cudaFree(buffer2_));
+        buffer_size1_ = 0;
+        buffer_size2_ = 0;
@@ -380,8 +382,12 @@ class sp_gemm_gpu : public sp_gemm<T> {
                                     cudaDataType_, alg_, spgemmDesc_));
         // Freeing memory
+        cudaCheckError(cudaFree(C_val_dev_));
+        cudaCheckError(cudaFree(C_col_dev_));
+        buffer_size1_ = 0;
+        buffer_size2_ = 0;
       case gpuOffloadType::unified: {
@@ -414,6 +420,8 @@ class sp_gemm_gpu : public sp_gemm<T> {
                 cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
+        if (C_val_ != NULL) cudaCheckError(cudaFree(C_val_));
+        if (C_val_ != NULL) cudaCheckError(cudaFree(C_col_));
         cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_));
         cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_));
@@ -425,16 +433,11 @@ class sp_gemm_gpu : public sp_gemm<T> {
                                     alg_, spgemmDesc_));
-        cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_,
-                                            cudaCpuDeviceId, s3_));
-        cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_,
-                                            cudaCpuDeviceId, s3_));
         // Freeing memory
-        cudaCheckError(cudaFree(C_val_));
-        cudaCheckError(cudaFree(C_col_));
+        buffer_size1_ = 0;
+        buffer_size2_ = 0;
@@ -468,20 +471,9 @@ class sp_gemm_gpu : public sp_gemm<T> {
         cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) *
         (n_ + 1), cudaMemcpyDeviceToHost, s2_));
-        C_val_ = (T*)malloc(sizeof(T) * C_nnz_);
-        C_col_ = (int*)malloc(sizeof(int) * C_nnz_);
-        cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) *
-        C_nnz_, cudaMemcpyDeviceToHost, s3_));
-        cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) *
-        C_nnz_, cudaMemcpyDeviceToHost, s3_));
         cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) *
         (n_ + 1), cudaMemcpyDeviceToHost, s3_));
-        cudaCheckError(cudaFree(C_val_dev_));
-        cudaCheckError(cudaFree(C_col_dev_));
-        free(C_val_);
-        free(C_col_);
       case gpuOffloadType::unified: {
@@ -675,8 +667,8 @@ class sp_gemm_gpu : public sp_gemm<T> {
   int64_t B_num_cols_;
   int64_t B_nnz_;
-  T* C_val_;
-  int* C_col_;
+  T* C_val_ = NULL;
+  int* C_col_ = NULL;
   int* C_row_;
   int64_t C_num_rows_;
   int64_t C_num_cols_;

 cuBLAS/sp_gemm.hh | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index fbd08fd..01c6edb 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -34,11 +34,8 @@ class sp_gemm_gpu : public sp_gemm<T> {
    *  - Unified: Initialise data as unified memory; no data movement semantics
    *             required */
   void initialise(gpuOffloadType offload, int n, float sparsity) override {
-    std::cout << "_/_/_/_/ Initialising for problem size: " << n << std::endl;
     offload_ = offload;
     if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F;
     else if (std::is_same_v<T, double>) cudaDataType_ = CUDA_R_64F;
     else {
@@ -151,7 +148,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
   void preLoopRequirements() override {
-    std::cout << "\t\tPreLoop" << std::endl;
     switch(offload_) {
       case gpuOffloadType::always: {
@@ -233,7 +229,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Make a call to the BLAS Library Kernel. */
   void callGemm() override {
-    std::cout << "\t\tcallGemm" << std::endl;
     switch(offload_) {
       case gpuOffloadType::always: {
         cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) *
@@ -446,7 +441,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Perform any required steps after calling the GEMM kernel that should
    * be timed. */
   void postLoopRequirements() override {
-    std::cout << "\t\tPostLoop" << std::endl;
     // Destroying descriptors
@@ -504,7 +498,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Do any necessary cleanup (free pointers, close library handles, etc.)
    * after Kernel has been called. */
   void postCallKernelCleanup() override {
-    std::cout << "\t\tPostCall" << std::endl << std::endl;
     // Destroy the handle

 cuBLAS/sp_gemm.hh | 116 +++++++++++++++++++++++++++++-----------------
 include/doGemm.hh |  20 ++++----
 2 files changed, 84 insertions(+), 52 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 01c6edb..db9cf29 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -24,7 +24,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
   using sp_gemm<T>::C_;
   using sp_gemm<T>::offload_;
-	// ToDo -- just unified implemented so far.  Fill in Always and Once later
+	// ToDo -- No checksum for sparse yet.  Need to do
   /** Initialise the required data structures.
    * `offload` refers to the data offload type:
@@ -42,7 +42,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
       std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
-    n_ = n;
+    n_ = n * 20;
     // Get device identifier
@@ -93,6 +93,10 @@ class sp_gemm_gpu : public sp_gemm<T> {
       cudaCheckError(cudaMalloc((void**)&C_row_dev_, sizeof(int) * (n_ + 1)));
+    C_mem_allocated_always_ = false;
+    C_mem_allocated_once_ = false;
+    C_mem_allocated_unified_ = false;
 		// Initialise the host matricies
 		// cusparseSpGEMM() works on CSR format only.  This helpfully makes our
 		// sparse matrix format decision for us!
@@ -148,21 +152,9 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
   void preLoopRequirements() override {
-    cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_));
     switch(offload_) {
       case gpuOffloadType::always: {
-        // Make matrix descriptors
-        cusparseCheckError(
-                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
-                                  A_col_dev_, A_val_dev_, rType_, cType_,
-                                  indType_, cudaDataType_));
-        cusparseCheckError(
-                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
-                                  B_col_dev_, B_val_dev_, rType_, cType_,
-                                  indType_, cudaDataType_));
-        cusparseCheckError(
-                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
-                                  rType_, cType_, indType_, cudaDataType_));
       case gpuOffloadType::once: {
@@ -174,11 +166,14 @@ class sp_gemm_gpu : public sp_gemm<T> {
                                        + 1), cudaMemcpyHostToDevice, s1_));
         cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) *
-                                       B_nnz_, cudaMemcpyHostToDevice, s1_));
+                                       B_nnz_, cudaMemcpyHostToDevice, s2_));
         cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) *
-                                       B_nnz_, cudaMemcpyHostToDevice, s1_));
+                                       B_nnz_, cudaMemcpyHostToDevice, s2_));
         cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_
-                                       + 1), cudaMemcpyHostToDevice, s1_));
+                                       + 1), cudaMemcpyHostToDevice, s2_));
+        cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_
+        + 1), cudaMemcpyHostToDevice, s3_));
         // Craete matrix descriptors
@@ -225,6 +220,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
+    cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_));
   /** Make a call to the BLAS Library Kernel. */
@@ -239,16 +235,27 @@ class sp_gemm_gpu : public sp_gemm<T> {
                                        + 1), cudaMemcpyHostToDevice, s1_));
         cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) *
-        B_nnz_, cudaMemcpyHostToDevice, s1_));
+        B_nnz_, cudaMemcpyHostToDevice, s2_));
         cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) *
-        B_nnz_, cudaMemcpyHostToDevice, s1_));
+        B_nnz_, cudaMemcpyHostToDevice, s2_));
         cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_
-                                       + 1), cudaMemcpyHostToDevice, s1_));
+                                       + 1), cudaMemcpyHostToDevice, s2_));
+        cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_
+        + 1), cudaMemcpyHostToDevice, s3_));
+        // Make matrix descriptors
-                cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_,
-                                    descrB_, &beta, descrC_, cudaDataType_,
-                                    alg_, spgemmDesc_));
+                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+                                  A_col_dev_, A_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
+                                  B_col_dev_, B_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
+                                  rType_, cType_, indType_, cudaDataType_));
                 cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha,
@@ -280,10 +287,10 @@ class sp_gemm_gpu : public sp_gemm<T> {
                 cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
-        cusparseCheckError(
-                cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
-                                     &C_nnz_));
+        if (C_mem_allocated_always_) {
+          cudaCheckError(cudaFree(C_val_dev_));
+          cudaCheckError(cudaFree(C_col_dev_));
+        }
         cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_));
         cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_));
@@ -309,8 +316,14 @@ class sp_gemm_gpu : public sp_gemm<T> {
         cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) *
         (n_ + 1), cudaMemcpyDeviceToHost, s2_));
+        if (C_mem_allocated_always_) {
+          free(C_val_);
+          free(C_col_);
+        }
         C_val_ = (T*)malloc(sizeof(T) * C_nnz_);
         C_col_ = (int*)malloc(sizeof(int) * C_nnz_);
+        C_mem_allocated_always_ = true;
         cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) *
         C_nnz_, cudaMemcpyDeviceToHost, s3_));
         cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) *
@@ -320,22 +333,13 @@ class sp_gemm_gpu : public sp_gemm<T> {
         // Freeing memory
-        cudaCheckError(cudaFree(C_val_dev_));
-        cudaCheckError(cudaFree(C_col_dev_));
         buffer_size1_ = 0;
         buffer_size2_ = 0;
-        free(C_val_);
-        free(C_col_);
       case gpuOffloadType::once: {
-        cusparseCheckError(
-                cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_,
-                                    descrB_, &beta, descrC_, cudaDataType_,
-                                    alg_, spgemmDesc_));
                 cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha,
                                               descrA_, descrB_, &beta,
@@ -365,8 +369,13 @@ class sp_gemm_gpu : public sp_gemm<T> {
                 cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
+        if (C_mem_allocated_once_) {
+          cudaCheckError(cudaFree(C_val_dev_));
+          cudaCheckError(cudaFree(C_col_dev_));
+        }
         cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_));
         cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_));
+        C_mem_allocated_once_ = true;
                 cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_,
@@ -377,8 +386,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
                                     cudaDataType_, alg_, spgemmDesc_));
         // Freeing memory
-        cudaCheckError(cudaFree(C_val_dev_));
-        cudaCheckError(cudaFree(C_col_dev_));
         buffer_size1_ = 0;
@@ -415,10 +422,14 @@ class sp_gemm_gpu : public sp_gemm<T> {
                 cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
-        if (C_val_ != NULL) cudaCheckError(cudaFree(C_val_));
-        if (C_val_ != NULL) cudaCheckError(cudaFree(C_col_));
+        if (C_mem_allocated_unified_) {
+          cudaCheckError(cudaFree(C_val_));
+          cudaCheckError(cudaFree(C_col_));
+        }
         cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_));
         cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_));
+        C_mem_allocated_unified_ = true;
                 cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_));
@@ -445,7 +456,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
     // Destroying descriptors
-    cusparseCheckError(cusparseDestroySpMat(descrC_));
     switch(offload_) {
       case gpuOffloadType::always: {
@@ -465,12 +475,19 @@ class sp_gemm_gpu : public sp_gemm<T> {
         cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) *
         (n_ + 1), cudaMemcpyDeviceToHost, s2_));
+        C_val_ = (T*)malloc(sizeof(T) * C_nnz_);
+        C_col_ = (int*)malloc(sizeof(int) * C_nnz_);
+        cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) *
+        C_nnz_, cudaMemcpyDeviceToHost, s3_));
+        cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) *
+        C_nnz_, cudaMemcpyDeviceToHost, s3_));
         cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) *
         (n_ + 1), cudaMemcpyDeviceToHost, s3_));
       case gpuOffloadType::unified: {
+        cusparseCheckError(cusparseDestroySpMat(descrC_));
         // Ensure all data resides on host once work has completed
         cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_,
                                             cudaCpuDeviceId, s1_));
@@ -486,6 +503,10 @@ class sp_gemm_gpu : public sp_gemm<T> {
         cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1),
                                             cudaCpuDeviceId, s2_));
+//        cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_,
+//                                            cudaCpuDeviceId, s3_));
+//        cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_,
+//                                            cudaCpuDeviceId, s3_));
         cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1),
                                             cudaCpuDeviceId, s3_));
         // Ensure device has finished all work.
@@ -506,7 +527,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
     if (offload_ == gpuOffloadType::unified) {
@@ -514,6 +534,8 @@ class sp_gemm_gpu : public sp_gemm<T> {
+      cudaCheckError(cudaFree(C_val_));
+      cudaCheckError(cudaFree(C_col_));
     } else {
@@ -522,6 +544,8 @@ class sp_gemm_gpu : public sp_gemm<T> {
+      free(C_val_);
+      free(C_col_);
@@ -529,6 +553,8 @@ class sp_gemm_gpu : public sp_gemm<T> {
+      cudaCheckError(cudaFree(C_val_dev_));
+      cudaCheckError(cudaFree(C_col_dev_));
@@ -678,6 +704,10 @@ class sp_gemm_gpu : public sp_gemm<T> {
   int* C_col_dev_;
   int* C_row_dev_;
+  bool C_mem_allocated_always_;
+  bool C_mem_allocated_once_;
+  bool C_mem_allocated_unified_;
   /** The constant value Alpha. */
   const T alpha = ALPHA;
diff --git a/include/doGemm.hh b/include/doGemm.hh
index 9a66329..8743314 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -516,25 +516,27 @@ class doGemm {
 		time_checksum_gflop cpuResult = spGemmCpu_.compute();
 		cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
-//		// Perform the GPU kernels
+		// Perform the GPU kernels
+    // - UNIFIED : data passed from host to device (and device to host) as
+    //             needed
+    spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
+    time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
+    gpuResult_unified.gflops =
+    calcGflops(flops, iterations_, gpuResult_unified.runtime);
     // - ALWAYS: Offload to/from GPU every iteration
     spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity);
     time_checksum_gflop gpuResult_always = spGemmGpu_.compute();
     gpuResult_always.gflops =
             calcGflops(flops, iterations_, gpuResult_always.runtime);
-//		// - ONCE : Offload to/from GPU once before all iterations and once
-//		// after
+		// - ONCE : Offload to/from GPU once before all iterations and once
+		// after
 		spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity);
 		time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
 		gpuResult_once.gflops =
 						calcGflops(flops, iterations_, gpuResult_once.runtime);
-		// - UNIFIED : data passed from host to device (and device to host) as
-		//             needed
-		spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
-		time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
-		gpuResult_unified.gflops =
-						calcGflops(flops, iterations_, gpuResult_unified.runtime);
 		// ToDo -- non-default GPU operations

 cuBLAS/sp_gemm.hh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index db9cf29..0848bb6 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -155,6 +155,18 @@ class sp_gemm_gpu : public sp_gemm<T> {
     switch(offload_) {
       case gpuOffloadType::always: {
+        // Make matrix descriptors
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+                                  A_col_dev_, A_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
+                                  B_col_dev_, B_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
+                                  rType_, cType_, indType_, cudaDataType_));
       case gpuOffloadType::once: {

 include/doGemm.hh     | 44 ++++++++++++++----------
 include/main.hh       |  2 +-
 oneMKL/CPU/sp_gemm.hh | 79 +++++++++++++++++++++++++++++++++++++++++++
 src/           |  3 +-
 4 files changed, 108 insertions(+), 20 deletions(-)
 create mode 100644 oneMKL/CPU/sp_gemm.hh

diff --git a/include/doGemm.hh b/include/doGemm.hh
index 8743314..8153651 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -267,9 +267,7 @@ class doGemm {
     if (doCPU_ && doGPU_) {
       // Print offload results to stdout
       printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)");
-    }
     // Square x Short and Wide
     // Re-initialise offload threshold structures & previous results
     cpuGpu_always_ = cpuGpu_offloadThreshold();
@@ -295,7 +293,7 @@ class doGemm {
-    // Square sparse matrix - sparse matrix multiplication
+// Square sparse matrix - sparse matrix multiplication
     cpuGpu_always_ = cpuGpu_offloadThreshold();
     cpuGpu_once_ = cpuGpu_offloadThreshold();
     cpuGpu_unified_ = cpuGpu_offloadThreshold();
@@ -309,6 +307,12 @@ class doGemm {
     // Close file
+    // Report the sparse offload thresholds only when both the CPU and the GPU
+    // kernels are enabled (same guard as the dense problem types above).
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Sparse Square");
+    }
@@ -512,14 +516,20 @@ class doGemm {
 		const uint64_t flops = calcFlops(N, N, N);
 		std::string kernelName = getKernelName();
-		spGemmCpu_.initialise(N, sparsity);
-		time_checksum_gflop cpuResult = spGemmCpu_.compute();
-		cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
-		// Perform the GPU kernels
+    if (doCPU_) {
+      spGemmCpu_.initialise(N, sparsity);
+      time_checksum_gflop cpuResult = spGemmCpu_.compute();
+      cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
+		writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
+		               cpuResult.runtime, cpuResult.gflops);
+    }
+    // Perform the GPU kernels
     // - UNIFIED : data passed from host to device (and device to host) as
     //             needed
+    if (doGPU_) {
     spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
     time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
     gpuResult_unified.gflops =
@@ -536,13 +546,9 @@ class doGemm {
 		time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
 		gpuResult_once.gflops =
 						calcGflops(flops, iterations_, gpuResult_once.runtime);
 		// ToDo -- non-default GPU operations
 		// Write lines to CSV file
-		writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
-		               cpuResult.runtime, cpuResult.gflops);
 		writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
 		               iterations_, gpuResult_once.runtime, gpuResult_once.gflops);
 		writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
@@ -551,6 +557,10 @@ class doGemm {
 		writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
 		               iterations_, gpuResult_unified.runtime,
+    }
   /** A function for calculating FLOPs performed by a GEMM.
@@ -589,7 +599,7 @@ class doGemm {
   /** Print to stdout the offload thresholds. */
-  void printOffloadThreshold(std::string problemName) const {
+  void printOffloadThreshold(const std::string& problemName) const {
     std::vector<std::string> header = {
         "Device",  "M",          "N", "K", "Total Prob. Size (KiB)",
         "GFLOP/s", "CPU GFLOP/s"};
@@ -686,16 +696,14 @@ class doGemm {
   /** The GEMM CPU kernel. */
   cpu::gemm_cpu<T> gemmCpu_;
+  cpu::sp_gemm_cpu<T> spGemmCpu_;
-	cpu::sp_gemm_cpu<T> spGemmCpu_;
   /** The GEMM GPU kernel. */
   gpu::gemm_gpu<T> gemmGpu_;
 	gpu::sp_gemm_gpu<T> spGemmGpu_;
   /** The point at which offloading to GPU (offload once) becomes worthwhile. */
   cpuGpu_offloadThreshold cpuGpu_once_;
diff --git a/include/main.hh b/include/main.hh
index cc0bb8f..f12ebcb 100644
--- a/include/main.hh
+++ b/include/main.hh
@@ -15,4 +15,4 @@ void printBenchmarkConfig(const int iters, const int upperLimit);
 int parseInt(const char* str);
 /** A function which parsen the runtime arguments. */
-void getParameters(int argc, char* argv[]);
\ No newline at end of file
+void getParameters(int argc, char** argv);
\ No newline at end of file
diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh
new file mode 100644
index 0000000..847006b
--- /dev/null
+++ b/oneMKL/CPU/sp_gemm.hh
@@ -0,0 +1,79 @@
+#pragma once
+#ifdef CPU_ONEMKL
+#include <mkl.h>
+#include <algorithm>
+#include "../../include/kernels/CPU/sp_gemm.hh"
+#include "../../include/utilities.hh"
+namespace cpu {
+/** A class for GEMM CPU BLAS kernels. */
+template <typename T>
+class sp_gemm_cpu : public sp_gemm<T> {
+ public:
+  using sp_gemm<T>::sp_gemm;
+  using sp_gemm<T>::initInputMatrices;
+  using sp_gemm<T>::callConsume;
+  using sp_gemm<T>::m_;
+  using sp_gemm<T>::n_;
+  using sp_gemm<T>::k_;
+  using sp_gemm<T>::A_;
+  using sp_gemm<T>::B_;
+  using sp_gemm<T>::C_;
+  /** Initialise the required data structures. */
+  void initialise(int m, int n, int k) {
+    m_ = m;
+    n_ = n;
+    k_ = k;
+    A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64);
+    B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64);
+    C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64);  // 64 is the alignment argument, as for A_ and B_
+    // Fill the freshly allocated matrices via the inherited initInputMatrices()
+    initInputMatrices();
+  }
+ private:
+  /** Make call to the GEMM kernel.
+   * Dispatches on T at compile time: float calls cblas_sgemm and double calls
+   * cblas_dgemm, both with CblasColMajor / CblasNoTrans, scalars ALPHA and
+   * BETA, and leading dimensions max(1, m_) for A and C and max(1, k_) for B.
+   * Any other T prints an error and exits with status 1.
+   * NOTE(review): this is the dense CBLAS GEMM although the class is named
+   * sp_gemm_cpu - confirm whether a sparse oneMKL routine is intended here. */
+  void callGemm() override {
+    if constexpr (std::is_same_v<T, float>) {
+      cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_,
+                  (float)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_),
+                  (float)BETA, C_, std::max(1, m_));
+    } else if constexpr (std::is_same_v<T, double>) {
+      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_,
+                  (double)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_),
+                  (double)BETA, C_, std::max(1, m_));
+    } else {
+      // Un-specialised class will not do any work - print error and exit.
+      std::cout << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported."
+                << std::endl;
+      exit(1);
+    }
+    // Ensure compiler doesn't optimise away the work being done
+    callConsume();
+  }
+  /** Perform any required steps before calling the GEMM kernel that should
+   * be timed. Intentionally empty: this class does no pre-loop work. */
+  void preLoopRequirements() override {}
+  /** Perform any required steps after calling the GEMM kernel that should
+   * be timed. Intentionally empty: this class does no post-loop work. */
+  void postLoopRequirements() override {}
+  /** Do any necessary cleanup (free pointers, close library handles, etc.)
+   * after Kernel has been called: calls mkl_free_buffers() and then hands
+   * A_, B_ and C_ back to mkl_free(), matching the mkl_malloc() calls made
+   * in initialise(). */
+  void postCallKernelCleanup() override {
+    mkl_free_buffers();
+    mkl_free(A_);
+    mkl_free(B_);
+    mkl_free(C_);
+  }
+}  // namespace cpu
\ No newline at end of file
diff --git a/src/ b/src/
 #include "../include/main.hh"
 int iters = 10;
+int startDim = 1;
 int upperLimit = 128;
 bool sgemm = true;
 bool dgemm = true;
@@ -115,7 +116,7 @@ int parseInt(const char* str) {
   return strlen(next) ? -1 : value;
-void getParameters(int argc, char* argv[]) {
+void getParameters(int argc, char** argv) {
   for (int i = 1; i < argc; i++) {
     if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i")) {
       if (++i >= argc || (iters = parseInt(argv[i])) < 0) {

From be9094c3c28399ac44658d92941b4923323850f5 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Fri, 11 Oct 2024 15:32:57 +0100
Subject: [PATCH 15/38] rebasing

--- | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ b/
index 0ed7772..d323162 100644
--- a/
+++ b/
@@ -199,7 +199,7 @@
     plt.margins(x=0.01, y=0.01)
     leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
-    for obj in leg.legendHandles:
+    for obj in leg.legend_handles:

 cuBLAS/sp_gemm.hh              | 90 ++-------------------------------
 include/kernels/CPU/sp_gemm.hh | 72 ++------------------------
 include/kernels/gemm.hh        | 92 ++++++++++++++++++++++++++++++++++
 oneMKL/CPU/sp_gemm.hh          |  9 ++--
 4 files changed, 102 insertions(+), 161 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 0848bb6..992b018 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -18,6 +18,8 @@ template <typename T>
 class sp_gemm_gpu : public sp_gemm<T> {
   using sp_gemm<T>::sp_gemm;
+  using sp_gemm<T>::initInputMatricesSparse;
+  using sp_gemm<T>::toCSR;
   using sp_gemm<T>::n_;
   using sp_gemm<T>::A_;
   using sp_gemm<T>::B_;
@@ -55,8 +57,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
    // Work out number of edges needed to achieve target sparsity
-    int edges = 1 + (int) (n_ * n_ * (1 - sparsity));
-    A_nnz_ = B_nnz_ = edges;
+    A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity));
     if (offload_ == gpuOffloadType::unified) {
       cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_));
@@ -105,28 +106,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
 		// Set initial values to 0
     A_ = (T*)malloc(sizeof(T) * n_ * n_);
     B_ = (T*)malloc(sizeof(T) * n_ * n_);
-    for (int i = 0; i < (n_ * n_); i++) {
-      A_[i] = 0.0;
-      B_[i] = 0.0;
-    }
-    // Random number generator objects for use in descent
-    std::default_random_engine gen;
-    gen.seed(std::chrono::system_clock::now()
-                     .time_since_epoch().count());
-    std::uniform_real_distribution<double> dist(0.0, 1.0);
-    // Using a=0.45 and b=c=0.22 as default probabilities
-    for (int i = 0; i < A_nnz_; i++) {
-      while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1,
-                   0.45, 0.22, 0.22,
-                   &gen, dist, false)) {}
-    }
-    for (int i = 0; i < B_nnz_; i++) {
-      while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1,
-                   0.45, 0.22, 0.22,
-                   &gen, dist, false)) {}
-    }
+    initInputMatricesSparse(sparsity);
     toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_);
@@ -571,68 +551,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
-	bool rMat(T* M, int n, int x1, int x2, int y1, int y2,
-					        float a, float b, float c, std::default_random_engine* gen,
-					        std::uniform_real_distribution<double> dist, bool bin) {
-		// If a 1x1 submatrix, then add an edge and return out
-		if (x1 >= x2 && y1 >= y2) {
-			if (abs(M[(y1 * n) + x1]) > 0.1) {
-				return false;
-			} else {
-				// Add 1.0 if this is a binary graph, and a random real number otherwise
-				M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
-								100.0) - 50.0);
-				return true;
-			}
-		} else {
-			// Divide up the matrix
-			int xMidPoint = x1 + floor((x2 - x1) / 2);
-			int yMidPoint = y1 + floor((y2 - y1) / 2);
-			// ToDo -- add some noise to these values between iterations
-			float newA = a;
-			float newB = b;
-			float newC = c;
-			// Work out which quarter to recurse into
-			// There are some ugly ternary operators here to avoid going out of bounds in the edge case
-			// that we are already at 1 width or 1 height
-			float randomNum = dist(*gen);
-			if (randomNum < a) {
-				return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
-				            newA, newB, newC, gen, dist, bin);
-			} else if (randomNum < (a + b)) {
-				return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
-				            newA, newB, newC, gen, dist, bin);
-			} else if (randomNum < (a + b + c)) {
-				return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
-				            newA, newB, newC, gen, dist, bin);
-			} else {
-				return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
-				            ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC,
-				            gen, dist, bin);
-			}
-		}
-		return true;
-	}
-	void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index,
-						 int* row_ptr) {
-		int nnz_encountered = 0;
-		for (int row = 0; row < n_row; row++) {
-			row_ptr[row] = nnz_encountered;
-			int nnz_row = 0;
-			for (int col = 0; col < n_col; col++) {
-				if (dense[(row * n_col) + col] != 0.0) {
-					nnz_row++;
-					col_index[nnz_encountered] = col;
-					vals[nnz_encountered] = dense[(row * n_col) + col];
-					nnz_encountered++;
-				}
-			}
-		}
-    row_ptr[n_row] = nnz_encountered;
-	}
   // ToDo -- the two following functons are useful for debugging.  I'm
diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
index 3de5ea5..6d9d011 100644
--- a/include/kernels/CPU/sp_gemm.hh
+++ b/include/kernels/CPU/sp_gemm.hh
@@ -11,6 +11,8 @@ namespace cpu {
 		class sp_gemm : public ::gemm<T> {
 				using ::gemm<T>::gemm;
+        using ::gemm<T>::initInputMatricesSparse;
+        using ::gemm<T>::toCSR;
 				using ::gemm<T>::m_;
 				using ::gemm<T>::n_;
 				using ::gemm<T>::k_;
@@ -27,78 +29,10 @@ namespace cpu {
 				B_ = (T*)malloc(sizeof(T) * n_ * n_);
 				C_ = (T*)malloc(sizeof(T) * n_ * n_);
-				// Set initial values to 0
-				for (int i = 0; i < (n_ * n_); i++) {
-					A_[i] = 0.0;
-					B_[i] = 0.0;
-				}
-				// Random number generator objects for use in descent
-				std::default_random_engine gen;
-				gen.seed(std::chrono::system_clock::now()
-								         .time_since_epoch().count());
-				std::uniform_real_distribution<double> dist(0.0, 1.0);
-				// Work out number of edges needed to achieve target sparsity
-				int edges = 1 + (int) (n * n * (1 - sparsity));
-				// Initialise the matrices
-				// Using a=0.45 and b=c=0.22 as default probabilities
-				for (int i = 0; i < edges; i++) {
-					while (!rMat(A_, n, 0, n - 1, 0, n - 1,
-					             0.45, 0.22, 0.22,
-					             &gen, dist, false)) {}
-					while (!rMat(B_, n, 0, n - 1, 0, n - 1,
-					             0.45, 0.22, 0.22,
-					             &gen, dist, false)) {}
-				}
+				initInputMatricesSparse(sparsity);
-				bool rMat(T* M, int n, int x1, int x2, int y1, int y2,
-					        float a, float b, float c, std::default_random_engine* gen,
-					        std::uniform_real_distribution<double> dist, bool bin) {
-					// If a 1x1 submatrix, then add an edge and return out
-					if (x1 >= x2 && y1 >= y2) {
-						if (abs(M[(y1 * n) + x1]) > 0.1) {
-							return false;
-						} else {
-							// Add 1.0 if this is a binary graph, and a random real number otherwise
-							M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
-											100.0) - 50.0);
-							return true;
-						}
-					} else {
-						// Divide up the matrix
-						int xMidPoint = x1 + floor((x2 - x1) / 2);
-						int yMidPoint = y1 + floor((y2 - y1) / 2);
-						// ToDo -- add some noise to these values between iterations
-						float newA = a;
-						float newB = b;
-						float newC = c;
-						// Work out which quarter to recurse into
-						// There are some ugly ternary operators here to avoid going out of bounds in the edge case
-						// that we are already at 1 width or 1 height
-						float randomNum = dist(*gen);
-						if (randomNum < a) {
-							return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
-							            newA, newB, newC, gen, dist, bin);
-						} else if (randomNum < (a + b)) {
-							return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
-							            newA, newB, newC, gen, dist, bin);
-						} else if (randomNum < (a + b + c)) {
-							return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
-							            newA, newB, newC, gen, dist, bin);
-						} else {
-							return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
-							            ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC,
-							            gen, dist, bin);
-						}
-					}
-					return true;
-				}
 				/** Do any necessary cleanup (free pointers, close library handles, etc.)
 				 * after Kernel has been called. */
 				void postCallKernelCleanup() {
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index 4eda90f..59a9898 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -4,6 +4,7 @@
 #include <chrono>
 #include <cmath>
 #include <limits>
+#include <random>
 #include "../utilities.hh"
@@ -86,9 +87,100 @@ class gemm {
+  /** Zero the dense n_ x n_ input matrices A_ and B_, then scatter non-zero
+   * entries into each of them using the recursive rMat() generator.
+   *
+   * @param sparsity Target fraction of zero entries. The number of entries
+   *                 requested per matrix is 1 + (int)(n_ * n_ * (1 - sparsity)),
+   *                 capped at the number of cells so that the fill loops
+   *                 always terminate (e.g. for sparsity <= 0 or n_ == 0). */
+  void initInputMatricesSparse(float sparsity) {
+    // Widen before multiplying so that n_ * n_ cannot overflow an int.
+    const long long cells = static_cast<long long>(n_) * n_;
+    for (long long i = 0; i < cells; i++) {
+      A_[i] = 0.0;
+      B_[i] = 0.0;
+    }
+    // Random number generator objects for use in descent
+    std::default_random_engine gen;
+    gen.seed(std::chrono::system_clock::now()
+                     .time_since_epoch().count());
+    std::uniform_real_distribution<double> dist(0.0, 1.0);
+    // Work out number of edges needed to achieve target sparsity
+    long long edges =
+        1 + static_cast<long long>(static_cast<float>(cells) * (1 - sparsity));
+    // A matrix cannot hold more entries than it has cells; without this cap
+    // the retry loops below would spin forever once every cell is occupied.
+    if (edges > cells) {
+      edges = cells;
+    }
+    // Using a=0.45 and b=c=0.22 as default probabilities
+    for (long long i = 0; i < edges; i++) {
+      // rMat() returns false when it lands on an occupied cell - retry
+      while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1,
+                   0.45, 0.22, 0.22,
+                   &gen, dist, false)) {}
+    }
+    for (long long i = 0; i < edges; i++) {
+      while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1,
+                   0.45, 0.22, 0.22,
+                   &gen, dist, false)) {}
+    }
+  }
   /** Call the extern consume() function. */
   void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }
+  /** Recursive function to populate sparse matrices.
+   *
+   * Repeatedly halves the sub-matrix [x1, x2] x [y1, y2] (bounds inclusive),
+   * descending into one quadrant chosen at random, until a single cell is
+   * reached; that cell M[(y * n) + x] then receives a new entry.
+   *
+   * @param M    Dense matrix being filled, indexed as M[(y * n) + x].
+   * @param n    Stride between consecutive y values in M.
+   * @param x1   Lowest x index of the current sub-matrix.
+   * @param x2   Highest x index of the current sub-matrix.
+   * @param y1   Lowest y index of the current sub-matrix.
+   * @param y2   Highest y index of the current sub-matrix.
+   * @param a    Probability of the low-x / low-y quadrant.
+   * @param b    Probability of the high-x / low-y quadrant.
+   * @param c    Probability of the low-x / high-y quadrant; the remaining
+   *             probability selects the high-x / high-y quadrant.
+   * @param gen  Random engine used to pick the quadrant.
+   * @param dist Distribution sampled for the quadrant choice (the visible
+   *             caller passes a uniform distribution over [0, 1)).
+   * @param bin  When true the entry written is 1.0, otherwise a random value
+   *             drawn from [-50.0, 49.99] in steps of 0.01.
+   * @return true if a new entry was written, false if the selected cell
+   *         already held an entry (magnitude greater than 0.1). */
+  bool rMat(T* M, int n, int x1, int x2, int y1, int y2,
+            float a, float b, float c, std::default_random_engine* gen,
+            std::uniform_real_distribution<double> dist, bool bin) {
+    // If a 1x1 submatrix, then add an edge and return out
+    if (x1 >= x2 && y1 >= y2) {
+      T& cell = M[(y1 * n) + x1];
+      // std::abs picks the floating-point overload; a plain abs() can bind to
+      // the integer version and truncate the value before the comparison.
+      if (std::abs(cell) > 0.1) {
+        return false;
+      }
+      if (bin) {
+        // Binary graph: every edge has weight 1.0
+        cell = 1.0;
+      } else {
+        // Redraw values that the occupancy test above would treat as empty,
+        // so that every successful call adds exactly one detectable entry.
+        double value;
+        do {
+          value = ((rand() % 10000) / 100.0) - 50.0;
+        } while (std::abs(value) <= 0.1);
+        cell = static_cast<T>(value);
+      }
+      return true;
+    }
+    // Divide up the matrix (integer division already rounds down here)
+    const int xMidPoint = x1 + (x2 - x1) / 2;
+    const int yMidPoint = y1 + (y2 - y1) / 2;
+    // Upper halves start one past the midpoint, unless the range is already
+    // a single index wide, to avoid stepping out of bounds.
+    const int xUpperStart = (xMidPoint < x2) ? xMidPoint + 1 : xMidPoint;
+    const int yUpperStart = (yMidPoint < y2) ? yMidPoint + 1 : yMidPoint;
+    // ToDo -- add some noise to a, b and c between iterations
+    // Work out which quarter to recurse into
+    const float randomNum = dist(*gen);
+    if (randomNum < a) {
+      return rMat(M, n, x1, xMidPoint, y1, yMidPoint, a, b, c, gen, dist, bin);
+    }
+    if (randomNum < (a + b)) {
+      return rMat(M, n, xUpperStart, x2, y1, yMidPoint, a, b, c, gen, dist,
+                  bin);
+    }
+    if (randomNum < (a + b + c)) {
+      return rMat(M, n, x1, xMidPoint, yUpperStart, y2, a, b, c, gen, dist,
+                  bin);
+    }
+    return rMat(M, n, xUpperStart, x2, yUpperStart, y2, a, b, c, gen, dist,
+                bin);
+  }
+  /** Convert a dense, row-major matrix into CSR arrays.
+   *
+   * @param dense     Dense input, read as dense[(row * n_col) + col].
+   * @param n_col     Number of columns in dense.
+   * @param n_row     Number of rows in dense.
+   * @param nnz       Number of entries vals and col_index can hold; non-zeros
+   *                  beyond this count are not written.
+   * @param vals      Output: the non-zero values in row order.
+   * @param col_index Output: the column index of each stored value.
+   * @param row_ptr   Output: n_row + 1 offsets; row r occupies
+   *                  [row_ptr[r], row_ptr[r + 1]) in vals / col_index. */
+  void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index,
+             int* row_ptr) {
+    int nnz_encountered = 0;
+    for (int row = 0; row < n_row; row++) {
+      row_ptr[row] = nnz_encountered;
+      for (int col = 0; col < n_col; col++) {
+        const T value = dense[(row * n_col) + col];
+        // Bound the writes by nnz so the output arrays are never overrun.
+        if (value != 0.0 && nnz_encountered < nnz) {
+          col_index[nnz_encountered] = col;
+          vals[nnz_encountered] = value;
+          nnz_encountered++;
+        }
+      }
+    }
+    row_ptr[n_row] = nnz_encountered;
+  }
   /** The number of iterations to perform per problem size. */
   const int iterations_;
diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh
index 847006b..5ac6a70 100644
--- a/oneMKL/CPU/sp_gemm.hh
+++ b/oneMKL/CPU/sp_gemm.hh
@@ -14,20 +14,17 @@ template <typename T>
 class sp_gemm_cpu : public sp_gemm<T> {
   using sp_gemm<T>::sp_gemm;
-  using sp_gemm<T>::initInputMatrices;
+  using sp_gemm<T>::initInputMatricesSparse;
+  using sp_gemm<T>::toCSR;
   using sp_gemm<T>::callConsume;
-  using sp_gemm<T>::m_;
   using sp_gemm<T>::n_;
-  using sp_gemm<T>::k_;
   using sp_gemm<T>::A_;
   using sp_gemm<T>::B_;
   using sp_gemm<T>::C_;
   /** Initialise the required data structures. */
-  void initialise(int m, int n, int k) {
-    m_ = m;
+  void initialise(int n, float sparsity) {
     n_ = n;
-    k_ = k;
     A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64);
     B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64);

 cuBLAS/sp_gemm.hh       | 17 +++++++--
 include/doGemm.hh       | 82 +++++++++++++++++++++++------------------
 include/kernels/gemm.hh | 49 +++++++++---------------
 src/             |  4 +-
 4 files changed, 80 insertions(+), 72 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 992b018..aa095f8 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -36,6 +36,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
    *  - Unified: Initialise data as unified memory; no data movement semantics
    *             required */
   void initialise(gpuOffloadType offload, int n, float sparsity) override {
+    std::cout << "___________Initialising, problem size = " << n << std::endl;
     offload_ = offload;
     if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F;
@@ -46,9 +47,11 @@ class sp_gemm_gpu : public sp_gemm<T> {
     n_ = n * 20;
+    std::cout << "\tGetting device" << std::endl;
     // Get device identifier
+    std::cout << "\tMaking streams" << std::endl;
     // Initialise 3 streams to asynchronously move data between host and device
@@ -59,6 +62,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
    // Work out number of edges needed to achieve target sparsity
     A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity));
+    std::cout << "\tMallocing" << std::endl;
     if (offload_ == gpuOffloadType::unified) {
       cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_));
       cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_));
@@ -106,8 +110,11 @@ class sp_gemm_gpu : public sp_gemm<T> {
 		// Set initial values to 0
     A_ = (T*)malloc(sizeof(T) * n_ * n_);
     B_ = (T*)malloc(sizeof(T) * n_ * n_);
+    std::cout << "\tInitialising start matrices" << std::endl;
+    std::cout << "\tConverting to CSR" << std::endl;
     toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_);
     toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_);
@@ -132,7 +139,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
   void preLoopRequirements() override {
+    std::cout << "\t\tpre loop" << std::endl;
     switch(offload_) {
       case gpuOffloadType::always: {
         // Make matrix descriptors
@@ -217,6 +224,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Make a call to the BLAS Library Kernel. */
   void callGemm() override {
+    std::cout << "\t\tGEMM" << std::endl;
     switch(offload_) {
       case gpuOffloadType::always: {
         cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) *
@@ -444,6 +452,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Perform any required steps after calling the GEMM kernel that should
    * be timed. */
   void postLoopRequirements() override {
+    std::cout << "\t\tpost loop" << std::endl;
     // Destroying descriptors
@@ -511,6 +520,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Do any necessary cleanup (free pointers, close library handles, etc.)
    * after Kernel has been called. */
   void postCallKernelCleanup() override {
+    std::cout << "\t\tcleaning up" << std::endl;
     // Destroy the handle
@@ -519,6 +529,9 @@ class sp_gemm_gpu : public sp_gemm<T> {
+    free(A_);
+    free(B_);
     if (offload_ == gpuOffloadType::unified) {
@@ -551,8 +564,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
   // ToDo -- the two following functons are useful for debugging.  I'm
   //  keeping them in to that end, though they are not used by the benchmark
   //  itself
diff --git a/include/doGemm.hh b/include/doGemm.hh
index 8153651..f4ec053 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -34,13 +34,16 @@ class doGemm {
   doGemm(const std::string csvDir, const int iters, const int startDim,
          const int upperLimit, const bool cpuEnabled = true,
-         const bool gpuEnabled = true)
+         const bool gpuEnabled = true, const bool doDense = true,
+         const bool doSparse = true)
       : CSV_DIR(csvDir),
-        doGPU_(gpuEnabled)
+        doGPU_(gpuEnabled),
+        doDense_(dense),
+        doSparse_(sparse),
@@ -59,27 +62,28 @@ class doGemm {
   /** Run all problem types and write data to CSV files. */
   void collectData() {
-    // Square Problem Sizes...
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                                        "_square_square_M=N=K.csv");
-    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-      // M = dim, N = dim, K = dim;
-      callDenseKernels(csvFile, dim, dim, dim);
-    }
-    // Close file
-    csvFile.close();
+    if (doDense_) {
+      // Square Problem Sizes...
+      // Re-initialise offload threshold structures
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                                          "_square_square_M=N=K.csv");
+      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+        // M = dim, N = dim, K = dim;
+        callDenseKernels(csvFile, dim, dim, dim);
+      }
+      // Close file
+      csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Square x Square (M=N=K)");
-    }
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Square x Square (M=N=K)");
+      }
     // Rectangular Problem Sizes:
@@ -267,6 +271,7 @@ class doGemm {
     if (doCPU_ && doGPU_) {
       // Print offload results to stdout
       printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)");
+    }
     // Square x Short and Wide
     // Re-initialise offload threshold structures & previous results
@@ -292,27 +297,28 @@ class doGemm {
       printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)");
+    }
-// Square sparse matrix - sparse matrix multiplication
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() +
-                          "_sparse_square.csv");
-    if (upperLimit_ >= 32) {
-      for (int dim = 1; dim <= upperLimit_; dim++) {
-        const int N = dim;
-        callSparseKernels(csvFile, N, 0.99);
+    if (doSparse_) {    // Square sparse matrix - sparse matrix multiplication
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() +
+                            "_sparse_square.csv");
+      if (upperLimit_ >= 32) {
+        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+          callSparseKernels(csvFile, dim, 0.99);
+        }
-    }
-    // Close file
-    csvFile.close();
+      // Close file
+      csvFile.close();
-    if (doCPU_ && dpGPU_) {
+    if (doCPU_ && doGPU_) {
       // Print offload results to stdout
 	    printOffloadThreshold("Sparse Square");
+    }
@@ -693,6 +699,10 @@ class doGemm {
   /** Whether the GPU kernels should be run. */
   const bool doGPU_ = true;
+  /** Whether we should run dense and/or sparse kernels */
+  const bool doDense_;
+  const bool doSparse_;
   /** The GEMM CPU kernel. */
   cpu::gemm_cpu<T> gemmCpu_;
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index 59a9898..3ffc0d7 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -103,14 +103,8 @@ class gemm {
     // Using a=0.45 and b=c=0.22 as default probabilities
     for (int i = 0; i < edges; i++) {
-      while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1,
-                   0.45, 0.22, 0.22,
-                   &gen, dist, false)) {}
-    }
-    for (int i = 0; i < edges; i++) {
-      while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1,
-                   0.45, 0.22, 0.22,
-                   &gen, dist, false)) {}
+      rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false);
+      rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false);
@@ -118,23 +112,18 @@ class gemm {
   void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }
   /** Recursive function to populate sparse matrices */
-  bool rMat(T* M, int n, int x1, int x2, int y1, int y2,
-            float a, float b, float c, std::default_random_engine* gen,
+  void rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b,
+            float c, std::default_random_engine* gen,
             std::uniform_real_distribution<double> dist, bool bin) {
     // If a 1x1 submatrix, then add an edge and return out
     if (x1 >= x2 && y1 >= y2) {
-      if (abs(M[(y1 * n) + x1]) > 0.1) {
-        return false;
-      } else {
-        // Add 1.0 if this is a binary graph, and a random real number otherwise
-        M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
+      M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
                                                  100.0) - 50.0);
-        return true;
-      }
+      return;
     } else {
       // Divide up the matrix
-      int xMidPoint = x1 + floor((x2 - x1) / 2);
-      int yMidPoint = y1 + floor((y2 - y1) / 2);
+      int xMidPoint = (x1 == x2) ? x1 : x1 + floor((x2 - x1) / 2);
+      int yMidPoint = (y1 == y2) ? y1 : y1 + floor((y2 - y1) / 2);
       // ToDo -- add some noise to these values between iterations
       float newA = a;
@@ -142,25 +131,23 @@ class gemm {
       float newC = c;
       // Work out which quarter to recurse into
-      // There are some ugly ternary operators here to avoid going out of bounds in the edge case
-      // that we are already at 1 width or 1 height
+      // There are some ugly ternary operators here to avoid going out of
+      // bounds in the edge case that we are already at 1 width or 1 height
       float randomNum = dist(*gen);
       if (randomNum < a) {
-        return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
-                    newA, newB, newC, gen, dist, bin);
+        rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, gen, dist,
+             bin);
       } else if (randomNum < (a + b)) {
-        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
-                    newA, newB, newC, gen, dist, bin);
+        rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, gen, dist,
+             bin);
       } else if (randomNum < (a + b + c)) {
-        return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
-                    newA, newB, newC, gen, dist, bin);
+        rMat(M, n, x1, xMidPoint,  yMidPoint, y2, newA, newB, newC, gen,
+             dist, bin);
       } else {
-        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
-                    ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC,
-                    gen, dist, bin);
+        rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA,  newB, newC, gen,
+             dist, bin);
-    return true;
   void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index,
diff --git a/src/ b/src/
index a4eb55b..268b628 100644
--- a/src/
+++ b/src/
@@ -37,14 +37,14 @@ int main(int argc, char** argv) {
   // SGEMM Comparison
   std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl;
   doGemm<float> sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                      doGpu);
+                      doGpu, sgemm, sp_sgemm);
   std::cout << "Finished!" << std::endl;
   // DGEMM Comparison
   std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl;
   doGemm<double> dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                       doGpu);
+                       doGpu, dgemm, sp_dgemm);
   std::cout << "Finished!" << std::endl;

 cuBLAS/sp_gemm.hh       | 16 +++-------------
 include/doGemm.hh       |  4 ++--
 include/kernels/gemm.hh | 34 ++++++++++++++++++++--------------
 src/             | 32 ++++++++++++++++++--------------
 4 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index aa095f8..2c787d9 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -36,7 +36,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
    *  - Unified: Initialise data as unified memory; no data movement semantics
    *             required */
   void initialise(gpuOffloadType offload, int n, float sparsity) override {
-    std::cout << "___________Initialising, problem size = " << n << std::endl;
     offload_ = offload;
     if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F;
@@ -45,13 +44,11 @@ class sp_gemm_gpu : public sp_gemm<T> {
       std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
-    n_ = n * 20;
+    n_ = n;
-    std::cout << "\tGetting device" << std::endl;
     // Get device identifier
-    std::cout << "\tMaking streams" << std::endl;
     // Initialise 3 streams to asynchronously move data between host and device
@@ -62,7 +59,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
    // Work out number of edges needed to achieve target sparsity
     A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity));
-    std::cout << "\tMallocing" << std::endl;
     if (offload_ == gpuOffloadType::unified) {
       cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_));
       cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_));
@@ -111,13 +107,11 @@ class sp_gemm_gpu : public sp_gemm<T> {
     A_ = (T*)malloc(sizeof(T) * n_ * n_);
     B_ = (T*)malloc(sizeof(T) * n_ * n_);
-    std::cout << "\tInitialising start matrices" << std::endl;
-    std::cout << "\tConverting to CSR" << std::endl;
-    toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_);
+    toCSR(A_, n_, n_, A_val_, A_col_, A_row_);
-    toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_);
+    toCSR(B_, n_, n_, B_val_, B_col_, B_row_);
 //    std::cout << "_____Matrix A_____" << std::endl;
@@ -139,7 +133,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
   void preLoopRequirements() override {
-    std::cout << "\t\tpre loop" << std::endl;
     switch(offload_) {
       case gpuOffloadType::always: {
         // Make matrix descriptors
@@ -224,7 +217,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Make a call to the BLAS Library Kernel. */
   void callGemm() override {
-    std::cout << "\t\tGEMM" << std::endl;
     switch(offload_) {
       case gpuOffloadType::always: {
         cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) *
@@ -452,7 +444,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Perform any required steps after calling the GEMM kernel that should
    * be timed. */
   void postLoopRequirements() override {
-    std::cout << "\t\tpost loop" << std::endl;
     // Destroying descriptors
@@ -520,7 +511,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Do any necessary cleanup (free pointers, close library handles, etc.)
    * after Kernel has been called. */
   void postCallKernelCleanup() override {
-    std::cout << "\t\tcleaning up" << std::endl;
     // Destroy the handle
diff --git a/include/doGemm.hh b/include/doGemm.hh
index f4ec053..53bbb54 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -42,8 +42,8 @@ class doGemm {
-        doDense_(dense),
-        doSparse_(sparse),
+        doDense_(doDense),
+        doSparse_(doSparse)
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index 3ffc0d7..230c7d3 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -103,8 +103,10 @@ class gemm {
     // Using a=0.45 and b=c=0.22 as default probabilities
     for (int i = 0; i < edges; i++) {
-      rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false);
-      rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false);
+      while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist,
+              false)) {}
+      while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist,
+              false)){}
@@ -112,14 +114,18 @@ class gemm {
   void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }
   /** Recursive function to populate sparse matrices */
-  void rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b,
+  bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b,
             float c, std::default_random_engine* gen,
             std::uniform_real_distribution<double> dist, bool bin) {
     // If a 1x1 submatrix, then add an edge and return out
     if (x1 >= x2 && y1 >= y2) {
-      M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
+      if (M[(int) (y1 * n) + x1] == 0) {
+        M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
                                                  100.0) - 50.0);
-      return;
+        return true;
+      } else {
+        return false;
+      }
     } else {
       // Divide up the matrix
       int xMidPoint = (x1 == x2) ? x1 : x1 + floor((x2 - x1) / 2);
@@ -135,22 +141,22 @@ class gemm {
       // bounds in the edge case that we are already at 1 width or 1 height
       float randomNum = dist(*gen);
       if (randomNum < a) {
-        rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, gen, dist,
-             bin);
+        return rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC,
+                    gen, dist, bin);
       } else if (randomNum < (a + b)) {
-        rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, gen, dist,
-             bin);
+        return rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC,
+                    gen, dist, bin);
       } else if (randomNum < (a + b + c)) {
-        rMat(M, n, x1, xMidPoint,  yMidPoint, y2, newA, newB, newC, gen,
-             dist, bin);
+        return rMat(M, n, x1, xMidPoint,  yMidPoint, y2, newA, newB, newC, gen,
+                    dist, bin);
       } else {
-        rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA,  newB, newC, gen,
-             dist, bin);
+        return rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA,  newB, newC,
+                    gen, dist, bin);
-  void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index,
+  void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index,
              int* row_ptr) {
     int nnz_encountered = 0;
     for (int row = 0; row < n_row; row++) {
diff --git a/src/ b/src/
index 268b628..06fd48e 100644
--- a/src/
+++ b/src/
@@ -3,10 +3,10 @@
 int iters = 10;
 int startDim = 1;
 int upperLimit = 128;
-bool sgemm = true;
-bool dgemm = true;
-bool sp_sgemm = true;
-bool sp_dgemm = true;
+bool doSgemm = true;
+bool doDgemm = true;
+bool doSp_sgemm = true;
+bool doSp_dgemm = true;
 bool doCpu = CPU_ENABLED;
 bool doGpu = GPU_ENABLED;
@@ -37,14 +37,14 @@ int main(int argc, char** argv) {
   // SGEMM Comparison
   std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl;
   doGemm<float> sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                      doGpu, sgemm, sp_sgemm);
+                      doGpu, doSgemm, doSp_sgemm);
   std::cout << "Finished!" << std::endl;
   // DGEMM Comparison
   std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl;
   doGemm<double> dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                       doGpu, dgemm, sp_dgemm);
+                       doGpu, doDgemm, doSp_dgemm);
   std::cout << "Finished!" << std::endl;
@@ -146,28 +146,28 @@ void getParameters(int argc, char** argv) {
     } else if (!strcmp(argv[i], "--no_gpu")) {
       doGpu = false;
     } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) {
-	    sgemm = dgemm = sp_sgemm = sp_dgemm = false;
+	    doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false;
 	    std::string kernelList = argv[++i];
 	    if (kernelList.find("sp-sgemm") != std::string::npos) {
-		    sp_sgemm = true;
+		    doSp_sgemm = true;
 		    if (kernelList.find("sgemm") != std::string::npos &&
 						kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) {
-			    sgemm = true;
+			    doSgemm = true;
 	    } else if (kernelList.find("sgemm") != std::string::npos) {
-			    sgemm = true;
+			    doSgemm = true;
 	    if (kernelList.find("sp-dgemm") != std::string::npos) {
-		    sp_dgemm = true;
+		    doSp_dgemm = true;
 		    if (kernelList.find("dgemm") != std::string::npos &&
 		        kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) {
-			    dgemm = true;
+			    doDgemm = true;
 	    } else if (kernelList.find("dgemm") != std::string::npos) {
-		    dgemm = true;
+		    doDgemm = true;
-	    if (!sgemm && !dgemm && !sp_sgemm && !sp_dgemm) {
+	    if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) {
 		    std::cout << "ERROR - no implemented kernels in list" << std::endl;
@@ -200,6 +200,10 @@ void getParameters(int argc, char** argv) {
       std::cout << "  -d  --dimension_limit D      Max value of M, N, K is D "
                    "(default: "
                 << upperLimit << ")" << std::endl;
+      std::cout << "  -k  --kernels <kernels>      Comma-separated list of "
+                   "kernels to be run.  Options are sgemm, dgemm, sp-sgemm, "
+                   "sp-dgemm (default: sgemm,dgemm,sp-gemm,sp-dgemm)" <<
+                   std::endl;
       std::cout << std::endl;
     } else {

 cuBLAS/sp_gemm.hh       | 27 +++++++++++++++++++--------
 include/doGemm.hh       |  2 +-
 include/kernels/gemm.hh | 38 +++++++++++++++++++++-----------------
 3 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 2c787d9..8bed12b 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -26,7 +26,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
   using sp_gemm<T>::C_;
   using sp_gemm<T>::offload_;
-	// ToDo -- No checksum for sparse yet.  Nedd to do
+	// ToDo -- No checksum for sparse yet.  Need to do
   /** Initialise the required data structures.
    * `offload` refers to the data offload type:
@@ -44,7 +44,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
       std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
-    n_ = n;
+    n_ = 100 * n;
     // Get device identifier
@@ -133,6 +133,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
   void preLoopRequirements() override {
+    cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_));
     switch(offload_) {
       case gpuOffloadType::always: {
         // Make matrix descriptors
@@ -212,13 +213,17 @@ class sp_gemm_gpu : public sp_gemm<T> {
-    cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_));
   /** Make a call to the BLAS Library Kernel. */
   void callGemm() override {
     switch(offload_) {
       case gpuOffloadType::always: {
+        if (C_mem_allocated_always_) {
+          cusparseCheckError(cusparseDestroySpMat(descrA_));
+          cusparseCheckError(cusparseDestroySpMat(descrB_));
+          cusparseCheckError(cusparseDestroySpMat(descrC_));
+        }
         cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) *
         A_nnz_, cudaMemcpyHostToDevice, s1_));
         cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) *
@@ -235,6 +240,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
         cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_
         + 1), cudaMemcpyHostToDevice, s3_));
+        cudaCheckError(cudaDeviceSynchronize());
         // Make matrix descriptors
@@ -444,10 +450,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
   /** Perform any required steps after calling the GEMM kernel that should
    * be timed. */
   void postLoopRequirements() override {
-    cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_));
-    // Destroying descriptors
-    cusparseCheckError(cusparseDestroySpMat(descrA_));
-    cusparseCheckError(cusparseDestroySpMat(descrB_));
     switch(offload_) {
       case gpuOffloadType::always: {
@@ -476,10 +478,14 @@ class sp_gemm_gpu : public sp_gemm<T> {
         cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) *
         (n_ + 1), cudaMemcpyDeviceToHost, s3_));
+        cusparseCheckError(cusparseDestroySpMat(descrA_));
+        cusparseCheckError(cusparseDestroySpMat(descrB_));
+        cusparseCheckError(cusparseDestroySpMat(descrC_));
       case gpuOffloadType::unified: {
-        cusparseCheckError(cusparseDestroySpMat(descrC_));
         // Ensure all data resides on host once work has completed
         cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_,
                                             cudaCpuDeviceId, s1_));
@@ -503,9 +509,14 @@ class sp_gemm_gpu : public sp_gemm<T> {
                                             cudaCpuDeviceId, s3_));
         // Ensure device has finished all work.
+        cusparseCheckError(cusparseDestroySpMat(descrA_));
+        cusparseCheckError(cusparseDestroySpMat(descrB_));
+        cusparseCheckError(cusparseDestroySpMat(descrC_));
+    cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_));
   /** Do any necessary cleanup (free pointers, close library handles, etc.)
diff --git a/include/doGemm.hh b/include/doGemm.hh
index 53bbb54..b89abee 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -307,7 +307,7 @@ class doGemm {
       if (upperLimit_ >= 32) {
         for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-          callSparseKernels(csvFile, dim, 0.99);
+          callSparseKernels(csvFile, dim, 0.9999);
       // Close file
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index 230c7d3..2a971a0 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -106,7 +106,7 @@ class gemm {
       while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist,
               false)) {}
       while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist,
-              false)){}
+              false)) {}
@@ -119,17 +119,19 @@ class gemm {
             std::uniform_real_distribution<double> dist, bool bin) {
     // If a 1x1 submatrix, then add an edge and return out
     if (x1 >= x2 && y1 >= y2) {
-      if (M[(int) (y1 * n) + x1] == 0) {
-        M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) /
-                                                 100.0) - 50.0);
-        return true;
-      } else {
+      // Needed to avoid overflow segfaults with large problem sizes
+      uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1);
+      if (abs(M[index]) > 0.1) {
         return false;
+      } else {
+        // Add 1.0 if this is a binary graph, and a random real number otherwise
+        M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0);
+        return true;
     } else {
       // Divide up the matrix
-      int xMidPoint = (x1 == x2) ? x1 : x1 + floor((x2 - x1) / 2);
-      int yMidPoint = (y1 == y2) ? y1 : y1 + floor((y2 - y1) / 2);
+      int xMidPoint = x1 + floor((x2 - x1) / 2);
+      int yMidPoint = y1 + floor((y2 - y1) / 2);
       // ToDo -- add some noise to these values between iterations
       float newA = a;
@@ -137,23 +139,25 @@ class gemm {
       float newC = c;
       // Work out which quarter to recurse into
-      // There are some ugly ternary operators here to avoid going out of
-      // bounds in the edge case that we are already at 1 width or 1 height
+      // There are some ugly ternary operators here to avoid going out of bounds in the edge case
+      // that we are already at 1 width or 1 height
       float randomNum = dist(*gen);
       if (randomNum < a) {
-        return rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC,
-                    gen, dist, bin);
+        return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
+                    newA, newB, newC, gen, dist, bin);
       } else if (randomNum < (a + b)) {
-        return rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC,
-                    gen, dist, bin);
+        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
+                    newA, newB, newC, gen, dist, bin);
       } else if (randomNum < (a + b + c)) {
-        return rMat(M, n, x1, xMidPoint,  yMidPoint, y2, newA, newB, newC, gen,
-                    dist, bin);
+        return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
+                    newA, newB, newC, gen, dist, bin);
       } else {
-        return rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA,  newB, newC,
+        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
+                    ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC,
                     gen, dist, bin);
+    return true;
   void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index,

 src/ | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/ b/src/
index 06fd48e..51d1cf1 100644
--- a/src/
+++ b/src/
@@ -146,26 +146,26 @@ void getParameters(int argc, char** argv) {
     } else if (!strcmp(argv[i], "--no_gpu")) {
       doGpu = false;
     } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) {
-	    doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false;
-	    std::string kernelList = argv[++i];
-	    if (kernelList.find("sp-sgemm") != std::string::npos) {
-		    doSp_sgemm = true;
-		    if (kernelList.find("sgemm") != std::string::npos &&
-						kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) {
-			    doSgemm = true;
-		    }
-	    } else if (kernelList.find("sgemm") != std::string::npos) {
-			    doSgemm = true;
-			}
-	    if (kernelList.find("sp-dgemm") != std::string::npos) {
-		    doSp_dgemm = true;
-		    if (kernelList.find("dgemm") != std::string::npos &&
-		        kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) {
-			    doDgemm = true;
-		    }
-	    } else if (kernelList.find("dgemm") != std::string::npos) {
-		    doDgemm = true;
-	    }
+      doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false;
+      std::string kernelList = argv[++i];
+      if (kernelList.find("sp-sgemm") != std::string::npos) {
+        doSp_sgemm = true;
+        if (kernelList.find("sgemm") != std::string::npos &&
+            kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) {
+          doSgemm = true;
+        }
+      } else if (kernelList.find("sgemm") != std::string::npos) {
+        doSgemm = true;
+      }
+      if (kernelList.find("sp-dgemm") != std::string::npos) {
+        doSp_dgemm = true;
+        if (kernelList.find("dgemm") != std::string::npos &&
+            kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) {
+          doDgemm = true;
+        }
+      } else if (kernelList.find("dgemm") != std::string::npos) {
+        doDgemm = true;
+      }
 	    if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) {
 		    std::cout << "ERROR - no implemented kernels in list" << std::endl;

--- | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ b/
index 38c2646..43028c0 100644
--- a/
+++ b/
@@ -165,7 +165,7 @@ def printResults(once:offloadThreshold, always:offloadThreshold, unified:offload
             gpuAlways.M = 0
             gpuAlways.N = 0
             gpuAlways.K = 0
-    if(gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])):
+    if("gemm" in kernel and gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])):
         # Do check to see if this is a momentary drop that we should ignore
         if (prevGpuUgflops <= float(cpu[8])) and  (float(gpuLines[2].split(',')[8]) <= float(cpu[8])):
             gpuUnified.cpuGflops = 0.0

 AOCL/sp_gemm.hh                |  62 ++++++++++
 cuBLAS/common.hh               |  53 +++++++--
 cuBLAS/sp_gemm.hh              |   4 +-
 include/doGemm.hh              |   4 +-
 include/kernels/CPU/sp_gemm.hh |   3 +-
 include/kernels/gemm.hh        |  25 +++-
 oneMKL/CPU/sp_gemm.hh          | 201 +++++++++++++++++++++++++++++----
 7 files changed, 320 insertions(+), 32 deletions(-)
 create mode 100644 AOCL/sp_gemm.hh

diff --git a/AOCL/sp_gemm.hh b/AOCL/sp_gemm.hh
new file mode 100644
index 0000000..3c6b5c0
--- /dev/null
+++ b/AOCL/sp_gemm.hh
@@ -0,0 +1,62 @@
+#pragma once
+#ifdef CPU_AOCL
+#include <blis.h>
+#include "../include/kernels/CPU/gemm.hh"
+#include "../include/utilities.hh"
+namespace cpu {
+/** A class for GEMM CPU BLAS kernels.
+ *
+ * callGemm() forwards to the BLIS typed API (bli_sgemm / bli_dgemm), passing
+ * `rowStride` as the row stride of A_, B_ and C_ and max(1, m_) / max(1, k_)
+ * as their column strides.
+ *
+ * NOTE(review): this header is added as AOCL/sp_gemm.hh, yet it still declares
+ * the dense `gemm_cpu` class and includes kernels/CPU/gemm.hh. It looks like a
+ * placeholder copy of the dense AOCL kernel -- confirm it is never included
+ * alongside AOCL/gemm.hh in one translation unit. */
+template <typename T>
+class gemm_cpu : public gemm<T> {
+ public:
+  using gemm<T>::gemm;
+  using gemm<T>::callConsume;
+  using gemm<T>::m_;
+  using gemm<T>::n_;
+  using gemm<T>::k_;
+  using gemm<T>::A_;
+  using gemm<T>::B_;
+  using gemm<T>::C_;
+ private:
+  /** Make call to the GEMM kernel.
+   * T is resolved at compile time: float calls bli_sgemm, double calls
+   * bli_dgemm, and any other type prints an error and exits with status 1. */
+  void callGemm() override {
+    if constexpr (std::is_same_v<T, float>) {
+      bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_,
+                rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_),
+                &beta, C_, rowStride, std::max(1, m_));
+    } else if constexpr (std::is_same_v<T, double>) {
+      bli_dgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_,
+                rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_),
+                &beta, C_, rowStride, std::max(1, m_));
+    } else {
+      // Un-specialised class will not do any work - print error and exit.
+      std::cout << "ERROR - Datatype for AOCL CPU GEMM kernel not supported."
+                << std::endl;
+      exit(1);
+    }
+    // Ensure compiler doesn't optimise away the work being done
+    callConsume();
+  }
+  /** Perform any required steps before calling the GEMM kernel that should
+   * be timed. */
+  void preLoopRequirements() override {}
+  /** Perform any required steps after calling the GEMM kernel that should
+   * be timed. */
+  void postLoopRequirements() override {}
+  /** The constant value Alpha. */
+  T alpha = ALPHA;
+  /** The constant value Beta. */
+  T beta = BETA;
+  /** The distance in elements to the next column. */
+  const int rowStride = 1;
+}  // namespace cpu
\ No newline at end of file
diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh
index 70d58fb..c8086db 100644
--- a/cuBLAS/common.hh
+++ b/cuBLAS/common.hh
@@ -16,13 +16,52 @@
   } while (false)
 /** Macro function to check if error occurred when calling cuBLAS. */
-#define cublasCheckError(f)                                                \
-  do {                                                                     \
-    if (cublasStatus_t e = (f); e != CUBLAS_STATUS_SUCCESS) {              \
-      std::cout << "CUBLAS error: " << __FILE__ << ":" << __LINE__ << ": " \
-                << cublasGetStatusString(e) << std::endl;                  \
-      exit(1);                                                             \
-    }                                                                      \
+#define cublasCheckError(f)                                              \
+  do {                                                                   \
+    /* Evaluate the status expression exactly once. */                   \
+    const cublasStatus_t cublasCheckStatus_ = (f);                       \
+    if (cublasCheckStatus_ != CUBLAS_STATUS_SUCCESS) {                   \
+      /* Fallback text for any status that has no case below. */         \
+      const char* cublasCheckName_ =                                     \
+          "other error not in switch statement";                         \
+      switch (cublasCheckStatus_) {                                      \
+        case CUBLAS_STATUS_NOT_INITIALIZED:                              \
+          cublasCheckName_ = "CUBLAS_STATUS_NOT_INITIALIZED";            \
+          break;                                                         \
+        case CUBLAS_STATUS_ALLOC_FAILED:                                 \
+          cublasCheckName_ = "CUBLAS_STATUS_ALLOC_FAILED";               \
+          break;                                                         \
+        case CUBLAS_STATUS_INVALID_VALUE:                                \
+          cublasCheckName_ = "CUBLAS_STATUS_INVALID_VALUE";              \
+          break;                                                         \
+        case CUBLAS_STATUS_ARCH_MISMATCH:                                \
+          cublasCheckName_ = "CUBLAS_STATUS_ARCH_MISMATCH";              \
+          break;                                                         \
+        case CUBLAS_STATUS_MAPPING_ERROR:                                \
+          cublasCheckName_ = "CUBLAS_STATUS_MAPPING_ERROR";              \
+          break;                                                         \
+        case CUBLAS_STATUS_EXECUTION_FAILED:                             \
+          cublasCheckName_ = "CUBLAS_STATUS_EXECUTION_FAILED";           \
+          break;                                                         \
+        case CUBLAS_STATUS_INTERNAL_ERROR:                               \
+          cublasCheckName_ = "CUBLAS_STATUS_INTERNAL_ERROR";             \
+          break;                                                         \
+        case CUBLAS_STATUS_NOT_SUPPORTED:                                \
+          cublasCheckName_ = "CUBLAS_STATUS_NOT_SUPPORTED";              \
+          break;                                                         \
+        case CUBLAS_STATUS_LICENSE_ERROR:                                \
+          cublasCheckName_ = "CUBLAS_STATUS_LICENSE_ERROR";              \
+          break;                                                         \
+        default:                                                         \
+          break;                                                         \
+      }                                                                  \
+      std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__      \
+                << ": " << cublasCheckName_ << std::endl;                \
+      exit(1);                                                           \
+    }                                                                    \
   } while (false)
 #define cusparseCheckError(f)                                                 \
diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 8bed12b..d849d22 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -109,9 +109,9 @@ class sp_gemm_gpu : public sp_gemm<T> {
-    toCSR(A_, n_, n_, A_val_, A_col_, A_row_);
+    toCSR_int(A_, n_, n_, A_val_, A_col_, A_row_);
-    toCSR(B_, n_, n_, B_val_, B_col_, B_row_);
+    toCSR_int(B_, n_, n_, B_val_, B_col_, B_row_);
 //    std::cout << "_____Matrix A_____" << std::endl;
diff --git a/include/doGemm.hh b/include/doGemm.hh
index b89abee..e264273 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -303,8 +303,8 @@ class doGemm {
       cpuGpu_always_ = cpuGpu_offloadThreshold();
       cpuGpu_once_ = cpuGpu_offloadThreshold();
       cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() +
-                            "_sparse_square.csv");
+      std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
+              getKernelName() + "_sparse_square.csv");
       if (upperLimit_ >= 32) {
         for (int dim = startDimention_; dim <= upperLimit_; dim++) {
           callSparseKernels(csvFile, dim, 0.9999);
diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
index 6d9d011..60778e7 100644
--- a/include/kernels/CPU/sp_gemm.hh
+++ b/include/kernels/CPU/sp_gemm.hh
@@ -1,5 +1,6 @@
 #pragma once
+#ifdef CPU_ONEMKL
 #include "../gemm.hh"
 #include <random>
@@ -41,4 +42,4 @@ namespace cpu {
-}  // namespace cpu
\ No newline at end of file
+}  // namespace cpu
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index 2a971a0..d97fc8c 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -1,5 +1,9 @@
 #pragma once
+#ifdef CPU_ONEMKL
+#include <mkl.h>
 #include <algorithm>
 #include <chrono>
 #include <cmath>
@@ -160,7 +164,7 @@ class gemm {
     return true;
-  void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index,
+  void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index,
              int* row_ptr) {
     int nnz_encountered = 0;
     for (int row = 0; row < n_row; row++) {
@@ -178,6 +182,25 @@ class gemm {
     row_ptr[n_row] = nnz_encountered;
+#ifdef CPU_ONEMKL
+  /** Convert a dense row-major matrix into zero-based CSR arrays that use
+   * MKL_INT indices.
+   *
+   * @param dense      Dense input, read as dense[(row * n_col) + col].
+   * @param n_col      Number of columns in `dense`.
+   * @param n_row      Number of rows in `dense`.
+   * @param vals       Output: the non-zero values in row order. Must hold at
+   *                   least as many elements as `dense` has non-zeros.
+   * @param col_index  Output: the column of each entry written to `vals`.
+   * @param row_ptr    Output: n_row + 1 offsets into `vals`; row r occupies
+   *                   [row_ptr[r], row_ptr[r + 1]). */
+  void toCSR_mkl(T* dense, int n_col, int n_row, T* vals, MKL_INT* col_index,
+                 MKL_INT* row_ptr) {
+    MKL_INT nnz_encountered = 0;
+    for (int row = 0; row < n_row; row++) {
+      row_ptr[row] = nnz_encountered;
+      // Widen before multiplying so the flat offset cannot overflow `int`.
+      const T* dense_row = dense + (static_cast<long long>(row) * n_col);
+      for (int col = 0; col < n_col; col++) {
+        const T value = dense_row[col];
+        if (value != 0.0) {
+          col_index[nnz_encountered] = static_cast<MKL_INT>(col);
+          vals[nnz_encountered] = value;
+          nnz_encountered++;
+        }
+      }
+    }
+    // The final entry closes the last row and equals the total non-zero count.
+    row_ptr[n_row] = nnz_encountered;
+  }
   /** The number of iterations to perform per problem size. */
   const int iterations_;
diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh
index 5ac6a70..0b4e32b 100644
--- a/oneMKL/CPU/sp_gemm.hh
+++ b/oneMKL/CPU/sp_gemm.hh
@@ -24,33 +24,146 @@ class sp_gemm_cpu : public sp_gemm<T> {
   /** Initialise the required data structures. */
   void initialise(int n, float sparsity) {
-    n_ = n;
     A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64);
     B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64);
     C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64);
+    n_ = n * 100;
+    nnz_ = (1 + (int)(n_ * n_ * (1 - sparsity)));
+    values_A_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN);
+    columns_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN);
+    rowIndex_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN);
+    values_B_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN);
+    columns_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN);
+    rowIndex_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN);
+    x_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);
+    y_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);
+    rslt_mv_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);
+    rslt_mv_trans_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);
     // Initialise the matricies
-    initInputMatrices();
+    initInputMatricesSparse(sparsity);
+    descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
+    // Transfer from dense to CSR format
+    toCSR_mkl(A_, n_, n_, values_A_, columns_A_, rowIndex_A_);
+    toCSR_mkl(B_, n_, n_, values_B_, columns_B_, rowIndex_B_);
+    // ToDo -- Set values for x and y (which are vectors of length n_?)
+    if constexpr (std::is_same_v<T, float>) {
+      CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrA_,
+                                                    SPARSE_INDEX_BASE_ZERO, n_,
+                                                    n_, rowIndex_A_,
+                                                    rowIndex_A_+1, columns_A_,
+                                                    values_A_),
+                            "Error after MKL_SPARSE_D_CREATE_CSR for csrA\n");
+      CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrB_,
+                                                    SPARSE_INDEX_BASE_ZERO, n_,
+                                                    n_, rowIndex_B_,
+                                                    rowIndex_B_+1, columns_B_,
+                                                    values_B_),
+                            "Error after MKL_SPARSE_D_CREATE_CSR for csrB\n");
+    } else if constexpr (std::is_same_v<T, double>) {
+      CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrA_,
+                                                    SPARSE_INDEX_BASE_ZERO, n_,
+                                                    n_, rowIndex_A_,
+                                                    rowIndex_A_+1, columns_A_,
+                                                    values_A_),
+                            "Error after MKL_SPARSE_D_CREATE_CSR for csrA\n");
+      CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrB_,
+                                                    SPARSE_INDEX_BASE_ZERO, n_,
+                                                    n_, rowIndex_B_,
+                                                    rowIndex_B_+1, columns_B_,
+                                                    values_B_),
+                            "Error after MKL_SPARSE_D_CREATE_CSR for csrB\n");
+    } else {
+      std::cout << "ERROR - Datatype for OneMKL CPU spGEMM kernel not "
+                   "supported." << std::endl;
+      exit(1)
+    };
+                                            csrA_, csrB_, &csrC_),
+                            "Error after MKL_SPARSE_SPMM\n");
+    // ToDo -- check that transpose is what I want here
+    CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrA_,
+                                                 SPARSE_OPERATION_TRANSPOSE,
+                                                 descr_type_gen_, 1),
+                          "Error after MKL_SPARSE_SET_MV_HINT with csrA_\n");
+    CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrB_,
+                                                 SPARSE_OPERATION_NON_TRANSPOSE,
+                                                 descr_type_gen_, 1),
+                          "Error after MKL_SPARSE_SET_MV_HINT with csrB_\n");
+    CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrC_,
+                                                 SPARSE_OPERATION_NON_TRANSPOSE,
+                                                 descr_type_gen_, 1),
+                          "Error after MKL_SPARSE_SET_MV_HINT with csrC_\n");
+    CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrA_),
+                          "Error after MKL_SPARSE_OPTIMIZE with csrA_\n");
+    CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrB_),
+                          "Error after MKL_SPARSE_OPTIMIZE with csrB_\n");
+    CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrC_),
+                          "Error after MKL_SPARSE_OPTIMIZE with csrC_\n");
   /** Make call to the GEMM kernel. */
   void callGemm() override {
     if constexpr (std::is_same_v<T, float>) {
-      cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_,
-                  (float)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_),
-                  (float)BETA, C_, std::max(1, m_));
-    } else if constexpr (std::is_same_v<T, double>) {
-      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_,
-                  (double)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_),
-                  (double)BETA, C_, std::max(1, m_));
-    } else {
-      // Un-specialised class will not do any work - print error and exit.
-      std::cout << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported."
-                << std::endl;
-      exit(1);
+      .0, csrC_, descr_type_gen_, x_, 0.0, rslt_mv_),
+                            "Error after MKL_SPARSE_S_MV for csrC_ * x_\n");
+      left_ = cblas_sdot(n_, rstl_mv_, 1, y_, 1);
+      .0, csrB_, descr_type_gen_, x, 0.0, trslt_mv_),
+                            "Error adter MKL_SPARSE_S_MV for csrB_ * x_\n");
+                                            csrA_, descr_type_gen_, y_, 0.0,
+                                            rslt_mv_trans_),
+                            "Error adter MKL_SPARSE_S_MV for csrA_ * y_\n");
+      right_ = cblas_sdot(n_, rslt_mv_, 1, rslt_mv_trans_, 1);
+      residual = fabs(left - right)/(fabs(left) + 1);
+      CALL_AND_CHECK_STATUS(mkl_sparse_s_export_csr(csrC_, &indexing_,
+                                                    &rows_, &cols_,
+                                                    &pointerB_C_,
+                                                    &pointerE_C_,
+                                                    &columns_C_, &values_C_),
+                            "Error after MKL_SPARSE_S_EXPORT_CSR\n");
+    } else if constexpr (std::is_same_v<T, double) {
+      .0, csrC_, descr_type_gen_, x_, 0.0, rslt_mv_),
+                            "Error after MKL_SPARSE_D_MV for csrC_ * x_\n");
+      left_ = cblas_ddot(n_, rstl_mv_, 1, y_, 1);
+      .0, csrB_, descr_type_gen_, x, 0.0, trslt_mv_),
+                            "Error adter MKL_SPARSE_D_MV for csrB_ * x_\n");
+                                            csrA_, descr_type_gen_, y_, 0.0,
+                                            rslt_mv_trans_),
+                            "Error adter MKL_SPARSE_D_MV for csrA_ * y_\n");
+      right_ = cblas_ddot(n_, rslt_mv_, 1, rslt_mv_trans_, 1);
+      residual = fabs(left - right)/(fabs(left) + 1);
+      CALL_AND_CHECK_STATUS(mkl_sparse_d_export_csr(csrC_, &indexing_,
+                                                    &rows_, &cols_,
+                                                    &pointerB_C_,
+                                                    &pointerE_C_,
+                                                    &columns_C_, &values_C_),
+                            "Error after MKL_SPARSE_D_EXPORT_CSR\n");
     // Ensure compiler doesn't optimise away the work being done
@@ -66,11 +179,61 @@ class sp_gemm_cpu : public sp_gemm<T> {
   /** Do any necessary cleanup (free pointers, close library handles, etc.)
    * after Kernel has been called. */
   void postCallKernelCleanup() override {
-    mkl_free_buffers();
-    mkl_free(A_);
-    mkl_free(B_);
-    mkl_free(C_);
+    if (mkl_sparse_destroy(csrC_) != SPARSE_STATUS_SUCCESS) {
+      printf(" Error after MKL_SPARSE_DESTROY, csrC_\n");
+      fflush(0);
+      status = 1;
+    }
+    //Deallocate arrays for which we allocate memory ourselves.
+    mkl_free(rslt_mv_trans_);
+    mkl_free(rslt_mv-);
+    mkl_free(x_);
+    mkl_free(y_);
+    //Release matrix handle and deallocate arrays for which we allocate memory ourselves.
+    if (mkl_sparse_destroy(csrA_) != SPARSE_STATUS_SUCCESS) {
+      printf("Error after MKL_SPARSE_DESTROY, csrA_\n");
+      fflush(0);
+      status = 1;
+    }
+    mkl_free(values_A_);
+    mkl_free(columns_A_);
+    mkl_free(rowIndex_A_);
+    if (mkl_sparse_destroy(csrB_) != SPARSE_STATUS_SUCCESS) {
+      printf("Error after MKL_SPARSE_DESTROY, csrB_\n");
+      fflush(0);
+      status = 1;
+    }
+    mkl_free(values_B_);
+    mkl_free(columns_B_);
+    mkl_free(rowIndex_B_);
+  int nnz_;
+  MKL_INT* columns_A_;
+  MKL_INT* columns_B_;
+  MKL_INT* columns_C_;
+  MKL_INT* rowIndex_A_;
+  MKL_INT* rowIndex_B_;
+  MKL_INT* pointerB_C_;
+  MKL_INT* pointerE_C_;
+  T* rslt_mv_;
+  T* rslt_mv_trans_;
+  T* x_;
+  T* y_;
+  T left_, right_, residual_;
+  MKL_INT rows_, cols_, i_, j_, ii_, status_;
+  sparse_index_base_t indexing_;
+  struct matrix_descr descr_type_gen_;
+  sparse_matrix_t csrA_, csrB_, csrC_;
 }  // namespace cpu
\ No newline at end of file

From 42bdc5846d6a5bac4f3270d62b258e0d021757aa Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Wed, 21 Aug 2024 11:05:52 +0100
Subject: [PATCH 23/38] Adding AOCL files

 AOCL/gemm.hh                   |   1 +
 AOCL/sp_gemm.hh                |  32 ++++-
 ArmPL/sp_gemm.hh               | 231 +++++++++++++++++++++++++++++++++
 NVPL/sp_gemv.hh                | 117 +++++++++++++++++
 include/kernels/CPU/sp_gemm.hh |  71 +++++++++-
 include/kernels/gemm.hh        |  22 ++++
 6 files changed, 464 insertions(+), 10 deletions(-)
 create mode 100644 ArmPL/sp_gemm.hh
 create mode 100644 NVPL/sp_gemv.hh

diff --git a/AOCL/gemm.hh b/AOCL/gemm.hh
index 3c6b5c0..f418bdc 100644
--- a/AOCL/gemm.hh
+++ b/AOCL/gemm.hh
@@ -23,6 +23,7 @@ class gemm_cpu : public gemm<T> {
   /** Make call to the GEMM kernel. */
   void callGemm() override {
     if constexpr (std::is_same_v<T, float>) {
       bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_,
                 rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_),
diff --git a/AOCL/sp_gemm.hh b/AOCL/sp_gemm.hh
index 3c6b5c0..4fc178b 100644
--- a/AOCL/sp_gemm.hh
+++ b/AOCL/sp_gemm.hh
@@ -28,9 +28,16 @@ class gemm_cpu : public gemm<T> {
                 rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_),
                 &beta, C_, rowStride, std::max(1, m_));
     } else if constexpr (std::is_same_v<T, double>) {
-      bli_dgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_,
-                rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_),
-                &beta, C_, rowStride, std::max(1, m_));
+      // Todo -- base?
+      aoclsparse_create_dscr(&A_csr_, base, n_, n_, nnz_,
+      (),,;
+      aoclsparse_create_dscr(&B_csr_, base, n_, n_, nnz_,
+      (),,;
+      aoclsparse_spmm(aoclsparse_operation_none, A_csr_, B_csr_, &C_csr_);
+      aoclsparse_export_dcsr(C_csr_, &base, &C_M_, &C_N_, &nnz_C_,
+                             &csr_row_ptr_C_, &csr_col_ind_C_, (void**)
+                             &csr_val_C_);
     } else {
       // Un-specialised class will not do any work - print error and exit.
       std::cout << "ERROR - Datatype for AOCL CPU GEMM kernel not supported."
@@ -57,6 +64,25 @@ class gemm_cpu : public gemm<T> {
   /** The distance in elements to the next column. */
   const int rowStride = 1;
+  aoclsparse_matrix A_csr_;
+  aoclsparse_int* csr_row_ptr_A_;
+  aoclsparse_int* csr_col_ind_A_;
+  T* csr_val_A_;
+  aoclsparse_matrix B_csr_;
+  aoclsparse_int* csr_row_ptr_B_;
+  aoclsparse_int* csr_col_ind_B_;
+  T* csr_val_B_;
+  aoclsparse_matrix C_csr_;
+  aoclsparse_int* csr_row_ptr_C_;
+  aoclsparse_int* csr_col_ind_C_;
+  T* csr_val_C_;
+  aoclsparse_int C_M_;
+  aoclsparse_int C_N_;
+  aoclsparse_status status;
 }  // namespace cpu
\ No newline at end of file
diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh
new file mode 100644
index 0000000..aba5814
--- /dev/null
+++ b/ArmPL/sp_gemm.hh
@@ -0,0 +1,231 @@
+#pragma once
+#ifdef CPU_ARMPL
+#include <stdio.h>
+#include <stdlib.h>
+#include <armpl.h>
+#include <omp.h>
+#include <algorithm>
+#include <iostream>
+#include <type_traits>
+#include <vector>
+#include "../include/kernels/CPU/sp_gemm.hh"
+#include "../include/utilities.hh"
+namespace cpu {
+/** A class for sparse GEMM CPU kernels backed by the ArmPL sparse API.
+ *
+ * toCSR() converts the dense inputs A_ and B_ to CSR and wraps them in ArmPL
+ * sparse-matrix handles, callGemm() multiplies them into C_armpl_, and
+ * postLoopRequirements() destroys the handles. Every matrix is treated as
+ * n_ x n_. */
+template <typename T>
+class sp_gemm_cpu : public sp_gemm<T> {
+ public:
+  using sp_gemm<T>::gemm;
+  using sp_gemm<T>::callConsume;
+  using sp_gemm<T>::m_;
+  using sp_gemm<T>::n_;
+  using sp_gemm<T>::k_;
+  using sp_gemm<T>::A_;
+  using sp_gemm<T>::B_;
+  using sp_gemm<T>::C_;
+ private:
+  /** Make call to the GEMM kernel.
+   * Computes C_armpl_ = alpha * A_armpl_ * B_armpl_ + beta * C_armpl_ and
+   * exits with status 1 if ArmPL reports a failure. */
+  void callGemm() override {
+    /**
+     * Flow of ARMPL Sparse LA:
+     *
+     * 1. Create sparse matrix objects: armpl_spmat_create_csr[sdcz]()
+     *
+     * 2. Supply hints on usage: armpl_spmat_hint()
+     *
+     * 3. Optimise for SpMV: armpl_spmv_optimize()
+     *
+     * 4. Solve SpMV case: armpl_spmv_exec_[sdcz]()
+     *
+     * 5. Destroy sparse matrix object: armpl_spmat_destroy()
+     *
+     * In addition, users can choose to update a set of non-zero values using
+     * armpl_spmat_update_[sdcz]()
+     */
+    // Todo -- See if using armpl_spmat_hint can improve performance here.
+    //  If so, follow with optimisation functions
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = armpl_spmm_exec_s(transA, transB, alpha, A_armpl_, B_armpl_,
+                                  beta, C_armpl_);
+    } else if constexpr (std::is_same_v<T, double>) {
+      status_ = armpl_spmm_exec_d(transA, transB, alpha, A_armpl_, B_armpl_,
+                                  beta, C_armpl_);
+    } else {
+      // Un-specialised class will not do any work - print error and exit.
+      std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported."
+                << std::endl;
+      exit(1);
+    }
+    checkStatus();
+    // Ensure compiler doesn't optimise away the work being done
+    callConsume();
+  }
+  /** Perform any required steps before calling the GEMM kernel that should
+   * be timed. */
+  void preLoopRequirements() override {}
+  /** Perform any required steps after calling the GEMM kernel that should
+   * be timed. Destroys the A, B and C sparse-matrix handles. */
+  void postLoopRequirements() override {
+    status_ = armpl_spmat_destroy(A_armpl_);
+    checkStatus();
+    status_ = armpl_spmat_destroy(B_armpl_);
+    checkStatus();
+    status_ = armpl_spmat_destroy(C_armpl_);
+    checkStatus();
+  }
+  /** Print the ArmPL status code and exit if the most recent call failed. */
+  void checkStatus() const {
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+  }
+  /** Convert a dense row-major n_ x n_ matrix into zero-based CSR arrays.
+   * The vectors are resized to fit, so no non-zero count is needed up front;
+   * row_ptr ends up with n_ + 1 entries, the last being the total count. */
+  void denseToCSR(const T* dense, std::vector<armpl_int_t>& row_ptr,
+                  std::vector<armpl_int_t>& col_index,
+                  std::vector<T>& vals) const {
+    const std::size_t n = static_cast<std::size_t>(n_);
+    row_ptr.assign(n + 1, 0);
+    col_index.clear();
+    vals.clear();
+    for (std::size_t row = 0; row < n; row++) {
+      row_ptr[row] = static_cast<armpl_int_t>(vals.size());
+      for (std::size_t col = 0; col < n; col++) {
+        const T value = dense[(row * n) + col];
+        if (value != static_cast<T>(0)) {
+          col_index.push_back(static_cast<armpl_int_t>(col));
+          vals.push_back(value);
+        }
+      }
+    }
+    row_ptr[n] = static_cast<armpl_int_t>(vals.size());
+  }
+  /** Build the CSR form of A_ and B_ and create the ArmPL handles for A, B
+   * and an empty C that receives the product. */
+  void toCSR() {
+    n_armpl_ = n_;
+    // ToDo -- check whether flags_ is correct!
+    flags_ = 0;
+    denseToCSR(A_, A_armpl_row_ptr_, A_armpl_col_index_, A_armpl_vals_);
+    denseToCSR(B_, B_armpl_row_ptr_, B_armpl_col_index_, B_armpl_vals_);
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = armpl_spmat_create_csr_s(
+          &A_armpl_, n_armpl_, n_armpl_, A_armpl_row_ptr_.data(),
+          A_armpl_col_index_.data(), A_armpl_vals_.data(), flags_);
+      checkStatus();
+      status_ = armpl_spmat_create_csr_s(
+          &B_armpl_, n_armpl_, n_armpl_, B_armpl_row_ptr_.data(),
+          B_armpl_col_index_.data(), B_armpl_vals_.data(), flags_);
+      checkStatus();
+    } else if constexpr (std::is_same_v<T, double>) {
+      status_ = armpl_spmat_create_csr_d(
+          &A_armpl_, n_armpl_, n_armpl_, A_armpl_row_ptr_.data(),
+          A_armpl_col_index_.data(), A_armpl_vals_.data(), flags_);
+      checkStatus();
+      status_ = armpl_spmat_create_csr_d(
+          &B_armpl_, n_armpl_, n_armpl_, B_armpl_row_ptr_.data(),
+          B_armpl_col_index_.data(), B_armpl_vals_.data(), flags_);
+      checkStatus();
+    }
+    // callGemm() and postLoopRequirements() both use C_armpl_, so it needs a
+    // valid handle as well.
+    // NOTE(review): assumes armpl_spmat_create_null() is the intended way to
+    // create the output matrix -- confirm against the ArmPL version in use.
+    C_armpl_ = armpl_spmat_create_null(n_armpl_, n_armpl_);
+  }
+  /** The constant value Alpha. */
+  const T alpha = ALPHA;
+  /** The constant value Beta. */
+  const T beta = BETA;
+  /** Status returned by the most recent ArmPL call. */
+  armpl_status_t status_ = ARMPL_STATUS_SUCCESS;
+  /** Creation flags passed to armpl_spmat_create_csr_[sd](). */
+  armpl_int_t flags_ = 0;
+  /** The matrix dimension n_ as an ArmPL integer. */
+  armpl_int_t n_armpl_ = 0;
+  /** CSR row offsets, column indices and values of A. */
+  std::vector<armpl_int_t> A_armpl_row_ptr_;
+  std::vector<armpl_int_t> A_armpl_col_index_;
+  std::vector<T> A_armpl_vals_;
+  /** CSR row offsets, column indices and values of B. */
+  std::vector<armpl_int_t> B_armpl_row_ptr_;
+  std::vector<armpl_int_t> B_armpl_col_index_;
+  std::vector<T> B_armpl_vals_;
+  /** Not used by this class yet. */
+  armpl_int_t* C_armpl_row_ptr_ = nullptr;
+  armpl_int_t* C_armpl_col_index_ = nullptr;
+  /** ArmPL sparse-matrix handles for A, B and the product C. */
+  armpl_spmat_t A_armpl_{};
+  armpl_spmat_t B_armpl_{};
+  armpl_spmat_t C_armpl_{};
+  /** Operations applied to A and B in callGemm() (no transpose). */
+  armpl_sparse_hint_value transA = ARMPL_SPARSE_OPERATION_NOTRANS;
+  armpl_sparse_hint_value transB = ARMPL_SPARSE_OPERATION_NOTRANS;
+};
+}  // namespace cpu
\ No newline at end of file
diff --git a/NVPL/sp_gemv.hh b/NVPL/sp_gemv.hh
new file mode 100644
index 0000000..d04f6b8
--- /dev/null
+++ b/NVPL/sp_gemv.hh
@@ -0,0 +1,117 @@
+ * ToDo -- This is all currently written for GEMM, but NVPL does not support
+ * GEMM, so this needs to be adjusted to spmv -- which is supported
+ */
+#pragma once
+#ifdef CPU_NVPL
+#include <nvpl_sparse.h>
+#include "../include/kernels/CPU/gemm.hh"
+#include "../include/utilities.hh"
+namespace cpu {
+/** A class for GEMM CPU BLAS kernels. */
+template <typename T>
+class sp_gemm_cpu : public sp_gemm<T> {
+ public:
+  using sp_gemm<T>::gemm;
+  using sp_gemm<T>::callConsume;
+  using sp_gemm<T>::m_;
+  using sp_gemm<T>::n_;
+  using sp_gemm<T>::k_;
+  using sp_gemm<T>::A_;
+  using sp_gemm<T>::B_;
+  using sp_gemm<T>::C_;
+ private:
+  /** Make call to the GEMM kernel. */
+  void callGemm() override {
+    // Ensure compiler doesn't optimise away the work being done
+    callConsume();
+  }
+  /** Perform any required steps before calling the GEMM kernel that should
+   * be timed. */
+  void preLoopRequirements() override {
+    // Set type enum
+    if constexpr (std::is_same_v<T, float>) {
+      type_ = NVPL_SPARSE_R_32F;
+    } else if constexpr (std::is_same_v<T, double>) {
+      type_ = NVPL_SPARSE_R_64F;
+    } else {
+      // Un-specialised class will not do any work - print error and exit.
+      std::cout << "ERROR - Datatype for NVPL sparse GEMM kernel not supported."
+                << std::endl;
+      exit(1);
+    }
+    status_ = nvpl_sparse_create(&handle_);
+    // Todo -- error check
+    // Todo -- Make const?
+    status_ = nvpl_sparse_create_csr(A_nvpl_, n_, n_, nnz_, A_row_ptr_nvpl_,
+                                     A_col_index_nvpl_, A_vals_nvpl_,
+                                     index_type_, index_type_, base_, type_);
+    status_ = nvpl_sparse_create_csr(B_nvpl_, n_, n_, nnz_, B_row_ptr_nvpl_,
+                                     B_col_index_nvpl_, B_vals_nvpl_,
+                                     index_type_, index_type_, base_, type_);
+    // Todo -- error check
+  }
+  /** Perform any required steps after calling the GEMM kernel that should
+   * be timed. */
+  void postLoopRequirements() override {
+    status_ = nvpl_sparse_destroy(handle_);
+    // Todo -- error check
+    status_ = nvpl_sparse_destroy_sp_mat(A_nvpl_);
+    status_ = nvpl_sparse_destroy_sp_mat(B_nvpl_);
+    status_ = nvpl_sparse_destroy_sp_mat(C_nvpl_);
+  }
+  /** The constant value Alpha. */
+  T alpha = ALPHA;
+  /** The constant value Beta. */
+  T beta = BETA;
+  /**
+   * Sparse metadata
+  */
+  nvpl_sparse_status_t status_;
+  nvpl_sparse_handle_t handle_;
+  nvpl_sparse_data_type_t type_;
+  nvpl_sparse_operation_t op_ = NVPL_SPARSE_OPERATION_NON_TRANSPOSE;
+  nvpl_sparse_index_base_t base_ = NVPL_SPARSE_INDEX_BASE_ZERO;
+  nvpl_sparse_format_t format_ = NVPL_SPARSE_FORMAT_CSR;
+  nvpl_sparse_order_t order_ = NVPL_SPARSE_ORDER_COL;
+  nvpl_sparse_index_type_t index_type_ = NVPL_SPARSE_INDEX_64I;
+  /**
+   * Sparse matrix descriptors
+  */
+  nvpl_sparse_sp_mat_descr_t* A_nvpl_;
+  nvpl_sparse_sp_mat_descr_t* B_nvpl_;
+  nvpl_sparse_sp_mat_descr_t* C_nvpl_;
+  void* A_row_ptr_nvpl_;
+  void* B_row_ptr_nvpl_;
+  void* C_row_ptr_nvpl_;
+  void* A_col_idnex_nvpl_;
+  void* B_col_idnex_nvpl_;
+  void* C_col_idnex_nvpl_;
+  void* A_vals_nvpl_;
+  void* B_vals_nvpl_;
+  void* C_vals_nvpl_;
+}  // namespace cpu
\ No newline at end of file
diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
index 60778e7..72fd5dc 100644
--- a/include/kernels/CPU/sp_gemm.hh
+++ b/include/kernels/CPU/sp_gemm.hh
@@ -1,9 +1,9 @@
 #pragma once
-#ifdef CPU_ONEMKL
 #include "../gemm.hh"
 #include <random>
+#include <memory>
 namespace cpu {
@@ -25,21 +25,78 @@ namespace cpu {
 			/** Initialise the required data structures. */
 			virtual void initialise(int n, double sparsity, bool binary = false) {
 				n_ = n;
+        sparsity_ = sparsity;
+        // Note that the below should be the same as the edges calculation
+        // used in the initInputMatricesSparse function.  If changed here,
+        // change there
+        nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity_));
 				A_ = (T*)malloc(sizeof(T) * n_ * n_);
 				B_ = (T*)malloc(sizeof(T) * n_ * n_);
 				C_ = (T*)malloc(sizeof(T) * n_ * n_);
-				initInputMatricesSparse(sparsity);
+				initInputMatricesSparse(sparsity_);
+        toCSR();
 				/** Do any necessary cleanup (free pointers, close library handles, etc.)
 				 * after Kernel has been called. */
-				void postCallKernelCleanup() {
-					free(A_);
-					free(B_);
-					free(C_);
-				}
+      void postCallKernelCleanup() {
+        free(A_);
+        free(B_);
+        free(C_);
+      }
+      void toCSR() {
+        // Move A to CSR
+        A_row_ptr_ = new int[n_ + 1];
+        A_col_index_ = new int[nnz_];
+        A_vals_ = new T[nnz_];
+        int nnz_encountered = 0;
+        for (int row = 0; row < n_; row++) {
+          A_row_ptr_[row] = nnz_encountered;
+          for (int col = 0; col < n_; col++) {
+            if (A_[(row * n_) + col] != 0.0) {
+              A_col_index_[nnz_encountered] = col;
+              A_vals_[nnz_encountered] = A_[(row * n_) + col];
+              nnz_encountered++;
+            }
+          }
+        }
+        // Move B to CSR
+        B_row_ptr_ = new int[n_ + 1];
+        B_col_index_ = new int[nnz_];
+        B_vals_ = new T[nnz_];
+        nnz_encountered = 0;
+        for (int row = 0; row < n_; row++) {
+          B_row_ptr_[row] = nnz_encountered;
+          for (int col = 0; col < n_; col++) {
+            if (B_[(row * n_) + col] != 0.0) {
+              B_col_index_[nnz_encountered] = col;
+              B_vals_[nnz_encountered] = B_[(row * n_) + col];
+              nnz_encountered++;
+            }
+          }
+        }
+      }
+      double sparsity_;
+      int nnz_;
+      int* A_row_ptr_;
+      int* A_col_index_;
+      int* B_row_ptr_;
+      int* B_col_index_;
+      int* C_row_ptr_;
+      int* C_col_index_;
+      T* A_vals_;
+      T* B_vals_;
+      T* C_vals;
 }  // namespace cpu
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index d97fc8c..d357734 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -91,6 +91,9 @@ class gemm {
+  // Note that the below should be the same as the nnz calculation
+  // used in the cpu initialise functions.  If changed here,
+  // change there
   void initInputMatricesSparse(float sparsity) {
     for (int i = 0; i < (n_ * n_); i++) {
       A_[i] = 0.0;
@@ -200,6 +203,25 @@ class gemm {
     row_ptr[n_row] = (MKL_INT)nnz_encountered;
+#ifdef CPU_AOCL
+    void toCSR_aocl(T* dense, int n_col, int n_row, T* vals, aoclsparse_int*
+    col_index, aoclsparse_int* row_ptr) {
+    int nnz_encountered = 0;
+    for (int row = 0; row < n_row; row++) {
+      row_ptr[row] = (aoclsparse_int)nnz_encountered;
+      int nnz_row = 0;
+      for (int col = 0; col < n_col; col++) {
+        if (dense[(row * n_col) + col] != 0.0) {
+          nnz_row++;
+          col_index[nnz_encountered] = (aoclsparse_int)col;
+          vals[nnz_encountered] = dense[(row * n_col) + col];
+          nnz_encountered++;
+        }
+      }
+    }
+    row_ptr[n_row] = (MKL_INT)nnz_encountered;
+  }
   /** The number of iterations to perform per problem size. */
   const int iterations_;

From 521cbf3d1f4f5369813732e46be11fd019a09241 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Tue, 1 Oct 2024 12:00:19 +0100
Subject: [PATCH 24/38] Working changes

 .DS_Store                            | Bin 0 -> 8196 bytes
 .idea/GPU-BLAS-Offload-Benchmark.iml |   2 +
 .idea/codeStyles/codeStyleConfig.xml |   5 +
 .idea/misc.xml                       |   6 +
 .idea/modules.xml                    |   8 +
 .idea/vcs.xml                        |   6 +
 .idea/workspace.xml                  | 541 +++++++++++++++++++++++++++
 ArmPL/sp_gemm.hh                     | 271 ++++++++++++--
 DefaultCPU/sp_gemm.hh                |  55 ---
 DefaultGPU/sp_gemm.hh                |  54 ---
 Makefile                             |   2 +-
 NVPL/sp_gemv.hh                      | 117 ------                |   5 +
 cuBLAS/sp_gemm.hh                    |   9 +-
 cuBLAS/sp_gemv.hh                    | 261 +++++++++++++
 include/.DS_Store                    | Bin 0 -> 6148 bytes
 include/doGemm.hh                    |  46 ++-
 include/kernels/.DS_Store            | Bin 0 -> 6148 bytes
 include/kernels/CPU/sp_gemm.hh       |  23 +-
 include/kernels/CPU/sp_gemv.hh       |  47 +++
 include/kernels/GPU/sp_gemm.hh       |   3 +-
 include/kernels/GPU/sp_gemv.hh       |  28 ++
 include/kernels/gemm.hh              |   4 +
 include/kernels/gemv.hh              |  79 ++++
 24 files changed, 1278 insertions(+), 294 deletions(-)
 create mode 100644 .DS_Store
 create mode 100644 .idea/GPU-BLAS-Offload-Benchmark.iml
 create mode 100644 .idea/codeStyles/codeStyleConfig.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/vcs.xml
 create mode 100644 .idea/workspace.xml
 delete mode 100644 DefaultCPU/sp_gemm.hh
 delete mode 100644 DefaultGPU/sp_gemm.hh
 delete mode 100644 NVPL/sp_gemv.hh
 create mode 100644 cuBLAS/sp_gemv.hh
 create mode 100644 include/.DS_Store
 create mode 100644 include/kernels/.DS_Store
 create mode 100644 include/kernels/CPU/sp_gemv.hh
 create mode 100644 include/kernels/GPU/sp_gemv.hh

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..5e3f9bcf14470d249e0f7fdd3125325b2078e4a9
GIT binary patch
literal 8196

literal 0

diff --git a/.idea/GPU-BLAS-Offload-Benchmark.iml b/.idea/GPU-BLAS-Offload-Benchmark.iml
new file mode 100644
index 0000000..190534e
--- /dev/null
+++ b/.idea/GPU-BLAS-Offload-Benchmark.iml
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module classpath="External""GPU-BLAS-Offload-Benchmark" external.linked.project.path="$MODULE_DIR$" external.root.project.path="$MODULE_DIR$""Makefile" type="CPP_MODULE" version="4" />
\ No newline at end of file
diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml
new file mode 100644
index 0000000..a55e7a1
--- /dev/null
+++ b/.idea/codeStyles/codeStyleConfig.xml
@@ -0,0 +1,5 @@
+<component name="ProjectCodeStyleConfiguration">
+  <state>
+    <option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
+  </state>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..830d3c8
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="MakefileWorkspace">
+    <contentRoot DIR="$PROJECT_DIR$" />
+  </component>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..eff3984
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/GPU-BLAS-Offload-Benchmark.iml" filepath="$PROJECT_DIR$/.idea/GPU-BLAS-Offload-Benchmark.iml" />
+    </modules>
+  </component>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000..b954508
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,541 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="AutoImportSettings">
+    <option name="autoReloadType" value="SELECTIVE" />
+  </component>
+  <component name="CMakeRunConfigurationManager">
+    <generated>
+      <config projectName="GPU-BLAS-Offload-Benchmark" targetName="all" />
+      <config projectName="GPU-BLAS-Offload-Benchmark" targetName="gpu-blob" />
+    </generated>
+  </component>
+  <component name="CMakeSettings">
+    <configurations>
+      <configuration PROFILE_NAME="Debug" ENABLED="true" CONFIG_NAME="Debug" />
+    </configurations>
+  </component>
+  <component name="ChangeListManager">
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding AOCL files" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="ClangdSettings">
+    <option name="clangTidyViaClangd" value="false" />
+    <option name="formatViaClangd" value="false" />
+  </component>
+  <component name="ExternalProjectsData">
+    <projectState path="$PROJECT_DIR$">
+      <ProjectState />
+    </projectState>
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+    <option name="UPDATE_TYPE" value="REBASE" />
+  </component>
+  <component name="MakefileLocalSettings">
+    <option name="availableProjects">
+      <map>
+        <entry>
+          <key>
+            <ExternalProjectPojo>
+              <option name="name" value="GPU-BLAS-Offload-Benchmark" />
+              <option name="path" value="$PROJECT_DIR$" />
+            </ExternalProjectPojo>
+          </key>
+          <value>
+            <list>
+              <ExternalProjectPojo>
+                <option name="name" value="GPU-BLAS-Offload-Benchmark" />
+                <option name="path" value="$PROJECT_DIR$" />
+              </ExternalProjectPojo>
+            </list>
+          </value>
+        </entry>
+      </map>
+    </option>
+    <option name="projectSyncType">
+      <map>
+        <entry key="$PROJECT_DIR$" value="RE_IMPORT" />
+      </map>
+    </option>
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="OCResolveContextSettings">
+    <option name="configuration" value="$PROJECT_DIR$/src/" />
+  </component>
+  <component name="ProjectApplicationVersion">
+    <option name="ide" value="CLion" />
+    <option name="majorVersion" value="2023" />
+    <option name="minorVersion" value="3" />
+  </component>
+  <component name="ProjectColorInfo">{
+  &quot;associatedIndex&quot;: 2
+  <component name="ProjectId" id="2bAwYDqoTyLBV0DE8xYqkQ0FEw0" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent">{
+  &quot;keyToString&quot;: {
+    &quot;C/C++;: &quot;Run&quot;,
+    &quot;RunOnceActivity.OpenProjectViewOnStart&quot;: &quot;true&quot;,
+    &quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
+    &quot;RunOnceActivity.cidr.known.project.marker&quot;: &quot;true&quot;,
+    &quot;RunOnceActivity.readMode.enableVisualFormatting&quot;: &quot;true&quot;,
+    &quot;cf.advertisement.text.has.clang-format&quot;: &quot;true&quot;,
+    &quot;cf.first.check.clang-format&quot;: &quot;false&quot;,
+    &quot;cidr.known.project.marker&quot;: &quot;true&quot;,
+    &quot;git-widget-placeholder&quot;: &quot;sparse&quot;,
+    &quot;last_opened_file_path&quot;: &quot;/Users/no22498/Documents/GPU-BLAS-Offload-Benchmark&quot;,
+    &quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
+    &quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
+    &quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
+    &quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
+    &quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
+    &quot;settings.editor.selected.configurable&quot;: &quot;preferences.lookFeel&quot;,
+    &quot;structure.view.defaults.are.configured&quot;: &quot;true&quot;,
+    &quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
+  }
+  <component name="RecentsManager">
+    <key name="MoveFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/CSV_Results" />
+    </key>
+  </component>
+  <component name="RunManager" selected="C/C++">
+    <configuration name="all" type="CLionNativeAppRunConfigurationType" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" EMULATE_TERMINAL="false" PASS_PARENT_ENVS_2="true" PROJECT_NAME="GPU-BLAS-Offload-Benchmark" TARGET_NAME="all" CONFIG_NAME="all" version="1">
+      <method v="2">
+        <option name="CLION.COMPOUND.BUILD" enabled="true" />
+      </method>
+    </configuration>
+    <configuration name="gpu-blob" type="CLionNativeAppRunConfigurationType" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" EMULATE_TERMINAL="false" PASS_PARENT_ENVS_2="true" PROJECT_NAME="GPU-BLAS-Offload-Benchmark" TARGET_NAME="gpu-blob" CONFIG_NAME="gpu-blob" version="1">
+      <method v="2">
+        <option name="CLION.COMPOUND.BUILD" enabled="true" />
+      </method>
+    </configuration>
+    <configuration name="" type="CppFileRunConfiguration" factoryName="CppFileRunConfiguration" temporary="true" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" EMULATE_TERMINAL="false" PASS_PARENT_ENVS_2="true" PROJECT_NAME="GPU-BLAS-Offload-Benchmark" TARGET_NAME="" CONFIG_NAME="">
+      <option name="sourceFile" value="src/" />
+      <method v="2">
+        <option name="com.jetbrains.cidr.cpp.runfile.CppFileBuildBeforeRunTaskProvider$BasicBuildBeforeRunTask" enabled="true" />
+      </method>
+    </configuration>
+    <list>
+      <item itemvalue="C/C++" />
+      <item itemvalue="Native Application.all" />
+      <item itemvalue="Native Application.gpu-blob" />
+    </list>
+    <recent_temporary>
+      <list>
+        <item itemvalue="C/C++" />
+      </list>
+    </recent_temporary>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="" />
+      <created>1705671236426</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1705671236426</updated>
+      <workItem from="1705671237559" duration="4602000" />
+      <workItem from="1706262352145" duration="10830000" />
+      <workItem from="1706520146967" duration="113000" />
+      <workItem from="1706524361669" duration="11224000" />
+      <workItem from="1706871479435" duration="19313000" />
+      <workItem from="1707150032379" duration="1154000" />
+      <workItem from="1707218344676" duration="510000" />
+      <workItem from="1707218861842" duration="7823000" />
+      <workItem from="1707568200980" duration="5614000" />
+      <workItem from="1708954563821" duration="751000" />
+      <workItem from="1708955322064" duration="16518000" />
+      <workItem from="1709217936554" duration="14897000" />
+      <workItem from="1709904670690" duration="598000" />
+      <workItem from="1710146767066" duration="2251000" />
+      <workItem from="1710157491483" duration="1263000" />
+      <workItem from="1710158763389" duration="2993000" />
+      <workItem from="1710161850416" duration="103978000" />
+      <workItem from="1711446443157" duration="118701000" />
+      <workItem from="1715785109710" duration="13531000" />
+      <workItem from="1716389199190" duration="1275000" />
+      <workItem from="1716897681894" duration="598000" />
+      <workItem from="1716899034743" duration="1217000" />
+      <workItem from="1716981059825" duration="14000" />
+      <workItem from="1722246444109" duration="2990000" />
+      <workItem from="1722496439084" duration="24843000" />
+      <workItem from="1723101242209" duration="21225000" />
+      <workItem from="1724244974273" duration="40294000" />
+      <workItem from="1726568120590" duration="8508000" />
+      <workItem from="1726828018604" duration="38592000" />
+    </task>
+    <task id="LOCAL-00001" summary="trivial changes">
+      <option name="closed" value="true" />
+      <created>1706261672580</created>
+      <option name="number" value="00001" />
+      <option name="presentableId" value="LOCAL-00001" />
+      <option name="project" value="LOCAL" />
+      <updated>1706261672580</updated>
+    </task>
+    <task id="LOCAL-00002" summary="Adding sparse algorithm">
+      <option name="closed" value="true" />
+      <created>1706568127804</created>
+      <option name="number" value="00002" />
+      <option name="presentableId" value="LOCAL-00002" />
+      <option name="project" value="LOCAL" />
+      <updated>1706568127804</updated>
+    </task>
+    <task id="LOCAL-00003" summary="Integrating algorithm with benchmark">
+      <option name="closed" value="true" />
+      <created>1706881882900</created>
+      <option name="number" value="00003" />
+      <option name="presentableId" value="LOCAL-00003" />
+      <option name="project" value="LOCAL" />
+      <updated>1706881882900</updated>
+    </task>
+    <task id="LOCAL-00004" summary="Adding commandline options to select only sparse or dense kernels">
+      <option name="closed" value="true" />
+      <created>1707233768599</created>
+      <option name="number" value="00004" />
+      <option name="presentableId" value="LOCAL-00004" />
+      <option name="project" value="LOCAL" />
+      <updated>1707233768599</updated>
+    </task>
+    <task id="LOCAL-00005" summary="Changes">
+      <option name="closed" value="true" />
+      <created>1709208672718</created>
+      <option name="number" value="00005" />
+      <option name="presentableId" value="LOCAL-00005" />
+      <option name="project" value="LOCAL" />
+      <updated>1709208672718</updated>
+    </task>
+    <task id="LOCAL-00006" summary="Changes">
+      <option name="closed" value="true" />
+      <created>1709211130948</created>
+      <option name="number" value="00006" />
+      <option name="presentableId" value="LOCAL-00006" />
+      <option name="project" value="LOCAL" />
+      <updated>1709211130948</updated>
+    </task>
+    <task id="LOCAL-00007" summary="Adding sparse kernel to doGemm">
+      <option name="closed" value="true" />
+      <created>1709217956669</created>
+      <option name="number" value="00007" />
+      <option name="presentableId" value="LOCAL-00007" />
+      <option name="project" value="LOCAL" />
+      <updated>1709217956669</updated>
+    </task>
+    <task id="LOCAL-00008" summary="Adding matrix type enum class">
+      <option name="closed" value="true" />
+      <created>1709218027209</created>
+      <option name="number" value="00008" />
+      <option name="presentableId" value="LOCAL-00008" />
+      <option name="project" value="LOCAL" />
+      <updated>1709218027209</updated>
+    </task>
+    <task id="LOCAL-00009" summary="changes">
+      <option name="closed" value="true" />
+      <created>1709368112577</created>
+      <option name="number" value="00009" />
+      <option name="presentableId" value="LOCAL-00009" />
+      <option name="project" value="LOCAL" />
+      <updated>1709368112577</updated>
+    </task>
+    <task id="LOCAL-00010" summary="changes">
+      <option name="closed" value="true" />
+      <created>1709368228167</created>
+      <option name="number" value="00010" />
+      <option name="presentableId" value="LOCAL-00010" />
+      <option name="project" value="LOCAL" />
+      <updated>1709368228167</updated>
+    </task>
+    <task id="LOCAL-00011" summary="adding command line kernel selection">
+      <option name="closed" value="true" />
+      <created>1709582619984</created>
+      <option name="number" value="00011" />
+      <option name="presentableId" value="LOCAL-00011" />
+      <option name="project" value="LOCAL" />
+      <updated>1709582619984</updated>
+    </task>
+    <task id="LOCAL-00012" summary="adding command line kernel selection">
+      <option name="closed" value="true" />
+      <created>1710157174669</created>
+      <option name="number" value="00012" />
+      <option name="presentableId" value="LOCAL-00012" />
+      <option name="project" value="LOCAL" />
+      <updated>1710157174669</updated>
+    </task>
+    <task id="LOCAL-00013" summary="Adding basic sparse multiplication kernel for default CPU and GPU">
+      <option name="closed" value="true" />
+      <created>1710172355530</created>
+      <option name="number" value="00013" />
+      <option name="presentableId" value="LOCAL-00013" />
+      <option name="project" value="LOCAL" />
+      <updated>1710172355530</updated>
+    </task>
+    <task id="LOCAL-00014" summary="Implementing cuSPARSE kernel">
+      <option name="closed" value="true" />
+      <created>1710337387217</created>
+      <option name="number" value="00014" />
+      <option name="presentableId" value="LOCAL-00014" />
+      <option name="project" value="LOCAL" />
+      <updated>1710337387217</updated>
+    </task>
+    <task id="LOCAL-00015" summary="Trying to work out CSR malloc bug">
+      <option name="closed" value="true" />
+      <created>1710338720376</created>
+      <option name="number" value="00015" />
+      <option name="presentableId" value="LOCAL-00015" />
+      <option name="project" value="LOCAL" />
+      <updated>1710338720376</updated>
+    </task>
+    <task id="LOCAL-00016" summary="Trying to work out CSR malloc bug">
+      <option name="closed" value="true" />
+      <created>1710338867534</created>
+      <option name="number" value="00016" />
+      <option name="presentableId" value="LOCAL-00016" />
+      <option name="project" value="LOCAL" />
+      <updated>1710338867534</updated>
+    </task>
+    <task id="LOCAL-00017" summary="cuSPARSE unified memory implementation">
+      <option name="closed" value="true" />
+      <created>1710853559721</created>
+      <option name="number" value="00017" />
+      <option name="presentableId" value="LOCAL-00017" />
+      <option name="project" value="LOCAL" />
+      <updated>1710853559721</updated>
+    </task>
+    <task id="LOCAL-00018" summary="Now compiles">
+      <option name="closed" value="true" />
+      <created>1711026531002</created>
+      <option name="number" value="00018" />
+      <option name="presentableId" value="LOCAL-00018" />
+      <option name="project" value="LOCAL" />
+      <updated>1711026531002</updated>
+    </task>
+    <task id="LOCAL-00019" summary="Now compiles">
+      <option name="closed" value="true" />
+      <created>1711026902576</created>
+      <option name="number" value="00019" />
+      <option name="presentableId" value="LOCAL-00019" />
+      <option name="project" value="LOCAL" />
+      <updated>1711026902576</updated>
+    </task>
+    <task id="LOCAL-00020" summary="Now compiles with fewer runtime errors">
+      <option name="closed" value="true" />
+      <created>1711361513432</created>
+      <option name="number" value="00020" />
+      <option name="presentableId" value="LOCAL-00020" />
+      <option name="project" value="LOCAL" />
+      <updated>1711361513432</updated>
+    </task>
+    <task id="LOCAL-00021" summary="Implementing other offload types - still some runtime errors">
+      <option name="closed" value="true" />
+      <created>1711453016707</created>
+      <option name="number" value="00021" />
+      <option name="presentableId" value="LOCAL-00021" />
+      <option name="project" value="LOCAL" />
+      <updated>1711453016707</updated>
+    </task>
+    <task id="LOCAL-00022" summary="All implemented and running.  No checksum at the end">
+      <option name="closed" value="true" />
+      <created>1711457712445</created>
+      <option name="number" value="00022" />
+      <option name="presentableId" value="LOCAL-00022" />
+      <option name="project" value="LOCAL" />
+      <updated>1711457712445</updated>
+    </task>
+    <task id="LOCAL-00023" summary="Removing print statements">
+      <option name="closed" value="true" />
+      <created>1711457867311</created>
+      <option name="number" value="00023" />
+      <option name="presentableId" value="LOCAL-00023" />
+      <option name="project" value="LOCAL" />
+      <updated>1711457867311</updated>
+    </task>
+    <task id="LOCAL-00024" summary="All three offload types working for large problem sizes">
+      <option name="closed" value="true" />
+      <created>1711715754311</created>
+      <option name="number" value="00024" />
+      <option name="presentableId" value="LOCAL-00024" />
+      <option name="project" value="LOCAL" />
+      <updated>1711715754311</updated>
+    </task>
+    <task id="LOCAL-00025" summary="Removing print statements">
+      <option name="closed" value="true" />
+      <created>1711715920815</created>
+      <option name="number" value="00025" />
+      <option name="presentableId" value="LOCAL-00025" />
+      <option name="project" value="LOCAL" />
+      <updated>1711715920815</updated>
+    </task>
+    <task id="LOCAL-00026" summary="Superficial changes">
+      <option name="closed" value="true" />
+      <created>1711961476350</created>
+      <option name="number" value="00026" />
+      <option name="presentableId" value="LOCAL-00026" />
+      <option name="project" value="LOCAL" />
+      <updated>1711961476350</updated>
+    </task>
+    <task id="LOCAL-00027" summary="rebasing">
+      <option name="closed" value="true" />
+      <created>1711961618074</created>
+      <option name="number" value="00027" />
+      <option name="presentableId" value="LOCAL-00027" />
+      <option name="project" value="LOCAL" />
+      <updated>1711961618074</updated>
+    </task>
+    <task id="LOCAL-00028" summary="rebasing">
+      <option name="closed" value="true" />
+      <created>1711961836984</created>
+      <option name="number" value="00028" />
+      <option name="presentableId" value="LOCAL-00028" />
+      <option name="project" value="LOCAL" />
+      <updated>1711961836984</updated>
+    </task>
+    <task id="LOCAL-00029" summary="rebasing">
+      <option name="closed" value="true" />
+      <created>1711961942373</created>
+      <option name="number" value="00029" />
+      <option name="presentableId" value="LOCAL-00029" />
+      <option name="project" value="LOCAL" />
+      <updated>1711961942374</updated>
+    </task>
+    <task id="LOCAL-00030" summary="Fixing after rebase">
+      <option name="closed" value="true" />
+      <created>1712057111636</created>
+      <option name="number" value="00030" />
+      <option name="presentableId" value="LOCAL-00030" />
+      <option name="project" value="LOCAL" />
+      <updated>1712057111636</updated>
+    </task>
+    <task id="LOCAL-00031" summary="Tidying up spGEMM classes to remove duplicated code">
+      <option name="closed" value="true" />
+      <created>1712136173732</created>
+      <option name="number" value="00031" />
+      <option name="presentableId" value="LOCAL-00031" />
+      <option name="project" value="LOCAL" />
+      <updated>1712136173732</updated>
+    </task>
+    <task id="LOCAL-00032" summary="Fixing py script to accomodate new kernels">
+      <option name="closed" value="true" />
+      <created>1712141872451</created>
+      <option name="number" value="00032" />
+      <option name="presentableId" value="LOCAL-00032" />
+      <option name="project" value="LOCAL" />
+      <updated>1712141872451</updated>
+    </task>
+    <task id="LOCAL-00033" summary="Fixing memory bug.  Implementing --kernels flag">
+      <option name="closed" value="true" />
+      <created>1712153668999</created>
+      <option name="number" value="00033" />
+      <option name="presentableId" value="LOCAL-00033" />
+      <option name="project" value="LOCAL" />
+      <updated>1712153668999</updated>
+    </task>
+    <task id="LOCAL-00034" summary="Getting rid of print statements">
+      <option name="closed" value="true" />
+      <created>1712222760735</created>
+      <option name="number" value="00034" />
+      <option name="presentableId" value="LOCAL-00034" />
+      <option name="project" value="LOCAL" />
+      <updated>1712222760735</updated>
+    </task>
+    <task id="LOCAL-00035" summary="WIP">
+      <option name="closed" value="true" />
+      <created>1712311301376</created>
+      <option name="number" value="00035" />
+      <option name="presentableId" value="LOCAL-00035" />
+      <option name="project" value="LOCAL" />
+      <updated>1712311301376</updated>
+    </task>
+    <task id="LOCAL-00036" summary="Finalising">
+      <option name="closed" value="true" />
+      <created>1713959722407</created>
+      <option name="number" value="00036" />
+      <option name="presentableId" value="LOCAL-00036" />
+      <option name="project" value="LOCAL" />
+      <updated>1713959722407</updated>
+    </task>
+    <task id="LOCAL-00037" summary="Rebasing">
+      <option name="closed" value="true" />
+      <created>1715161012243</created>
+      <option name="number" value="00037" />
+      <option name="presentableId" value="LOCAL-00037" />
+      <option name="project" value="LOCAL" />
+      <updated>1715161012243</updated>
+    </task>
+    <task id="LOCAL-00038" summary="Rebasing">
+      <option name="closed" value="true" />
+      <created>1715161090646</created>
+      <option name="number" value="00038" />
+      <option name="presentableId" value="LOCAL-00038" />
+      <option name="project" value="LOCAL" />
+      <updated>1715161090646</updated>
+    </task>
+    <task id="LOCAL-00039" summary="Adding AOCL files">
+      <option name="closed" value="true" />
+      <created>1716198459677</created>
+      <option name="number" value="00039" />
+      <option name="presentableId" value="LOCAL-00039" />
+      <option name="project" value="LOCAL" />
+      <updated>1716198459677</updated>
+    </task>
+    <task id="LOCAL-00040" summary="Adding AOCL files">
+      <option name="closed" value="true" />
+      <created>1724234752813</created>
+      <option name="number" value="00040" />
+      <option name="presentableId" value="LOCAL-00040" />
+      <option name="project" value="LOCAL" />
+      <updated>1724234752813</updated>
+    </task>
+    <option name="localTasksCounter" value="41" />
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+  <component name="Vcs.Log.Tabs.Properties">
+    <option name="TAB_STATES">
+      <map>
+        <entry key="MAIN">
+          <value>
+            <State />
+          </value>
+        </entry>
+      </map>
+    </option>
+  </component>
+  <component name="VcsManagerConfiguration">
+    <MESSAGE value="Adding sparse kernel to doGemm" />
+    <MESSAGE value="Adding matrix type enum class" />
+    <MESSAGE value="changes" />
+    <MESSAGE value="adding command line kernel selection" />
+    <MESSAGE value="Adding basic sparse multiplication kernel for default CPU and GPU" />
+    <MESSAGE value="Implementing cuSPARSE kernel" />
+    <MESSAGE value="Trying to work out CSR malloc bug" />
+    <MESSAGE value="cuSPARSE unified memory implementation" />
+    <MESSAGE value="Now compiles" />
+    <MESSAGE value="Now compiles with fewer runtime errors" />
+    <MESSAGE value="Implementing other offload types - still some runtime errors" />
+    <MESSAGE value="All implemented and running.  No checksum at the end" />
+    <MESSAGE value="All three offload types working for large problem sizes" />
+    <MESSAGE value="Removing print statements" />
+    <MESSAGE value="Superficial changes" />
+    <MESSAGE value="rebasing" />
+    <MESSAGE value="Fixing after rebase" />
+    <MESSAGE value="Tidying up spGEMM classes to remove duplicated code" />
+    <MESSAGE value="Fixing py script to accomodate new kernels" />
+    <MESSAGE value="Fixing memory bug.  Implementing --kernels flag" />
+    <MESSAGE value="Getting rid of print statements" />
+    <MESSAGE value="WIP" />
+    <MESSAGE value="Finalising" />
+    <MESSAGE value="Rebasing" />
+    <MESSAGE value="Adding AOCL files" />
+    <option name="LAST_COMMIT_MESSAGE" value="Adding AOCL files" />
+  </component>
\ No newline at end of file
diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh
index aba5814..47b0bf9 100644
--- a/ArmPL/sp_gemm.hh
+++ b/ArmPL/sp_gemm.hh
@@ -16,7 +16,7 @@ namespace cpu {
 template <typename T>
 class sp_gemm_cpu : public sp_gemm<T> {
-  using sp_gemm<T>::gemm;
+  using sp_gemm<T>::sp_gemm;
   using sp_gemm<T>::callConsume;
   using sp_gemm<T>::m_;
   using sp_gemm<T>::n_;
@@ -24,6 +24,7 @@ class sp_gemm_cpu : public sp_gemm<T> {
   using sp_gemm<T>::A_;
   using sp_gemm<T>::B_;
   using sp_gemm<T>::C_;
+  using sp_gemm<T>::nnz_;
   /** Make call to the GEMM kernel. */
@@ -52,22 +53,23 @@ class sp_gemm_cpu : public sp_gemm<T> {
-    if (std::is_same_v<T, float>) {
-      status_ = armpl_spmm_exec_s(transA,
-                                  transB,
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = armpl_spmm_exec_s(transA_,
+                                  transB_,
-                                  A_armpl_,
-                                  B_armpl,
+                                  *A_armpl_,
+                                  *B_armpl_,
-                                  C_armpl_);
+                                  *B_armpl_);
     } else if constexpr (std::is_same_v<T, double>) {
-      status_ = armpl_spmm_exec_d(transA,
-                                  transB,
+      std::cout << "About to execute dgemm" << std::endl;
+      status_ = armpl_spmm_exec_d(transA_,
+                                  transB_,
-                                  A_armpl_,
-                                  B_armpl,
+                                  *A_armpl_,
+                                  *B_armpl_,
-                                  C_armpl_);
+                                  *B_armpl_);
     } else {
       // Un-specialised class will not do any work - print error and exit.
       std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported."
@@ -85,26 +87,42 @@ class sp_gemm_cpu : public sp_gemm<T> {
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
-  void preLoopRequirements() override {}
+  void preLoopRequirements() override {
+    // Need to put A_ and B_ into A_armpl_ and B_armpl_
+    // ToDo -- Error catching
+    toCSR_armpl();
+//    std::cout << "toCSR_armpl() wrapped up without a problem" << std::endl;
+  }
   /** Perform any required steps after calling the GEMM kernel that should
    * be timed. */
   void postLoopRequirements() override {
-    status_ = armpl_spmat_destroy(A_armpl_);
-    if (status_ != ARMPL_STATUS_SUCCESS) {
-      std::cout << "ERROR " << status_ << std::endl;
-      exit(1);
-    }
-    status_ = armpl_spmat_destroy(B_armpl_);
+    status_ = armpl_spmat_destroy(*A_armpl_);
     if (status_ != ARMPL_STATUS_SUCCESS) {
       std::cout << "ERROR " << status_ << std::endl;
-    status_ = armpl_spmat_destroy(C_armpl_);
+    status_ = armpl_spmat_destroy(*B_armpl_);
     if (status_ != ARMPL_STATUS_SUCCESS) {
       std::cout << "ERROR " << status_ << std::endl;
+//    status_ = armpl_spmat_destroy(*C_armpl_);
+//    if (status_ != ARMPL_STATUS_SUCCESS) {
+//      std::cout << "ERROR " << status_ << std::endl;
+//      exit(1);
+//    }
+//    delete [] A_armpl_row_ptr_;
+//    delete [] A_armpl_col_index_;
+//    delete [] A_vals_;
+//    delete [] B_armpl_row_ptr_;
+//    delete [] B_armpl_col_index_;
+//    delete [] B_vals_;
+//    delete [] C_armpl_row_ptr_;
+//    delete [] C_armpl_col_index_;
+//    delete [] C_vals_;
   /** The constant value Alpha. */
@@ -117,8 +135,7 @@ class sp_gemm_cpu : public sp_gemm<T> {
   armpl_spmat_t armpl_A, armpl_B, armpl_C;
-  @override
-  void toCSR() {
+  void toCSR_armpl() {
     n_armpl_ = n_;
     // ToDo -- check whether flags_ is correct!
     flags_ = 0;
@@ -127,85 +144,265 @@ class sp_gemm_cpu : public sp_gemm<T> {
     A_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
     A_armpl_col_index_ = new armpl_int_t[nnz_];
     A_vals_ = new T[nnz_];
+    A_armpl_row_ptr_[0] = 0;
     int nnz_encountered = 0;
+//    std::cout << "About to load A into csr" << std::endl;
     for (int row = 0; row < n_; row++) {
-      A_armpl_row_ptr_[row] = nnz_encountered;
+//      std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl;
+      A_armpl_row_ptr_[row + 1] = nnz_encountered;
       for (int col = 0; col < n_; col++) {
         if (A_[(row * n_) + col] != 0.0) {
+//          std::cout << "\t\tCol " << col << " = " << A_[(row * n_) + col] <<
+//          std::endl;
           A_armpl_col_index_[nnz_encountered] = col;
-          A_vals_[nnz_encountered] = A_[(row * n_) + col];
+          A_vals_[nnz_encountered] = static_cast<T>(A_[(row * n_) + col]);
+//          std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl;
+//    std::cout << "___A =" << std::endl << "\t\t[";
+//    for (int i = 0; i < (n_ + 1); i++) {
+//      if (i != 0) {
+//        std::cout << ", ";
+//      }
+//      std::cout << A_armpl_row_ptr_[i];
+//    }
+//    std::cout << "]" << std::endl << "\t\t[";
+//    for (int i = 0; i < nnz_; i++) {
+//      if (i != 0) {
+//        std::cout << ", ";
+//      }
+//      std::cout << A_armpl_col_index_[i];
+//    }
+//    std::cout << "]" << std::endl << "\t\t[";
+//    for (int i = 0; i < nnz_; i++) {
+//      if (i != 0) {
+//        std::cout << ", ";
+//      }
+//      std::cout << A_vals_[i];
+//    }
+//    std::cout << "]" << std::endl;
+//    std::cout << "About to load B into csr" << std::endl;
     // Move B to CSR
     B_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
     B_armpl_col_index_ = new armpl_int_t[nnz_];
     B_vals_ = new T[nnz_];
+    B_armpl_row_ptr_[0] = 0;
     nnz_encountered = 0;
     for (int row = 0; row < n_; row++) {
-      B_armpl_row_ptr_[row] = nnz_encountered;
+//      std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered <<
+//      std::endl;
+      B_armpl_row_ptr_[row + 1] = nnz_encountered;
       for (int col = 0; col < n_; col++) {
         if (B_[(row * n_) + col] != 0.0) {
+//          std::cout << "\t\tCol " << col << " = " << B_[(row * n_) + col] << std::endl;
           B_armpl_col_index_[nnz_encountered] = col;
-          B_vals_[nnz_encountered] = B_[(row * n_) + col];
+          B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]);
+//          std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl;
+//    std::cout << "___B =" << std::endl << "\t\t[";
+//    for (int i = 0; i < (n_ + 1); i++) {
+//      if (i != 0) {
+//        std::cout << ", ";
+//      }
+//      std::cout << B_armpl_row_ptr_[i];
+//    }
+//    std::cout << "]" << std::endl << "\t\t[";
+//    for (int i = 0; i < nnz_; i++) {
+//      if (i != 0) {
+//        std::cout << ", ";
+//      }
+//      std::cout << B_armpl_col_index_[i];
+//    }
+//    std::cout << "]" << std::endl << "\t\t[";
+//    for (int i = 0; i < nnz_; i++) {
+//      if (i != 0) {
+//        std::cout << ", ";
+//      }
+//      std::cout << B_vals_[i];
+//    }
+//    std::cout << "]" << std::endl;
+//    // Move B to CSR
+//    C_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
+//    C_armpl_col_index_ = new armpl_int_t[nnz_];
+//    C_vals_ = new T[nnz_];
+//    C_armpl_row_ptr_[0] = 0;
+//    nnz_encountered = 0;
+////    std::cout << "About to load C into csr" << std::endl;
+//    for (int row = 0; row < n_; row++) {
+////      std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl;
+//      C_armpl_row_ptr_[row + 1] = nnz_encountered;
+//      for (int col = 0; col < n_; col++) {
+//        if (A_[(row * n_) + col] != 0.0) {
+//          C_armpl_col_index_[nnz_encountered] = col;
+//          C_vals_[nnz_encountered] = A_[(row * n_) + col];
+//          nnz_encountered++;
+////          std::cout << "\t\tCol " << col << " = " << C_vals_[nnz_encountered] <<
+////          std::endl;
+////          std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl;
+//        }
+//      }
+//    }
+//    std::cout << "___C =" << std::endl << "\t\t[";
+//    for (int i = 0; i < (n_ + 1); i++) {
+//      if (i != 0) {
+//        std::cout << ", ";
+//      }
+//      std::cout << C_armpl_row_ptr_[i];
+//    }
+//    std::cout << "]" << std::endl << "\t\t[";
+//    for (int i = 0; i < nnz_; i++) {
+//      if (i != 0) {
+//        std::cout << ", ";
+//      }
+//      std::cout << C_armpl_col_index_[i];
+//    }
+//    std::cout << "]" << std::endl << "\t\t[";
+//    for (int i = 0; i < nnz_; i++) {
+//      if (i != 0) {
+//        std::cout << ", ";
+//      }
+//      std::cout << C_vals_[i];
+//    }
+//    std::cout << "]" << std::endl;
+//    std::cout << "Loading csr A into armpl storage formats" << std::endl;
+    if constexpr (std::is_same_v<T, float>) {
+      std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl;
+      std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof
+      (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0];
+      for (int i = 1; i < (n_ + 1); i++) {
+        std::cout << ", " << A_armpl_row_ptr_[i];
+      }
+      std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " <<
+      sizeof(A_armpl_col_index_[0]) << ") = [" <<
+      A_armpl_col_index_[0];
+      for (int i = 1; i < nnz_; i++) {
+        std::cout << ", " << A_armpl_col_index_[i];
+      }
+      std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof
+      (A_vals_[0]) << ") = [" << A_vals_[0];
+      for (int i = 1; i < nnz_; i++) {
+        std::cout << ", " << A_vals_[i];
+      }
+      std::cout << "]" << std::endl << "flags: " << flags_ << std::endl;
-    if (std::is_sam_v<T, float>) {
       status_ = armpl_spmat_create_csr_s(A_armpl_,
-                                         flags);
+                                         flags_);
       if (status_ != ARMPL_STATUS_SUCCESS) {
         std::cout << "ERROR " << status_ << std::endl;
+//      std::cout << "Loading csr C into armpl storage formats" << std::endl;
+//      status_ = armpl_spmat_create_csr_s(C_armpl_,
+//                                         n_armpl_,
+//                                         n_armpl_,
+//                                         C_armpl_row_ptr_,
+//                                         C_armpl_col_index_,
+//                                         C_vals_,
+//                                         flags_);
+//      if (status_ != ARMPL_STATUS_SUCCESS) {
+//        std::cout << "ERROR " << status_ << std::endl;
+//        exit(1);
+//      }
+//      std::cout << "Loading csr B into armpl storage formats" << std::endl;
       status_ = armpl_spmat_create_csr_s(B_armpl_,
-                                         flags);
+                                         flags_);
       if (status_ != ARMPL_STATUS_SUCCESS) {
         std::cout << "ERROR " << status_ << std::endl;
-    } else if (std::is_same_v<T, double>) {
+    } else if constexpr (std::is_same_v<T, double>) {
+      std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl;
+      std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof
+      (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0];
+      for (int i = 1; i < (n_ + 1); i++) {
+        std::cout << ", " << A_armpl_row_ptr_[i];
+      }
+      std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " <<
+      sizeof(A_armpl_col_index_[0]) << ") = [" <<
+      A_armpl_col_index_[0];
+      for (int i = 1; i < nnz_; i++) {
+        std::cout << ", " << A_armpl_col_index_[i];
+      }
+      std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof
+      (A_vals_[0]) << ") = [" << A_vals_[0];
+      for (int i = 1; i < nnz_; i++) {
+        std::cout << ", " << A_vals_[i];
+      }
+      std::cout << "]" << std::endl << "flags: " << flags_ << std::endl;
+      std::cout << "About to create CSR A (double)" << std::endl;
       status_ = armpl_spmat_create_csr_d(A_armpl_,
-                                         flags);
+                                         flags_);
       if (status_ != ARMPL_STATUS_SUCCESS) {
         std::cout << "ERROR " << status_ << std::endl;
+//      std::cout << "Loading csr C into armpl storage formats" << std::endl;
+//      status_ = armpl_spmat_create_csr_d(C_armpl_,
+//                                         n_armpl_,
+//                                         n_armpl_,
+//                                         C_armpl_row_ptr_,
+//                                         C_armpl_col_index_,
+//                                         C_vals_,
+//                                         flags_);
+//      if (status_ != ARMPL_STATUS_SUCCESS) {
+//        std::cout << "ERROR " << status_ << std::endl;
+//        exit(1);
+//      }
+//      std::cout << "Loading csr B into armpl storage formats" << std::endl;
+      std::cout << "About to create CSR B (double)" << std::endl;
       status_ = armpl_spmat_create_csr_d(B_armpl_,
-                                         flags);
+                                         flags_);
       if (status_ != ARMPL_STATUS_SUCCESS) {
         std::cout << "ERROR " << status_ << std::endl;
+//    std::cout << "Okay, all matrices made!!" << std::endl;
   armpl_int_t flags_;
@@ -219,12 +416,16 @@ class sp_gemm_cpu : public sp_gemm<T> {
   armpl_int_t* C_armpl_row_ptr_;
   armpl_int_t* C_armpl_col_index_;
+  T* A_vals_;
+  T* B_vals_;
+  T* C_vals_;
   armpl_spmat_t* A_armpl_;
   armpl_spmat_t* B_armpl_;
   armpl_spmat_t* C_armpl_;
-  sparse_hint_value transA = ARMPL_SPARSE_OPERATION_NOTRANS;
-  sparse_hint_value transB = ARMPL_SPARSE_OPERATION_NOTRANS;
+  armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS;
+  armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS;
 }  // namespace cpu
diff --git a/DefaultCPU/sp_gemm.hh b/DefaultCPU/sp_gemm.hh
deleted file mode 100644
index d7ecb37..0000000
--- a/DefaultCPU/sp_gemm.hh
+++ /dev/null
@@ -1,55 +0,0 @@
-#pragma once
-#if defined CPU_DEFAULT
-#include "../include/kernels/CPU/sp_gemm.hh"
-#include "../include/utilities.hh"
-namespace cpu {
-/** A class for GEMM CPU BLAS kernels. */
-template <typename T>
-class sp_gemm_cpu : public sp_gemm<T> {
- public:
-  using sp_gemm<T>::sp_gemm;
-  using sp_gemm<T>::callConsume;
-  using sp_gemm<T>::m_;
-  using sp_gemm<T>::n_;
-  using sp_gemm<T>::k_;
-  using sp_gemm<T>::A_;
-  using sp_gemm<T>::B_;
-  using sp_gemm<T>::C_;
- private:
-  /** Perform the GEMM kernel. */
-  void callGemm() override {
-    /** A naive implementation of a column-major GEMM. Alpha and Beta are always
-     * 1 and 0 respectively.
-     * Operation takes the form of C[M,N] = A[M,K] * B[K,N].
-     * callConsume() is required to ensure that the compiler does not optimise
-     * away this function. */
-    int x, y, z;
-    T acc;
-    for (x = 0; x < m_; x++) {
-      for (y = 0; y < n_; y++) {
-        acc = 0.0;
-        for (z = 0; z < k_; z++) {
-          acc += A_[z * m_ + x] * B_[y * k_ + z];
-        }
-        C_[y * m_ + x] = acc;
-      }
-    }
-    // Ensure compiler doesn't optimise away the work being done
-    callConsume();
-  }
-  /** Perform any required steps before calling the GEMM kernel that should
-   * be timed. */
-  void preLoopRequirements() override {}
-  /** Perform any required steps after calling the GEMM kernel that should
-   * be timed. */
-  void postLoopRequirements() override {}
-}  // namespace cpu
diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh
deleted file mode 100644
index 2a9f478..0000000
--- a/DefaultGPU/sp_gemm.hh
+++ /dev/null
@@ -1,54 +0,0 @@
-#pragma once
-#if defined GPU_DEFAULT
-#include <cmath>
-#include "../include/kernels/GPU/sp_gemm.hh"
-#include "../include/utilities.hh"
-namespace gpu {
-/** A class for GEMM GPU BLAS kernels. */
-template <typename T>
-class sp_gemm_gpu : public sp_gemm<T> {
- public:
-  using sp_gemm<T>::sp_gemm;
-  /** Call the BLAS kernel n times, with 1 warmup run.
-   * Returns the time elapsed for n BLAS calls in seconds. */
-  time_checksum_gflop compute() {
-    // Override function in base `kernel` class as DefaultGPU should do nothing.
-    return {INFINITY, INFINITY, 0.0};
-  }
-  /** Initialise the required data structures. */
-  void initialise(gpuOffloadType offload, int n, float sparsity) override {
-    // Default GPU implementation - do nothing.
-  }
- private:
-  /** Make a call to the BLAS Library Kernel. */
-  void callGemm() override {
-    // Default GPU implementation - do nothing.
-  }
-  /** Perform any required steps before calling the GEMM kernel that should
-   * be timed. */
-  void preLoopRequirements() override {
-    // Default GPU implementation - do nothing.
-  }
-  /** Perform any required steps after calling the GEMM kernel that should
-   * be timed. */
-  void postLoopRequirements() override {
-    // Default GPU implementation - do nothing.
-  }
-  /** Do any necessary cleanup (free pointers, close library handles, etc.)
-   * after Kernel has been called. */
-  void postCallKernelCleanup() override {
-    // Default GPU implementation - do nothing.
-  }
-}  // namespace gpu
\ No newline at end of file
diff --git a/Makefile b/Makefile
index bff0add..e5091e0 100644
--- a/Makefile
+++ b/Makefile
@@ -170,7 +170,7 @@ $(warning GPU_LIB not set (use CUBLAS, ONEMKL, ROCBLAS). No GPU kernels will be
 else ifeq ($(GPU_LIB), CUBLAS)
 # Do cuBLAS stuff
-override CXXFLAGS += -cudalib=cublas
+override CXXFLAGS += -cudalib=cublas -lcusparse_static
 $(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):)
 $(info $(TAB)$(TAB)Add `CXXFLAGS=-L<NVHPC_DIR>/.../math_libs/lib64 -L<NVHPC_DIR>/.../cuda/lib64` to make command)
diff --git a/NVPL/sp_gemv.hh b/NVPL/sp_gemv.hh
deleted file mode 100644
index d04f6b8..0000000
--- a/NVPL/sp_gemv.hh
+++ /dev/null
@@ -1,117 +0,0 @@
- * ToDo -- This is all currently written for GEMM, but NVPL does not support
- * GEMM, so this needs to be adjusted to spmv -- which is supported
- */
-#pragma once
-#ifdef CPU_NVPL
-#include <nvpl_sparse.h>
-#include "../include/kernels/CPU/gemm.hh"
-#include "../include/utilities.hh"
-namespace cpu {
-/** A class for GEMM CPU BLAS kernels. */
-template <typename T>
-class sp_gemm_cpu : public sp_gemm<T> {
- public:
-  using sp_gemm<T>::gemm;
-  using sp_gemm<T>::callConsume;
-  using sp_gemm<T>::m_;
-  using sp_gemm<T>::n_;
-  using sp_gemm<T>::k_;
-  using sp_gemm<T>::A_;
-  using sp_gemm<T>::B_;
-  using sp_gemm<T>::C_;
- private:
-  /** Make call to the GEMM kernel. */
-  void callGemm() override {
-    // Ensure compiler doesn't optimise away the work being done
-    callConsume();
-  }
-  /** Perform any required steps before calling the GEMM kernel that should
-   * be timed. */
-  void preLoopRequirements() override {
-    // Set type enum
-    if constexpr (std::is_same_v<T, float>) {
-      type_ = NVPL_SPARSE_R_32F;
-    } else if constexpr (std::is_same_v<T, double>) {
-      type_ = NVPL_SPARSE_R_64F;
-    } else {
-      // Un-specialised class will not do any work - print error and exit.
-      std::cout << "ERROR - Datatype for NVPL sparse GEMM kernel not supported."
-                << std::endl;
-      exit(1);
-    }
-    status_ = nvpl_sparse_create(&handle_);
-    // Todo -- error check
-    // Todo -- Make const?
-    status_ = nvpl_sparse_create_csr(A_nvpl_, n_, n_, nnz_, A_row_ptr_nvpl_,
-                                     A_col_index_nvpl_, A_vals_nvpl_,
-                                     index_type_, index_type_, base_, type_);
-    status_ = nvpl_sparse_create_csr(B_nvpl_, n_, n_, nnz_, B_row_ptr_nvpl_,
-                                     B_col_index_nvpl_, B_vals_nvpl_,
-                                     index_type_, index_type_, base_, type_);
-    // Todo -- error check
-  }
-  /** Perform any required steps after calling the GEMM kernel that should
-   * be timed. */
-  void postLoopRequirements() override {
-    status_ = nvpl_sparse_destroy(handle_);
-    // Todo -- error check
-    status_ = nvpl_sparse_destroy_sp_mat(A_nvpl_);
-    status_ = nvpl_sparse_destroy_sp_mat(B_nvpl_);
-    status_ = nvpl_sparse_destroy_sp_mat(C_nvpl_);
-  }
-  /** The constant value Alpha. */
-  T alpha = ALPHA;
-  /** The constant value Beta. */
-  T beta = BETA;
-  /**
-   * Sparse metadata
-  */
-  nvpl_sparse_status_t status_;
-  nvpl_sparse_handle_t handle_;
-  nvpl_sparse_data_type_t type_;
-  nvpl_sparse_operation_t op_ = NVPL_SPARSE_OPERATION_NON_TRANSPOSE;
-  nvpl_sparse_index_base_t base_ = NVPL_SPARSE_INDEX_BASE_ZERO;
-  nvpl_sparse_format_t format_ = NVPL_SPARSE_FORMAT_CSR;
-  nvpl_sparse_order_t order_ = NVPL_SPARSE_ORDER_COL;
-  nvpl_sparse_index_type_t index_type_ = NVPL_SPARSE_INDEX_64I;
-  /**
-   * Sparse matrix descriptors
-  */
-  nvpl_sparse_sp_mat_descr_t* A_nvpl_;
-  nvpl_sparse_sp_mat_descr_t* B_nvpl_;
-  nvpl_sparse_sp_mat_descr_t* C_nvpl_;
-  void* A_row_ptr_nvpl_;
-  void* B_row_ptr_nvpl_;
-  void* C_row_ptr_nvpl_;
-  void* A_col_idnex_nvpl_;
-  void* B_col_idnex_nvpl_;
-  void* C_col_idnex_nvpl_;
-  void* A_vals_nvpl_;
-  void* B_vals_nvpl_;
-  void* C_vals_nvpl_;
-}  // namespace cpu
\ No newline at end of file
diff --git a/ b/
index d323162..07ac243 100644
--- a/
+++ b/
@@ -123,6 +123,11 @@
         inputTypeStr = "Square x Short-Wide (M=K=32, N)"
         for j in range(0, len(mnk)):
+    elif "_sparse_square" in gemmFilenames[i]:
+        x_name = "Value of M, N, K"
+        inputTypeStr = "Sparse square matrices"
+        for j in range(0, len(mnk)):
+            xVals.append(mnk[j][0])
         # File not supported so go to next file
diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index d849d22..b5e8d93 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -1,8 +1,7 @@
 #pragma once
 #ifdef GPU_CUBLAS
-#include "cusparse.h"
-#include <cublas_v2.h>
+#include <cusparse_v2.h>
 #include <cuda_runtime.h>
 #include <type_traits>
 #include <random>
@@ -13,13 +12,13 @@
 #include "common.hh"
 namespace gpu {
-/** A class for GEMM GPU BLAS kernels. */
+/** A class for sparse GEMM GPU BLAS kernels. */
 template <typename T>
 class sp_gemm_gpu : public sp_gemm<T> {
   using sp_gemm<T>::sp_gemm;
   using sp_gemm<T>::initInputMatricesSparse;
-  using sp_gemm<T>::toCSR;
+  using sp_gemm<T>::toCSR_int;
   using sp_gemm<T>::n_;
   using sp_gemm<T>::A_;
   using sp_gemm<T>::B_;
@@ -44,7 +43,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
       std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
-    n_ = 100 * n;
+    n_ = n;
     // Get device identifier
diff --git a/cuBLAS/sp_gemv.hh b/cuBLAS/sp_gemv.hh
new file mode 100644
index 0000000..8027746
--- /dev/null
+++ b/cuBLAS/sp_gemv.hh
@@ -0,0 +1,261 @@
+//#pragma once
+//#ifdef GPU_CUBLAS
+//#include <cusparse_v2.h>
+//#include <cuda.h>
+//#include <cublas_v2.h>
+//#include <cuda_runtime.h>
+//#include <type_traits>
+//#include <random>
+//#include <iostream>
+//#include "../include/kernels/GPU/sp_gemv.hh"
+//#include "../include/utilities.hh"
+//#include "common.hh"
+//namespace gpu {
+///** A class for sparse GEMV GPU BLAS kernels. */
+//template <typename T>
+//class gemv_gpu : public gemv<T> {
+// public:
+//  using gemv<T>::gemv;
+//  using gemv<T>::initInputMatrixVector;
+//  using gemv<T>::m_;
+//  using gemv<T>::n_;
+//  using gemv<T>::A_;
+//  using gemv<T>::x_;
+//  using gemv<T>::y_;
+//  using gemv<T>::offload_;
+//  using gemv<T>::vecIncrement_;
+//  ~gemv_gpu() {
+//    if (alreadyInitialised_) {
+//      // Destroy the handle
+//      cublasCheckError(cublasDestroy(handle_));
+//      // Destroy streams after use
+//      cudaCheckError(cudaStreamDestroy(s1_));
+//      cudaCheckError(cudaStreamDestroy(s2_));
+//      cudaCheckError(cudaStreamDestroy(s3_));
+//    }
+//  }
+//  /** Initialise the required data structures.
+//   * `offload` refers to the data offload type:
+//   *  - Once:    Move data from host to device before all iterations & move from
+//   *             device to host after all iterations
+//   *  - Always:  Move data from host to device and device to host each iteration
+//   *  - Unified: Initialise data as unified memory; no data movement semantics
+//   *             required */
+//  void initialise(gpuOffloadType offload, int m, int n) override {
+//    if (!alreadyInitialised_) {
+//      alreadyInitialised_ = true;
+//      // Perform set-up which doesn't need to happen every problem size change.
+//      // Create a handle for CUBLAS
+//      cublasCheckError(cublasCreate(&handle_));
+//      // Get device identifier
+//      cudaCheckError(cudaGetDevice(&gpuDevice_));
+//      // Initialise 3 streams to asynchronously move data between host and
+//      // device
+//      cudaCheckError(cudaStreamCreate(&s1_));
+//      cudaCheckError(cudaStreamCreate(&s2_));
+//      cudaCheckError(cudaStreamCreate(&s3_));
+//    }
+//    offload_ = offload;
+//    m_ = m;
+//    n_ = n;
+//    if (offload_ == gpuOffloadType::unified) {
+//      cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * n_));
+//      cudaCheckError(cudaMallocManaged(&x_, sizeof(T) * n_));
+//      cudaCheckError(cudaMallocManaged(&y_, sizeof(T) * m_));
+//    } else {
+//      // Allocate matrices on host
+//      cudaCheckError(cudaMallocHost((void**)&A_, sizeof(T) * m_ * n_));
+//      cudaCheckError(cudaMallocHost((void**)&x_, sizeof(T) * n_));
+//      cudaCheckError(cudaMallocHost((void**)&y_, sizeof(T) * m_));
+//      // Allocate matrices on device
+//      cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * n_));
+//      cudaCheckError(cudaMalloc((void**)&x_device_, sizeof(T) * n_));
+//      cudaCheckError(cudaMalloc((void**)&y_device_, sizeof(T) * m_));
+//    }
+//    // Initialise the host data structures
+//    initInputMatrixVector();
+//  }
+// private:
+//  /** Perform any required steps before calling the GEMV kernel that should
+//   * be timed. */
+//  void preLoopRequirements() override {
+//    switch (offload_) {
+//      case gpuOffloadType::always: {
+//        // Offload data each iteration - no requirements
+//        break;
+//      }
+//      case gpuOffloadType::once: {
+//        // Offload input data from host to the device.
+//        cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_,
+//                                       cudaMemcpyHostToDevice, s1_));
+//        cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_,
+//                                       cudaMemcpyHostToDevice, s2_));
+//        cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_,
+//                                       cudaMemcpyHostToDevice, s3_));
+//        break;
+//      }
+//      case gpuOffloadType::unified: {
+//        // Prefetch input data to device
+//        cudaCheckError(
+//            cudaMemPrefetchAsync(A_, sizeof(T) * m_ * n_, gpuDevice_, s1_));
+//        cudaCheckError(
+//            cudaMemPrefetchAsync(x_, sizeof(T) * n_, gpuDevice_, s2_));
+//        cudaCheckError(
+//            cudaMemPrefetchAsync(y_, sizeof(T) * m_, gpuDevice_, s3_));
+//        break;
+//      }
+//    }
+//  }
+//  /** Make a call to the BLAS Library Kernel. */
+//  void callGemv() override {
+//    switch (offload_) {
+//      case gpuOffloadType::always: {
+//        // Offload input data from host to the device.
+//        cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_,
+//                                       cudaMemcpyHostToDevice, s1_));
+//        cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_,
+//                                       cudaMemcpyHostToDevice, s2_));
+//        cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_,
+//                                       cudaMemcpyHostToDevice, s3_));
+//        // Call cuBLAS GEMV kernel
+//        if constexpr (std::is_same_v<T, float>) {
+//          cublasCheckError(cublasSgemv(
+//              handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_),
+//              x_device_, vecIncrement_, &beta, y_device_, vecIncrement_));
+//        } else if constexpr (std::is_same_v<T, double>) {
+//          cublasCheckError(cublasDgemv(
+//              handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_),
+//              x_device_, vecIncrement_, &beta, y_device_, vecIncrement_));
+//        }
+//        // Offload output data from device to host
+//        cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_,
+//                                       cudaMemcpyDeviceToHost, s3_));
+//        // Ensure device has finished all work.
+//        cudaCheckError(cudaDeviceSynchronize());
+//        break;
+//      }
+//      case gpuOffloadType::once: {
+//        // Call cuBLAS GEMV kernel
+//        if constexpr (std::is_same_v<T, float>) {
+//          cublasCheckError(cublasSgemv(
+//              handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_),
+//              x_device_, vecIncrement_, &beta, y_device_, vecIncrement_));
+//        } else if constexpr (std::is_same_v<T, double>) {
+//          cublasCheckError(cublasDgemv(
+//              handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_),
+//              x_device_, vecIncrement_, &beta, y_device_, vecIncrement_));
+//        }
+//        break;
+//      }
+//      case gpuOffloadType::unified: {
+//        // Call cuBLAS GEMV kernel
+//        if constexpr (std::is_same_v<T, float>) {
+//          cublasCheckError(cublasSgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_,
+//                                       std::max(1, m_), x_, vecIncrement_,
+//                                       &beta, y_, vecIncrement_));
+//        } else if constexpr (std::is_same_v<T, double>) {
+//          cublasCheckError(cublasDgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_,
+//                                       std::max(1, m_), x_, vecIncrement_,
+//                                       &beta, y_, vecIncrement_));
+//        }
+//        break;
+//      }
+//    }
+//  }
+//  /** Perform any required steps after calling the GEMV kernel that should
+//   * be timed. */
+//  void postLoopRequirements() override {
+//    switch (offload_) {
+//      case gpuOffloadType::always: {
+//        // Offload data each iteration - no requirements
+//        break;
+//      }
+//      case gpuOffloadType::once: {
+//        // Offload output data from device to host
+//        cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_,
+//                                       cudaMemcpyDeviceToHost, s3_));
+//        // Ensure device has finished all work.
+//        cudaCheckError(cudaDeviceSynchronize());
+//        break;
+//      }
+//      case gpuOffloadType::unified: {
+//        // Ensure all output data resides on host once work has completed
+//        cudaCheckError(
+//            cudaMemPrefetchAsync(y_, sizeof(T) * m_, cudaCpuDeviceId, s3_));
+//        // Ensure device has finished all work.
+//        cudaCheckError(cudaDeviceSynchronize());
+//        break;
+//      }
+//    }
+//  }
+//  /** Do any necessary cleanup (free pointers, close library handles, etc.)
+//   * after Kernel has been called. */
+//  void postCallKernelCleanup() override {
+//    if (offload_ == gpuOffloadType::unified) {
+//      cudaFree(A_);
+//      cudaFree(x_);
+//      cudaFree(y_);
+//    } else {
+//      // Free the memory held on host and device
+//      cudaFreeHost((void*)A_);
+//      cudaFreeHost((void*)x_);
+//      cudaFreeHost((void*)y_);
+//      cudaFree(A_device_);
+//      cudaFree(x_device_);
+//      cudaFree(y_device_);
+//    }
+//  }
+//  /** Whether the initialise function has been called before. */
+//  bool alreadyInitialised_ = false;
+//  /** Handle used when calling cuBLAS. */
+//  cublasHandle_t handle_;
+//  /** CUDA Stream 1 - used to asynchronously move data between host and device.
+//   */
+//  cudaStream_t s1_;
+//  /** CUDA Stream 2 - used to asynchronously move data between host and device.
+//   */
+//  cudaStream_t s2_;
+//  /** CUDA Stream 3 - used to asynchronously move data between host and device.
+//   */
+//  cudaStream_t s3_;
+//  /** The ID of the target GPU Device. */
+//  int gpuDevice_;
+//  /** Input matrix A, held on the device. */
+//  T* A_device_;
+//  /** Input vector x, held on the device. */
+//  T* x_device_;
+//  /** Input vector y, held on the device. */
+//  T* y_device_;
+//  /** The constant value Alpha. */
+//  const T alpha = ALPHA;
+//  /** The constant value Beta. */
+//  const T beta = BETA;
+//}  // namespace gpu
\ No newline at end of file
diff --git a/include/.DS_Store b/include/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..869e02c3a673dee3916dd63df65263ee873d8adc
GIT binary patch
literal 6148

literal 0

diff --git a/include/doGemm.hh b/include/doGemm.hh
index e264273..a33ef7e 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -8,6 +8,7 @@
 #if defined CPU_ARMPL
 #include "../ArmPL/gemm.hh"
+#include "../ArmPL/sp_gemm.hh"
 #elif defined CPU_ONEMKL
 #include "../oneMKL/CPU/gemm.hh"
 #elif defined CPU_AOCL
@@ -62,7 +63,9 @@ class doGemm {
   /** Run all problem types and write data to CSV files. */
   void collectData() {
-    if (doDense_) {
+    // ToDo -- `false` is hard-coded here (instead of doDense_) because kernel
+    //  selection was not working.  Needs to be fixed
+    if (false) {
       // Square Problem Sizes...
       // Re-initialise offload threshold structures
       cpuGpu_always_ = cpuGpu_offloadThreshold();
@@ -299,7 +302,7 @@ class doGemm {
-    if (doSparse_) {    // Square sparse matrix - sparse matrix multiplication
+    if (true) {    // Square sparse matrix - sparse matrix multiplication
       cpuGpu_always_ = cpuGpu_offloadThreshold();
       cpuGpu_once_ = cpuGpu_offloadThreshold();
       cpuGpu_unified_ = cpuGpu_offloadThreshold();
@@ -307,7 +310,7 @@ class doGemm {
               getKernelName() + "_sparse_square.csv");
       if (upperLimit_ >= 32) {
         for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-          callSparseKernels(csvFile, dim, 0.9999);
+          callSparseKernels(csvFile, dim, 0.99);
       // Close file
@@ -524,8 +527,12 @@ class doGemm {
     if (doCPU_) {
+//      std::cout << "about to initialise matrices with size = " << N <<
+//      std::endl;
       spGemmCpu_.initialise(N, sparsity);
+//      std::cout << "about to run spGEMM" << std::endl;
       time_checksum_gflop cpuResult = spGemmCpu_.compute();
+//      std::cout << "about to calculate flops" << std::endl;
       cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
 		writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
 		               cpuResult.runtime, cpuResult.gflops);
@@ -536,31 +543,38 @@ class doGemm {
     // - UNIFIED : data passed from host to device (and device to host) as
     //             needed
     if (doGPU_) {
-    spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
-    time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
-    gpuResult_unified.gflops =
-    calcGflops(flops, iterations_, gpuResult_unified.runtime);
+      std::cout << "Starting with matrix of size " << N << std::endl;
+      std::cout << "\t\tUnified";
+      spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
+      std::cout << "\tInitialised" << std::endl;
+      time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
+      gpuResult_unified.gflops =
+      calcGflops(flops, iterations_, gpuResult_unified.runtime);
     // - ALWAYS: Offload to/from GPU every iteration
-    spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity);
-    time_checksum_gflop gpuResult_always = spGemmGpu_.compute();
-    gpuResult_always.gflops =
+      std::cout << "\t\tAlways";
+      spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity);
+      std::cout << "\tInitialised" << std::endl;
+      time_checksum_gflop gpuResult_always = spGemmGpu_.compute();
+      gpuResult_always.gflops =
             calcGflops(flops, iterations_, gpuResult_always.runtime);
 		// - ONCE : Offload to/from GPU once before all iterations and once
 		// after
-		spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity);
-		time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
-		gpuResult_once.gflops =
+      std::cout << "\t\tOnce";
+      spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity);
+      std::cout << "\tInitialised" << std::endl;
+		  time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
+		  gpuResult_once.gflops =
 						calcGflops(flops, iterations_, gpuResult_once.runtime);
 		// ToDo -- non-default GPU operations
 		// Write lines to CSV file
-		writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
+		  writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
 		               iterations_, gpuResult_once.runtime, gpuResult_once.gflops);
-		writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
+		  writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
 		               iterations_, gpuResult_always.runtime,
-		writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
+		  writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
 		               iterations_, gpuResult_unified.runtime,
diff --git a/include/kernels/.DS_Store b/include/kernels/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..9cc84b2a4ce0fb9e6849637c24a43195d7749e28
GIT binary patch
literal 6148

literal 0

diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
index 72fd5dc..dfab687 100644
--- a/include/kernels/CPU/sp_gemm.hh
+++ b/include/kernels/CPU/sp_gemm.hh
@@ -4,6 +4,7 @@
 #include <random>
 #include <memory>
+#include <iostream>
 namespace cpu {
@@ -11,10 +12,11 @@ namespace cpu {
 		template <typename T>
 		class sp_gemm : public ::gemm<T> {
-				using ::gemm<T>::gemm;
+        using ::gemm<T>::gemm;
         using ::gemm<T>::initInputMatricesSparse;
-        using ::gemm<T>::toCSR;
-				using ::gemm<T>::m_;
+        using ::gemm<T>::toCSR_int;
+				using ::gemm<T>::iterations_;
+        using ::gemm<T>::m_;
 				using ::gemm<T>::n_;
 				using ::gemm<T>::k_;
 				using ::gemm<T>::A_;
@@ -30,7 +32,8 @@ namespace cpu {
         // Note that the below should be the same as the edges calculation
         // used in the initInputMatricesSparse function.  If changed here,
         // change there
-        nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity_));
+        nnz_ = 1 + (int) ((double)n_ * (double)n_ * (1.0 - sparsity_));
+//        std::cout << "nnz_ = " << nnz_ << std::endl;
 				A_ = (T*)malloc(sizeof(T) * n_ * n_);
 				B_ = (T*)malloc(sizeof(T) * n_ * n_);
@@ -38,10 +41,12 @@ namespace cpu {
-        toCSR();
+        toCSR_int();
-			private:
+      int nnz_;
+    private:
 				/** Do any necessary cleanup (free pointers, close library handles, etc.)
 				 * after Kernel has been called. */
       void postCallKernelCleanup() {
@@ -50,7 +55,7 @@ namespace cpu {
-      void toCSR() {
+      void toCSR_int() {
         // Move A to CSR
         A_row_ptr_ = new int[n_ + 1];
         A_col_index_ = new int[nnz_];
@@ -86,8 +91,6 @@ namespace cpu {
       double sparsity_;
-      int nnz_;
       int* A_row_ptr_;
       int* A_col_index_;
       int* B_row_ptr_;
@@ -96,7 +99,7 @@ namespace cpu {
       int* C_col_index_;
       T* A_vals_;
       T* B_vals_;
-      T* C_vals;
+      T* C_vals_;
 }  // namespace cpu
diff --git a/include/kernels/CPU/sp_gemv.hh b/include/kernels/CPU/sp_gemv.hh
new file mode 100644
index 0000000..0c84cb0
--- /dev/null
+++ b/include/kernels/CPU/sp_gemv.hh
@@ -0,0 +1,47 @@
+#pragma once
+#include "../gemv.hh"
+#include <random>
+#include <memory>
+namespace cpu {
+/** A class for sparse GEMV CPU BLAS kernels, derived from ::gemv<T>.
+ * It owns the host buffers A_, x_ and y_: `initialise` allocates them with
+ * malloc and `postCallKernelCleanup` releases them with free. */
+    template <typename T>
+    class sp_gemv : public ::gemv<T> {
+    public:
+        using ::gemv<T>::gemv;
+        using ::gemv<T>::initInputMatrixVectorSparse;
+        using ::gemv<T>::m_;
+        using ::gemv<T>::n_;
+        using ::gemv<T>::A_;
+        using ::gemv<T>::x_;
+        using ::gemv<T>::y_;
+        using ::gemv<T>::sparsity_;
+    public:
+        /** Initialise the required data structures.
+         *  - n        : used for both dimensions (m_ and n_ are both set to n),
+         *               so A_ holds n * n elements, x_ and y_ hold n each.
+         *  - sparsity : stored in sparsity_ and read by
+         *               initInputMatrixVectorSparse() when filling A_.
+         * NOTE(review): the malloc results are not checked for nullptr. */
+        void initialise(int n, double sparsity) {
+          m_ = n;
+          n_ = n;
+          sparsity_ = sparsity;
+          // sizeof(T) is a size_t, so the products below are evaluated in
+          // size_t rather than int.
+          A_ = (T*)malloc(sizeof(T) * m_ * n_);
+          x_ = (T*)malloc(sizeof(T) * n_);
+          y_ = (T*)malloc(sizeof(T) * m_);
+          // Initialise the matrix and vectors
+          initInputMatrixVectorSparse();
+        }
+    private:
+        /** Do any necessary cleanup (free pointers, close library handles, etc.)
+         * after Kernel has been called.  Releases the three buffers allocated
+         * in initialise(). */
+        void postCallKernelCleanup() override {
+          free(A_);
+          free(x_);
+          free(y_);
+        }
+    };
+}  // namespace cpu
\ No newline at end of file
diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh
index dbfba87..52a5494 100644
--- a/include/kernels/GPU/sp_gemm.hh
+++ b/include/kernels/GPU/sp_gemm.hh
@@ -17,7 +17,8 @@ namespace gpu {
 				 *  - Always:  Move data from host to device and device to host each iteration
 				 *  - Unified: Initialise data as unified memory; no data movement semantics
 				 *             required */
-				virtual void initialise(gpuOffloadType offload, int n, float sparsity) = 0;
+				virtual void initialise(gpuOffloadType offload, int n, float sparsity)
+        = 0;
 				/** Whether data should be offloaded to/from the GPU each iteration, or just
diff --git a/include/kernels/GPU/sp_gemv.hh b/include/kernels/GPU/sp_gemv.hh
new file mode 100644
index 0000000..75fd126
--- /dev/null
+++ b/include/kernels/GPU/sp_gemv.hh
@@ -0,0 +1,28 @@
+#pragma once
+#include "../gemv.hh"
+namespace gpu {
+/** An abstract class for sparse GEMV GPU BLAS kernels.  Concrete GPU
+ * back-ends must implement `initialise`. */
+    template <typename T>
+    class sp_gemv : public ::gemv<T> {
+    public:
+        using ::gemv<T>::gemv;
+        /** Initialise the required data structures.
+         * `offload` refers to the data offload type:
+         *  - Once:    Move data from host to device before all iterations & move from
+         *             device to host after all iterations
+         *  - Always:  Move data from host to device and device to host each iteration
+         *  - Unified: Initialise data as unified memory; no data movement semantics
+         *             required
+         * `n` and `sparsity` presumably mirror the CPU sparse GEMV initialise
+         * (square dimension and fraction of zero entries) -- TODO confirm.
+         * NOTE(review): `sparsity` is a float here but a double in
+         * cpu::sp_gemv::initialise -- confirm this difference is intended. */
+        virtual void initialise(gpuOffloadType offload, int n, float sparsity)
+        = 0;
+    protected:
+        /** Whether data should be offloaded to/from the GPU each iteration, or just
+         * before & after.  Defaults to gpuOffloadType::always. */
+        gpuOffloadType offload_ = gpuOffloadType::always;
+    };
+}  // namespace gpu
\ No newline at end of file
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index d357734..6d75554 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -9,6 +9,7 @@
 #include <cmath>
 #include <limits>
 #include <random>
+#include <iostream>
 #include "../utilities.hh"
@@ -27,10 +28,13 @@ class gemm {
     // Perform all GEMM calls
+//    std::cout << "about to do pre-loop requirements" << std::endl;
     for (int i = 0; i < iterations_; i++) {
+//      std::cout << "entering loop " << i << std::endl;
+//    std::cout << "about to do post-loop requirements" << std::endl;
     // Stop Timer
diff --git a/include/kernels/gemv.hh b/include/kernels/gemv.hh
index ba12d02..665fe59 100644
--- a/include/kernels/gemv.hh
+++ b/include/kernels/gemv.hh
@@ -4,6 +4,7 @@
 #include <chrono>
 #include <cmath>
 #include <limits>
+#include <random>
 #include "../utilities.hh"
@@ -82,6 +83,82 @@ class gemv {
+  /** Initialise A_ as a sparse n_ x n_ matrix (zero-filled, then populated one
+   * entry at a time via rMat), x_ with random values and y_ with zeros.
+   * The number of entries placed is derived from sparsity_. */
+  void initInputMatrixVectorSparse() {
+    // Element count is computed in 64 bits: n_ * n_ overflows int once
+    // n_ exceeds 46340.
+    const uint64_t cells = (n_ > 0) ? ((uint64_t)n_ * (uint64_t)n_) : 0;
+    // Initialise sparse matrix
+    for (uint64_t i = 0; i < cells; i++) {
+      A_[i] = 0.0;
+    }
+    // Random number generator objects used while recursing in rMat
+    std::default_random_engine gen;
+    gen.seed(std::chrono::system_clock::now()
+                     .time_since_epoch().count());
+    std::uniform_real_distribution<double> dist(0.0, 1.0);
+    // Number of entries to place: 1 + n_^2 * (1 - sparsity_), evaluated in
+    // floating point so the product cannot overflow an int.
+    const double nonZeros = (double)n_ * (double)n_ * (1.0 - sparsity_);
+    uint64_t edges = 1 + ((nonZeros > 0.0) ? (uint64_t)nonZeros : 0);
+    // Never ask for more entries than there are cells (e.g. sparsity_ == 0.0
+    // would request n_^2 + 1), otherwise the retry loop below may never end.
+    if (edges > cells) {
+      edges = cells;
+    }
+    // Using a=0.45 and b=c=0.22 as default probabilities
+    for (uint64_t i = 0; i < edges; i++) {
+      // rMat returns false when it lands on an occupied cell; retry until an
+      // entry has actually been written.
+      while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist,
+                   false)) {}
+    }
+    // Initialise the input and output vectors
+    for (int y = 0; y < n_; y++) {
+      x_[y] = (T)((double)(rand() % 100) / 3.0);
+    }
+    for (int y = 0; y < m_; y++) {
+      y_[y] = (T)0.0;
+    }
+  }
+  /** Recursive function to populate sparse matrices.
+   * Picks one cell of the n x n row-major matrix M by repeatedly choosing a
+   * quadrant of the sub-matrix [x1, x2] x [y1, y2], then writes a value there.
+   *  - a, b, c  : probabilities of recursing into the low-x/low-y,
+   *               high-x/low-y and low-x/high-y quadrants respectively; the
+   *               remaining quadrant is taken otherwise.
+   *  - gen, dist: random source used to choose the quadrant.
+   *  - bin      : write 1.0 when true, otherwise a random value in
+   *               [-50.0, 49.99].
+   * Returns true if a value was written and false if the chosen cell was
+   * already occupied. */
+  bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b,
+            float c, std::default_random_engine* gen,
+            std::uniform_real_distribution<double> dist, bool bin) {
+    // If a 1x1 submatrix, then add an edge and return out
+    if (x1 >= x2 && y1 >= y2) {
+      // Needed to avoid overflow segfaults with large problem sizes
+      uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1);
+      // std::abs guarantees the floating-point overload; an unqualified abs
+      // can resolve to abs(int) and truncate magnitudes below 1.0 to zero.
+      // NOTE(review): a generated value with magnitude <= 0.1 (including
+      // exactly 0.0) is still treated as an empty cell by this check.
+      if (std::abs(M[index]) > 0.1) {
+        return false;
+      }
+      // Add 1.0 if this is a binary graph, and a random real number otherwise
+      M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0);
+      return true;
+    }
+    // Divide up the matrix (integer division already rounds down here, as
+    // x2 >= x1 and y2 >= y1 on every call)
+    int xMidPoint = x1 + ((x2 - x1) / 2);
+    int yMidPoint = y1 + ((y2 - y1) / 2);
+    // ToDo -- add some noise to these values between iterations
+    float newA = a;
+    float newB = b;
+    float newC = c;
+    // Work out which quarter to recurse into
+    // There are some ugly ternary operators here to avoid going out of bounds in the edge case
+    // that we are already at 1 width or 1 height
+    float randomNum = dist(*gen);
+    if (randomNum < a) {
+      return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
+                  newA, newB, newC, gen, dist, bin);
+    } else if (randomNum < (a + b)) {
+      return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
+                  newA, newB, newC, gen, dist, bin);
+    } else if (randomNum < (a + b + c)) {
+      return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
+                  newA, newB, newC, gen, dist, bin);
+    } else {
+      return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
+                  ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC,
+                  gen, dist, bin);
+    }
+  }
   /** Call the extern consume() function. */
   void callConsume() { consume((void*)A_, (void*)x_, (void*)y_); }
@@ -105,4 +182,6 @@ class gemv {
   /** The distance between two vector elements. */
   const int vecIncrement_ = 1;
+  double sparsity_ = 0.0;

From a8e5c4690238832761286e2cde7ab7f2170acf26 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Fri, 11 Oct 2024 15:53:08 +0100
Subject: [PATCH 25/38] Adding AOCL files

 .idea/workspace.xml            |   6 +-
 ArmPL/sp_gemm.hh               | 266 +++++++--------------------------          |   2 +-
 cuBLAS/common.hh               |   2 +-
 include/doGemm.hh              |  11 --
 include/kernels/CPU/sp_gemm.hh |  10 +-
 include/kernels/gemm.hh        |   3 -
 src/                    |  24 +--
 8 files changed, 80 insertions(+), 244 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index b954508..e9a4d65 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -125,9 +125,9 @@
-      <item itemvalue="C/C++" />
       <item itemvalue="Native Application.all" />
       <item itemvalue="Native Application.gpu-blob" />
+      <item itemvalue="C/C++" />
@@ -171,7 +171,9 @@
       <workItem from="1723101242209" duration="21225000" />
       <workItem from="1724244974273" duration="40294000" />
       <workItem from="1726568120590" duration="8508000" />
-      <workItem from="1726828018604" duration="38592000" />
+      <workItem from="1726828018604" duration="52619000" />
+      <workItem from="1727941759103" duration="43000" />
+      <workItem from="1727941814674" duration="165000" />
     <task id="LOCAL-00001" summary="trivial changes">
       <option name="closed" value="true" />
diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh
index 47b0bf9..cb6b443 100644
--- a/ArmPL/sp_gemm.hh
+++ b/ArmPL/sp_gemm.hh
@@ -25,6 +25,9 @@ class sp_gemm_cpu : public sp_gemm<T> {
   using sp_gemm<T>::B_;
   using sp_gemm<T>::C_;
   using sp_gemm<T>::nnz_;
+  using sp_gemm<T>::A_vals_;
+  using sp_gemm<T>::B_vals_;
+  using sp_gemm<T>::C_vals_;
   /** Make call to the GEMM kernel. */
@@ -57,19 +60,18 @@ class sp_gemm_cpu : public sp_gemm<T> {
       status_ = armpl_spmm_exec_s(transA_,
-                                  *A_armpl_,
-                                  *B_armpl_,
+                                  A_armpl_,
+                                  B_armpl_,
-                                  *B_armpl_);
+                                  B_armpl_);
     } else if constexpr (std::is_same_v<T, double>) {
-      std::cout << "About to execute dgemm" << std::endl;
       status_ = armpl_spmm_exec_d(transA_,
-                                  *A_armpl_,
-                                  *B_armpl_,
+                                  A_armpl_,
+                                  B_armpl_,
-                                  *B_armpl_);
+                                  B_armpl_);
     } else {
       // Un-specialised class will not do any work - print error and exit.
       std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported."
@@ -89,20 +91,18 @@ class sp_gemm_cpu : public sp_gemm<T> {
    * be timed. */
   void preLoopRequirements() override {
     // Need to put A_ and B_ into A_armpl_ and B_armpl_
-    // ToDo -- Error catching
-//    std::cout << "toCSR_armpl() wrapped up without a problem" << std::endl;
   /** Perform any required steps after calling the GEMM kernel that should
    * be timed. */
   void postLoopRequirements() override {
-    status_ = armpl_spmat_destroy(*A_armpl_);
+    status_ = armpl_spmat_destroy(A_armpl_);
     if (status_ != ARMPL_STATUS_SUCCESS) {
       std::cout << "ERROR " << status_ << std::endl;
-    status_ = armpl_spmat_destroy(*B_armpl_);
+    status_ = armpl_spmat_destroy(B_armpl_);
     if (status_ != ARMPL_STATUS_SUCCESS) {
       std::cout << "ERROR " << status_ << std::endl;
@@ -113,12 +113,12 @@ class sp_gemm_cpu : public sp_gemm<T> {
 //      exit(1);
 //    }
-//    delete [] A_armpl_row_ptr_;
-//    delete [] A_armpl_col_index_;
-//    delete [] A_vals_;
-//    delete [] B_armpl_row_ptr_;
-//    delete [] B_armpl_col_index_;
-//    delete [] B_vals_;
+    delete [] A_armpl_row_ptr_;
+    delete [] A_armpl_col_index_;
+    delete [] A_vals_;
+    delete [] B_armpl_row_ptr_;
+    delete [] B_armpl_col_index_;
+    delete [] B_vals_;
 //    delete [] C_armpl_row_ptr_;
 //    delete [] C_armpl_col_index_;
 //    delete [] C_vals_;
@@ -131,10 +131,6 @@ class sp_gemm_cpu : public sp_gemm<T> {
   /** The constant value Beta. */
   const T beta = BETA;
-  armpl_status_t status_;
-  armpl_spmat_t armpl_A, armpl_B, armpl_C;
   void toCSR_armpl() {
     n_armpl_ = n_;
     // ToDo -- check whether flags_ is correct!
@@ -145,50 +141,19 @@ class sp_gemm_cpu : public sp_gemm<T> {
     A_armpl_col_index_ = new armpl_int_t[nnz_];
     A_vals_ = new T[nnz_];
     A_armpl_row_ptr_[0] = 0;
     int nnz_encountered = 0;
-//    std::cout << "About to load A into csr" << std::endl;
     for (int row = 0; row < n_; row++) {
-//      std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl;
       A_armpl_row_ptr_[row + 1] = nnz_encountered;
       for (int col = 0; col < n_; col++) {
         if (A_[(row * n_) + col] != 0.0) {
-//          std::cout << "\t\tCol " << col << " = " << A_[(row * n_) + col] <<
-//          std::endl;
           A_armpl_col_index_[nnz_encountered] = col;
           A_vals_[nnz_encountered] = static_cast<T>(A_[(row * n_) + col]);
-//          std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl;
-//    std::cout << "___A =" << std::endl << "\t\t[";
-//    for (int i = 0; i < (n_ + 1); i++) {
-//      if (i != 0) {
-//        std::cout << ", ";
-//      }
-//      std::cout << A_armpl_row_ptr_[i];
-//    }
-//    std::cout << "]" << std::endl << "\t\t[";
-//    for (int i = 0; i < nnz_; i++) {
-//      if (i != 0) {
-//        std::cout << ", ";
-//      }
-//      std::cout << A_armpl_col_index_[i];
-//    }
-//    std::cout << "]" << std::endl << "\t\t[";
-//    for (int i = 0; i < nnz_; i++) {
-//      if (i != 0) {
-//        std::cout << ", ";
-//      }
-//      std::cout << A_vals_[i];
-//    }
-//    std::cout << "]" << std::endl;
-//    std::cout << "About to load B into csr" << std::endl;
     // Move B to CSR
     B_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
     B_armpl_col_index_ = new armpl_int_t[nnz_];
@@ -197,113 +162,20 @@ class sp_gemm_cpu : public sp_gemm<T> {
     nnz_encountered = 0;
     for (int row = 0; row < n_; row++) {
-//      std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered <<
-//      std::endl;
       B_armpl_row_ptr_[row + 1] = nnz_encountered;
       for (int col = 0; col < n_; col++) {
         if (B_[(row * n_) + col] != 0.0) {
-//          std::cout << "\t\tCol " << col << " = " << B_[(row * n_) + col] << std::endl;
           B_armpl_col_index_[nnz_encountered] = col;
           B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]);
-//          std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl;
-//    std::cout << "___B =" << std::endl << "\t\t[";
-//    for (int i = 0; i < (n_ + 1); i++) {
-//      if (i != 0) {
-//        std::cout << ", ";
-//      }
-//      std::cout << B_armpl_row_ptr_[i];
-//    }
-//    std::cout << "]" << std::endl << "\t\t[";
-//    for (int i = 0; i < nnz_; i++) {
-//      if (i != 0) {
-//        std::cout << ", ";
-//      }
-//      std::cout << B_armpl_col_index_[i];
-//    }
-//    std::cout << "]" << std::endl << "\t\t[";
-//    for (int i = 0; i < nnz_; i++) {
-//      if (i != 0) {
-//        std::cout << ", ";
-//      }
-//      std::cout << B_vals_[i];
-//    }
-//    std::cout << "]" << std::endl;
-//    // Move B to CSR
-//    C_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
-//    C_armpl_col_index_ = new armpl_int_t[nnz_];
-//    C_vals_ = new T[nnz_];
-//    C_armpl_row_ptr_[0] = 0;
-//    nnz_encountered = 0;
-////    std::cout << "About to load C into csr" << std::endl;
-//    for (int row = 0; row < n_; row++) {
-////      std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl;
-//      C_armpl_row_ptr_[row + 1] = nnz_encountered;
-//      for (int col = 0; col < n_; col++) {
-//        if (A_[(row * n_) + col] != 0.0) {
-//          C_armpl_col_index_[nnz_encountered] = col;
-//          C_vals_[nnz_encountered] = A_[(row * n_) + col];
-//          nnz_encountered++;
-////          std::cout << "\t\tCol " << col << " = " << C_vals_[nnz_encountered] <<
-////          std::endl;
-////          std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl;
-//        }
-//      }
-//    }
-//    std::cout << "___C =" << std::endl << "\t\t[";
-//    for (int i = 0; i < (n_ + 1); i++) {
-//      if (i != 0) {
-//        std::cout << ", ";
-//      }
-//      std::cout << C_armpl_row_ptr_[i];
-//    }
-//    std::cout << "]" << std::endl << "\t\t[";
-//    for (int i = 0; i < nnz_; i++) {
-//      if (i != 0) {
-//        std::cout << ", ";
-//      }
-//      std::cout << C_armpl_col_index_[i];
-//    }
-//    std::cout << "]" << std::endl << "\t\t[";
-//    for (int i = 0; i < nnz_; i++) {
-//      if (i != 0) {
-//        std::cout << ", ";
-//      }
-//      std::cout << C_vals_[i];
-//    }
-//    std::cout << "]" << std::endl;
-//    std::cout << "Loading csr A into armpl storage formats" << std::endl;
     if constexpr (std::is_same_v<T, float>) {
-      std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl;
-      std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof
-      (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0];
-      for (int i = 1; i < (n_ + 1); i++) {
-        std::cout << ", " << A_armpl_row_ptr_[i];
-      }
-      std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " <<
-      sizeof(A_armpl_col_index_[0]) << ") = [" <<
-      A_armpl_col_index_[0];
-      for (int i = 1; i < nnz_; i++) {
-        std::cout << ", " << A_armpl_col_index_[i];
-      }
-      std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof
-      (A_vals_[0]) << ") = [" << A_vals_[0];
-      for (int i = 1; i < nnz_; i++) {
-        std::cout << ", " << A_vals_[i];
-      }
-      std::cout << "]" << std::endl << "flags: " << flags_ << std::endl;
-      status_ = armpl_spmat_create_csr_s(A_armpl_,
+//      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
+//                nnz_, flags_);
+      status_ = armpl_spmat_create_csr_s(&A_armpl_,
@@ -315,21 +187,9 @@ class sp_gemm_cpu : public sp_gemm<T> {
-//      std::cout << "Loading csr C into armpl storage formats" << std::endl;
-//      status_ = armpl_spmat_create_csr_s(C_armpl_,
-//                                         n_armpl_,
-//                                         n_armpl_,
-//                                         C_armpl_row_ptr_,
-//                                         C_armpl_col_index_,
-//                                         C_vals_,
-//                                         flags_);
-//      if (status_ != ARMPL_STATUS_SUCCESS) {
-//        std::cout << "ERROR " << status_ << std::endl;
-//        exit(1);
-//      }
-//      std::cout << "Loading csr B into armpl storage formats" << std::endl;
-      status_ = armpl_spmat_create_csr_s(B_armpl_,
+//      printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_,
+//                nnz_, flags_);
+      status_ = armpl_spmat_create_csr_s(&B_armpl_,
@@ -341,28 +201,9 @@ class sp_gemm_cpu : public sp_gemm<T> {
     } else if constexpr (std::is_same_v<T, double>) {
-      std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl;
-      std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof
-      (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0];
-      for (int i = 1; i < (n_ + 1); i++) {
-        std::cout << ", " << A_armpl_row_ptr_[i];
-      }
-      std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " <<
-      sizeof(A_armpl_col_index_[0]) << ") = [" <<
-      A_armpl_col_index_[0];
-      for (int i = 1; i < nnz_; i++) {
-        std::cout << ", " << A_armpl_col_index_[i];
-      }
-      std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof
-      (A_vals_[0]) << ") = [" << A_vals_[0];
-      for (int i = 1; i < nnz_; i++) {
-        std::cout << ", " << A_vals_[i];
-      }
-      std::cout << "]" << std::endl << "flags: " << flags_ << std::endl;
-      std::cout << "About to create CSR A (double)" << std::endl;
-      status_ = armpl_spmat_create_csr_d(A_armpl_,
+//      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
+//                nnz_, flags_
+      status_ = armpl_spmat_create_csr_d(&A_armpl_,
@@ -374,22 +215,9 @@ class sp_gemm_cpu : public sp_gemm<T> {
-//      std::cout << "Loading csr C into armpl storage formats" << std::endl;
-//      status_ = armpl_spmat_create_csr_d(C_armpl_,
-//                                         n_armpl_,
-//                                         n_armpl_,
-//                                         C_armpl_row_ptr_,
-//                                         C_armpl_col_index_,
-//                                         C_vals_,
-//                                         flags_);
-//      if (status_ != ARMPL_STATUS_SUCCESS) {
-//        std::cout << "ERROR " << status_ << std::endl;
-//        exit(1);
-//      }
-//      std::cout << "Loading csr B into armpl storage formats" << std::endl;
-      std::cout << "About to create CSR B (double)" << std::endl;
-      status_ = armpl_spmat_create_csr_d(B_armpl_,
+//      printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_,
+//                nnz_, flags_);
+      status_ = armpl_spmat_create_csr_d(&B_armpl_,
@@ -400,11 +228,33 @@ class sp_gemm_cpu : public sp_gemm<T> {
         std::cout << "ERROR " << status_ << std::endl;
+//      std::cout << "Okay, all matrices made!!" << std::endl;
-//    std::cout << "Okay, all matrices made!!" << std::endl;
+  void printCSR(armpl_int_t n, armpl_int_t* rp, armpl_int_t* ci, T* v,
+                armpl_int_t nz, armpl_int_t f) {
+    std::cout << "\tn = " << n << std::endl;
+    std::cout << "\trow ptr (size = " << sizeof(rp[0]) << ") = [" << rp[0];
+    for (int i = 1; i < (n + 1); i++) {
+      std::cout << ", " << rp[i];
+    }
+    std::cout << "]" << std::endl << "\tcol ind (size = " << sizeof(ci[0]) <<
+    ") = [" << ci[0];
+    for (int i = 1; i < nz; i++) {
+      std::cout << ", " << ci[i];
+    }
+    std::cout << "]" << std::endl << "\tvals (size = " << sizeof(v[0]) <<
+    ") = [" << v[0];
+    for (int i = 1; i < nz; i++) {
+      std::cout << ", " << v[i];
+    }
+    std::cout << "]" << std::endl << "\tflags = " << f << std::endl;
+  }
+  armpl_status_t status_;
   armpl_int_t flags_;
   armpl_int_t n_armpl_;
@@ -416,13 +266,9 @@ class sp_gemm_cpu : public sp_gemm<T> {
   armpl_int_t* C_armpl_row_ptr_;
   armpl_int_t* C_armpl_col_index_;
-  T* A_vals_;
-  T* B_vals_;
-  T* C_vals_;
-  armpl_spmat_t* A_armpl_;
-  armpl_spmat_t* B_armpl_;
-  armpl_spmat_t* C_armpl_;
+  armpl_spmat_t A_armpl_;
+  armpl_spmat_t B_armpl_;
+  armpl_spmat_t C_armpl_;
   armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS;
   armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS;
diff --git a/ b/
index 07ac243..ee1a389 100644
--- a/
+++ b/
@@ -372,7 +372,7 @@
     plt.margins(x=0.01, y=0.01)
     leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
-    for obj in leg.legendHandles:
+    for obj in leg.legend_handles:
diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh
index c8086db..f3ff6ef 100644
--- a/cuBLAS/common.hh
+++ b/cuBLAS/common.hh
@@ -2,7 +2,7 @@
 #if defined GPU_CUBLAS
-#include "cusparse.h"
+#include <cusparse_v2.h>
 /** Macro function to check if error occurred when calling cuBLAS. */
 /** Macro function to check if error occurred when calling CUDA. */
diff --git a/include/doGemm.hh b/include/doGemm.hh
index a33ef7e..c71684f 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -527,12 +527,8 @@ class doGemm {
     if (doCPU_) {
-//      std::cout << "about to initialise matrices with size = " << N <<
-//      std::endl;
       spGemmCpu_.initialise(N, sparsity);
-//      std::cout << "about to run spGEMM" << std::endl;
       time_checksum_gflop cpuResult = spGemmCpu_.compute();
-//      std::cout << "about to calculate flops" << std::endl;
       cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
 		writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
 		               cpuResult.runtime, cpuResult.gflops);
@@ -543,26 +539,19 @@ class doGemm {
     // - UNIFIED : data passed from host to device (and device to host) as
     //             needed
     if (doGPU_) {
-      std::cout << "Starting with matrix of size " << N << std::endl;
-      std::cout << "\t\tUnified";
       spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
-      std::cout << "\tInitialised" << std::endl;
       time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
       gpuResult_unified.gflops =
       calcGflops(flops, iterations_, gpuResult_unified.runtime);
     // - ALWAYS: Offload to/from GPU every iteration
-      std::cout << "\t\tAlways";
       spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity);
-      std::cout << "\tInitialised" << std::endl;
       time_checksum_gflop gpuResult_always = spGemmGpu_.compute();
       gpuResult_always.gflops =
             calcGflops(flops, iterations_, gpuResult_always.runtime);
 		// - ONCE : Offload to/from GPU once before all iterations and once
 		// after
-      std::cout << "\t\tOnce";
       spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity);
-      std::cout << "\tInitialised" << std::endl;
 		  time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
 		  gpuResult_once.gflops =
 						calcGflops(flops, iterations_, gpuResult_once.runtime);
diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
index dfab687..a11dcd0 100644
--- a/include/kernels/CPU/sp_gemm.hh
+++ b/include/kernels/CPU/sp_gemm.hh
@@ -33,7 +33,6 @@ namespace cpu {
         // used in the initInputMatricesSparse function.  If changed here,
         // change there
         nnz_ = 1 + (int) ((double)n_ * (double)n_ * (1.0 - sparsity_));
-//        std::cout << "nnz_ = " << nnz_ << std::endl;
 				A_ = (T*)malloc(sizeof(T) * n_ * n_);
 				B_ = (T*)malloc(sizeof(T) * n_ * n_);
@@ -46,6 +45,12 @@ namespace cpu {
       int nnz_;
+    protected:
+        T* A_vals_;
+        T* B_vals_;
+        T* C_vals_;
 				/** Do any necessary cleanup (free pointers, close library handles, etc.)
 				 * after Kernel has been called. */
@@ -97,9 +102,6 @@ namespace cpu {
       int* B_col_index_;
       int* C_row_ptr_;
       int* C_col_index_;
-      T* A_vals_;
-      T* B_vals_;
-      T* C_vals_;
 }  // namespace cpu
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index 6d75554..bbd17cb 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -28,13 +28,10 @@ class gemm {
     // Perform all GEMM calls
-//    std::cout << "about to do pre-loop requirements" << std::endl;
     for (int i = 0; i < iterations_; i++) {
-//      std::cout << "entering loop " << i << std::endl;
-//    std::cout << "about to do post-loop requirements" << std::endl;
     // Stop Timer
diff --git a/src/ b/src/
index 51d1cf1..e508b5b 100644
--- a/src/
+++ b/src/
@@ -50,18 +50,18 @@ int main(int argc, char** argv) {
   // -------- GEMV --------
   // SGEMV Comparison
-  std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl;
-  doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                      doGpu);
-  sgemv.collectData();
-  std::cout << "Finished!" << std::endl;
-  // DGEMV Comparison
-  std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl;
-  doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                       doGpu);
-  dgemv.collectData();
-  std::cout << "Finished!" << std::endl;
+//  std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl;
+//  doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
+//                      doGpu);
+//  sgemv.collectData();
+//  std::cout << "Finished!" << std::endl;
+//  // DGEMV Comparison
+//  std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl;
+//  doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
+//                       doGpu);
+//  dgemv.collectData();
+//  std::cout << "Finished!" << std::endl;
   return 0;

From 9eb464668e481ef1148dd4a160ccea3fe5e7563f Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Thu, 3 Oct 2024 10:51:18 +0100
Subject: [PATCH 26/38] No longer overwriting B_

 .idea/workspace.xml | 22 +++++++++++----
 ArmPL/sp_gemm.hh    | 69 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 73 insertions(+), 18 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index e9a4d65..cb692bc 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -15,7 +15,10 @@
   <component name="ChangeListManager">
-    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding AOCL files" />
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="working changes">
+      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" afterDir="false" />
+    </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
@@ -125,9 +128,9 @@
+      <item itemvalue="C/C++" />
       <item itemvalue="Native Application.all" />
       <item itemvalue="Native Application.gpu-blob" />
-      <item itemvalue="C/C++" />
@@ -174,6 +177,7 @@
       <workItem from="1726828018604" duration="52619000" />
       <workItem from="1727941759103" duration="43000" />
       <workItem from="1727941814674" duration="165000" />
+      <workItem from="1727941995420" duration="3199000" />
     <task id="LOCAL-00001" summary="trivial changes">
       <option name="closed" value="true" />
@@ -495,7 +499,15 @@
       <option name="project" value="LOCAL" />
-    <option name="localTasksCounter" value="41" />
+    <task id="LOCAL-00041" summary="working changes">
+      <option name="closed" value="true" />
+      <created>1727942003511</created>
+      <option name="number" value="00041" />
+      <option name="presentableId" value="LOCAL-00041" />
+      <option name="project" value="LOCAL" />
+      <updated>1727942003511</updated>
+    </task>
+    <option name="localTasksCounter" value="42" />
     <servers />
   <component name="TypeScriptGeneratedFilesManager">
@@ -513,7 +525,6 @@
   <component name="VcsManagerConfiguration">
-    <MESSAGE value="Adding sparse kernel to doGemm" />
     <MESSAGE value="Adding matrix type enum class" />
     <MESSAGE value="changes" />
     <MESSAGE value="adding command line kernel selection" />
@@ -538,6 +549,7 @@
     <MESSAGE value="Finalising" />
     <MESSAGE value="Rebasing" />
     <MESSAGE value="Adding AOCL files" />
-    <option name="LAST_COMMIT_MESSAGE" value="Adding AOCL files" />
+    <MESSAGE value="working changes" />
+    <option name="LAST_COMMIT_MESSAGE" value="working changes" />
\ No newline at end of file
diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh
index cb6b443..28a2ca3 100644
--- a/ArmPL/sp_gemm.hh
+++ b/ArmPL/sp_gemm.hh
@@ -53,9 +53,6 @@ class sp_gemm_cpu : public sp_gemm<T> {
     // Todo -- See if using armpl_spmat_hint can improve performance here.
     //  If so, follow with optimisation functions
     if constexpr (std::is_same_v<T, float>) {
       status_ = armpl_spmm_exec_s(transA_,
@@ -63,7 +60,7 @@ class sp_gemm_cpu : public sp_gemm<T> {
-                                  B_armpl_);
+                                  C_armpl_);
     } else if constexpr (std::is_same_v<T, double>) {
       status_ = armpl_spmm_exec_d(transA_,
@@ -71,7 +68,7 @@ class sp_gemm_cpu : public sp_gemm<T> {
-                                  B_armpl_);
+                                  C_armpl_);
     } else {
       // Un-specialised class will not do any work - print error and exit.
       std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported."
@@ -107,11 +104,11 @@ class sp_gemm_cpu : public sp_gemm<T> {
       std::cout << "ERROR " << status_ << std::endl;
-//    status_ = armpl_spmat_destroy(*C_armpl_);
-//    if (status_ != ARMPL_STATUS_SUCCESS) {
-//      std::cout << "ERROR " << status_ << std::endl;
-//      exit(1);
-//    }
+    status_ = armpl_spmat_destroy(C_armpl_);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
     delete [] A_armpl_row_ptr_;
     delete [] A_armpl_col_index_;
@@ -119,9 +116,9 @@ class sp_gemm_cpu : public sp_gemm<T> {
     delete [] B_armpl_row_ptr_;
     delete [] B_armpl_col_index_;
     delete [] B_vals_;
-//    delete [] C_armpl_row_ptr_;
-//    delete [] C_armpl_col_index_;
-//    delete [] C_vals_;
+    delete [] C_armpl_row_ptr_;
+    delete [] C_armpl_col_index_;
+    delete [] C_vals_;
@@ -172,6 +169,24 @@ class sp_gemm_cpu : public sp_gemm<T> {
+    // Move C to CSR
+    C_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
+    C_armpl_col_index_ = new armpl_int_t[nnz_];
+    C_vals_ = new T[nnz_];
+    C_armpl_row_ptr_[0] = 0;
+    nnz_encountered = 0;
+    for (int row = 0; row < n_; row++) {
+      C_armpl_row_ptr_[row + 1] = nnz_encountered;
+      for (int col = 0; col < n_; col++) {
+        if (B_[(row * n_) + col] != 0.0) {
+          C_armpl_col_index_[nnz_encountered] = col;
+          C_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]);
+          nnz_encountered++;
+        }
+      }
+    }
     if constexpr (std::is_same_v<T, float>) {
 //      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
 //                nnz_, flags_);
@@ -200,6 +215,20 @@ class sp_gemm_cpu : public sp_gemm<T> {
         std::cout << "ERROR " << status_ << std::endl;
+//      printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_,
+//                nnz_, flags_);
+      status_ = armpl_spmat_create_csr_s(&C_armpl_,
+                                         n_armpl_,
+                                         n_armpl_,
+                                         C_armpl_row_ptr_,
+                                         C_armpl_col_index_,
+                                         C_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
     } else if constexpr (std::is_same_v<T, double>) {
 //      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
 //                nnz_, flags_
@@ -228,6 +257,20 @@ class sp_gemm_cpu : public sp_gemm<T> {
         std::cout << "ERROR " << status_ << std::endl;
+//      printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_,
+//                nnz_, flags_);
+      status_ = armpl_spmat_create_csr_d(&C_armpl_,
+                                         n_armpl_,
+                                         n_armpl_,
+                                         C_armpl_row_ptr_,
+                                         C_armpl_col_index_,
+                                         C_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
 //      std::cout << "Okay, all matrices made!!" << std::endl;

From 7f82b7d52f0ab2420774159d9099fb40aef00ce2 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Fri, 11 Oct 2024 15:56:42 +0100
Subject: [PATCH 27/38] Adding AOCL files

 .idea/workspace.xml            | 25 +++++++++----
 include/doGemm.hh              | 66 +++++++++++++++++++++++++++++-----
 include/doGemv.hh              | 57 ++++++++++++++++-------------
 include/kernels/CPU/sp_gemm.hh |  7 ++--
 include/kernels/gemm.hh        |  7 ++--
 include/kernels/gemv.hh        |  5 +--
 src/                    | 62 +++++++++++++++++++++-----------
 7 files changed, 160 insertions(+), 69 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index cb692bc..a5afad2 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -15,9 +15,14 @@
   <component name="ChangeListManager">
-    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="working changes">
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="No longer overwriting B_">
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/" beforeDir="false" afterPath="$PROJECT_DIR$/src/" afterDir="false" />
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -177,7 +182,7 @@
       <workItem from="1726828018604" duration="52619000" />
       <workItem from="1727941759103" duration="43000" />
       <workItem from="1727941814674" duration="165000" />
-      <workItem from="1727941995420" duration="3199000" />
+      <workItem from="1727941995420" duration="22747000" />
     <task id="LOCAL-00001" summary="trivial changes">
       <option name="closed" value="true" />
@@ -507,7 +512,15 @@
       <option name="project" value="LOCAL" />
-    <option name="localTasksCounter" value="42" />
+    <task id="LOCAL-00042" summary="No longer overwriting B_">
+      <option name="closed" value="true" />
+      <created>1727949079616</created>
+      <option name="number" value="00042" />
+      <option name="presentableId" value="LOCAL-00042" />
+      <option name="project" value="LOCAL" />
+      <updated>1727949079616</updated>
+    </task>
+    <option name="localTasksCounter" value="43" />
     <servers />
   <component name="TypeScriptGeneratedFilesManager">
@@ -525,7 +538,6 @@
   <component name="VcsManagerConfiguration">
-    <MESSAGE value="Adding matrix type enum class" />
     <MESSAGE value="changes" />
     <MESSAGE value="adding command line kernel selection" />
     <MESSAGE value="Adding basic sparse multiplication kernel for default CPU and GPU" />
@@ -550,6 +562,7 @@
     <MESSAGE value="Rebasing" />
     <MESSAGE value="Adding AOCL files" />
     <MESSAGE value="working changes" />
-    <option name="LAST_COMMIT_MESSAGE" value="working changes" />
+    <MESSAGE value="No longer overwriting B_" />
+    <option name="LAST_COMMIT_MESSAGE" value="No longer overwriting B_" />
\ No newline at end of file
diff --git a/include/doGemm.hh b/include/doGemm.hh
index c71684f..a3e5e77 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -65,7 +65,7 @@ class doGemm {
   void collectData() {
     // ToDo -- I've hard coded false here as kernel selection was not working
     //  .  Needs to be fixed
-    if (false) {
+    if (doDense_) {
       // Square Problem Sizes...
       // Re-initialise offload threshold structures
       cpuGpu_always_ = cpuGpu_offloadThreshold();
@@ -301,13 +301,12 @@ class doGemm {
-    if (true) {    // Square sparse matrix - sparse matrix multiplication
+    if (doSparse_) {    // Square sparse matrix - sparse matrix multiplication
       cpuGpu_always_ = cpuGpu_offloadThreshold();
       cpuGpu_once_ = cpuGpu_offloadThreshold();
       cpuGpu_unified_ = cpuGpu_offloadThreshold();
       std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
-              getKernelName() + "_sparse_square.csv");
+              getKernelName() + "_sparse_square_99.csv");
       if (upperLimit_ >= 32) {
         for (int dim = startDimention_; dim <= upperLimit_; dim++) {
           callSparseKernels(csvFile, dim, 0.99);
@@ -316,10 +315,59 @@ class doGemm {
       // Close file
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-	    printOffloadThreshold("Sparse Square");
-    }
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Sparse Square 0.99");
+      }
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
+              getKernelName() + "_sparse_square_999.csv");
+      if (upperLimit_ >= 32) {
+        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+          callSparseKernels(csvFile, dim, 0.999);
+        }
+      }
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Sparse Square 0.999");
+      }
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
+              getKernelName() + "_sparse_square_9999.csv");
+      if (upperLimit_ >= 32) {
+        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+          callSparseKernels(csvFile, dim, 0.9999);
+        }
+      }
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Sparse Square 0.9999");
+      }
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
+                                          getKernelName() +
+                                          "_sparse_square_99999.csv");
+      if (upperLimit_ >= 32) {
+        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+          callSparseKernels(csvFile, dim, 0.99999);
+        }
+      }
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Sparse Square 0.99999");
+      }
@@ -530,7 +578,7 @@ class doGemm {
       spGemmCpu_.initialise(N, sparsity);
       time_checksum_gflop cpuResult = spGemmCpu_.compute();
       cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
-		writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
+		  writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
 		               cpuResult.runtime, cpuResult.gflops);
diff --git a/include/doGemv.hh b/include/doGemv.hh
index b86aad6..12cd097 100644
--- a/include/doGemv.hh
+++ b/include/doGemv.hh
@@ -33,13 +33,16 @@ class doGemv {
   doGemv(const std::string csvDir, const int iters, const int startDim,
          const int upperLimit, const bool cpuEnabled = true,
-         const bool gpuEnabled = true)
+         const bool gpuEnabled = true, const bool doDense = true, const bool
+         doSparse = true)
       : CSV_DIR(csvDir),
-        doGPU_(gpuEnabled)
+        doGPU_(gpuEnabled),
+        doDense_(doDense),
+        doSparse_(doSparse)
@@ -56,28 +59,29 @@ class doGemv {
   /** Run all problem types and write data to CSV files. */
   void collectData() {
-    // Square Problem Sizes...
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    std::ofstream csvFile =
-        initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv");
-    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-      // M = dim, N = dim;
-      callKernels(csvFile, dim, dim);
-    }
-    // Close file
-    csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Square x Vector (M=N)");
-    }
+    if (doDense_) {
+      // Square Problem Sizes...
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      std::ofstream csvFile =
+          initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv");
+      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+        // M = dim, N = dim;
+        callKernels(csvFile, dim, dim);
+      }
+      // Close file
+      csvFile.close();
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Square x Vector (M=N)");
+      }
+  #endif
     // Rectangular Problem Sizes:
     // Tall and thin x Vector
@@ -182,6 +186,7 @@ class doGemv {
+  }
   /** Call the appropriate CPU and GPU GEMV kernels. */
@@ -494,6 +499,10 @@ class doGemv {
   /** Whether the GPU kernels should be run. */
   const bool doGPU_ = true;
+  /** Whether sparse and or dense kernels should be run. */
+  const bool doSparse_;
+  const bool doDense_;
   /** The GEMV CPU kernel. */
   cpu::gemv_cpu<T> gemvCpu_;
diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
index a11dcd0..c431d4d 100644
--- a/include/kernels/CPU/sp_gemm.hh
+++ b/include/kernels/CPU/sp_gemm.hh
@@ -32,18 +32,19 @@ namespace cpu {
         // Note that the below should be the same as the edges calculation
         // used in the initInputMatricesSparse function.  If changed here,
         // change there
-        nnz_ = 1 + (int) ((double)n_ * (double)n_ * (1.0 - sparsity_));
+        nnz_ = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity_));
+//        std::cout << "\t____About to malloc()____" << std::endl;
 				A_ = (T*)malloc(sizeof(T) * n_ * n_);
 				B_ = (T*)malloc(sizeof(T) * n_ * n_);
 				C_ = (T*)malloc(sizeof(T) * n_ * n_);
-				initInputMatricesSparse(sparsity_);
+				initInputMatricesSparse(sparsity);
-      int nnz_;
+      uint64_t nnz_;
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index bbd17cb..6e1328e 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -107,14 +107,14 @@ class gemm {
     std::uniform_real_distribution<double> dist(0.0, 1.0);
-    int edges = 1 + (int) (n_ * n_ * (1 - sparsity));
+    int edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity));
     // Using a=0.45 and b=c=0.22 as default probabilities
     for (int i = 0; i < edges; i++) {
       while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist,
-              false)) {}
+                   false)) {}
       while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist,
-              false)) {}
+                   false)) {}
@@ -165,7 +165,6 @@ class gemm {
                     gen, dist, bin);
-    return true;
   void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index,
diff --git a/include/kernels/gemv.hh b/include/kernels/gemv.hh
index 665fe59..a64b19c 100644
--- a/include/kernels/gemv.hh
+++ b/include/kernels/gemv.hh
@@ -95,10 +95,11 @@ class gemv {
     std::uniform_real_distribution<double> dist(0.0, 1.0);
-    int edges = 1 + (int) (n_ * n_ * (1 - sparsity_));
+    uint64_t edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 -
+            sparsity_));
     // Using a=0.45 and b=c=0.22 as default probabilities
-    for (int i = 0; i < edges; i++) {
+    for (uint64_t i = 0; i < edges; i++) {
       while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist,
                    false)) {}
diff --git a/src/ b/src/
index e508b5b..bdc1db2 100644
--- a/src/
+++ b/src/
@@ -7,6 +7,10 @@ bool doSgemm = true;
 bool doDgemm = true;
 bool doSp_sgemm = true;
 bool doSp_dgemm = true;
+bool doSgemv = true;
+bool doDgemv = true;
+bool doSp_sgemv = true;
+bool doSp_dgemv = true;
 bool doCpu = CPU_ENABLED;
 bool doGpu = GPU_ENABLED;
@@ -50,18 +54,18 @@ int main(int argc, char** argv) {
   // -------- GEMV --------
   // SGEMV Comparison
-//  std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl;
-//  doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
-//                      doGpu);
-//  sgemv.collectData();
-//  std::cout << "Finished!" << std::endl;
-//  // DGEMV Comparison
-//  std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl;
-//  doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
-//                       doGpu);
-//  dgemv.collectData();
-//  std::cout << "Finished!" << std::endl;
+  std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl;
+  doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
+                      doGpu, doSgemv, doSp_sgemv);
+  sgemv.collectData();
+  std::cout << "Finished!" << std::endl;
+  // DGEMV Comparison
+  std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl;
+  doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
+                       doGpu, doDgemv, doSp_dgemv);
+  dgemv.collectData();
+  std::cout << "Finished!" << std::endl;
   return 0;
@@ -146,7 +150,8 @@ void getParameters(int argc, char** argv) {
     } else if (!strcmp(argv[i], "--no_gpu")) {
       doGpu = false;
     } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) {
-      doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false;
+      doSgemm = doDgemm = doSp_sgemm = doSp_dgemm =
+      doSgemv = doDgemv = doSp_sgemv = doSp_dgemv = false;
       std::string kernelList = argv[++i];
       if (kernelList.find("sp-sgemm") != std::string::npos) {
         doSp_sgemm = true;
@@ -167,13 +172,28 @@ void getParameters(int argc, char** argv) {
         doDgemm = true;
-	    if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) {
-		    std::cout << "ERROR - no implemented kernels in list" << std::endl;
-		    exit(1);
-	    }
-    } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) {
-      if (++i >= argc) {
-        std::cout << "ERROR - Invalid output directory" << std::endl;
+      if (kernelList.find("sp-sgemv") != std::string::npos) {
+        doSp_sgemv = true;
+        if (kernelList.find("sgemv") != std::string::npos &&
+            kernelList.find("sgemv") != kernelList.find("sp-sgemv") + 3) {
+          doSgemv = true;
+        }
+      } else if (kernelList.find("sgemv") != std::string::npos) {
+        doSgemv = true;
+      }
+      if (kernelList.find("sp-dgemv") != std::string::npos) {
+        doSp_dgemv = true;
+        if (kernelList.find("dgemv") != std::string::npos &&
+            kernelList.find("dgemv") != kernelList.find("sp-dgemv") + 3) {
+          doDgemv = true;
+        }
+      } else if (kernelList.find("dgemv") != std::string::npos) {
+        doDgemv = true;
+      }
+      if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm &&
+          !doSgemv && !doDgemv && !doSp_sgemv && !doSp_dgemv) {
+        std::cout << "ERROR - no implemented kernels in list" << std::endl;
       } else {
         CSV_DIR = argv[i];
@@ -212,4 +232,4 @@ void getParameters(int argc, char** argv) {
\ No newline at end of file

From 0130b81655b1fa04b433c4d22f9288df723cefd2 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Fri, 11 Oct 2024 15:58:16 +0100
Subject: [PATCH 28/38] Adding AOCL files

 .idea/workspace.xml | 23 ++++++++-----
 ArmPL/sp_gemm.hh    | 84 +++++++++++++++++++++++++++++++++++++++++++++
 Makefile            |  2 +-
 include/doGemm.hh   | 26 +++++++-------
 include/doGemv.hh   | 12 +++----
 include/helpers.hh  | 12 ++++---
 6 files changed, 127 insertions(+), 32 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index a5afad2..2bb35d8 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -15,14 +15,13 @@
   <component name="ChangeListManager">
-    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="No longer overwriting B_">
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding kernel selection for gemv">
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/Makefile" beforeDir="false" afterPath="$PROJECT_DIR$/Makefile" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/" beforeDir="false" afterPath="$PROJECT_DIR$/src/" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/helpers.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/helpers.hh" afterDir="false" />
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -520,7 +519,15 @@
       <option name="project" value="LOCAL" />
-    <option name="localTasksCounter" value="43" />
+    <task id="LOCAL-00043" summary="Adding kernel selection for gemv">
+      <option name="closed" value="true" />
+      <created>1728650780575</created>
+      <option name="number" value="00043" />
+      <option name="presentableId" value="LOCAL-00043" />
+      <option name="project" value="LOCAL" />
+      <updated>1728650780575</updated>
+    </task>
+    <option name="localTasksCounter" value="44" />
     <servers />
   <component name="TypeScriptGeneratedFilesManager">
@@ -538,7 +545,6 @@
   <component name="VcsManagerConfiguration">
-    <MESSAGE value="changes" />
     <MESSAGE value="adding command line kernel selection" />
     <MESSAGE value="Adding basic sparse multiplication kernel for default CPU and GPU" />
     <MESSAGE value="Implementing cuSPARSE kernel" />
@@ -563,6 +569,7 @@
     <MESSAGE value="Adding AOCL files" />
     <MESSAGE value="working changes" />
     <MESSAGE value="No longer overwriting B_" />
-    <option name="LAST_COMMIT_MESSAGE" value="No longer overwriting B_" />
+    <MESSAGE value="Adding kernel selection for gemv" />
+    <option name="LAST_COMMIT_MESSAGE" value="Adding kernel selection for gemv" />
\ No newline at end of file
diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh
index 28a2ca3..612f4f1 100644
--- a/ArmPL/sp_gemm.hh
+++ b/ArmPL/sp_gemm.hh
@@ -89,6 +89,90 @@ class sp_gemm_cpu : public sp_gemm<T> {
   void preLoopRequirements() override {
     // Need to put A_ and B_ into A_armpl_ and B_armpl_
+    /** providing hints to ARMPL and optimizing the matrix datastructures */
+    // TODO -- is noallocs best here?
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_MEMORY,
+                               ARMPL_SPARSE_MEMORY_NOALLOCS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_MEMORY,
+                               ARMPL_SPARSE_MEMORY_NOALLOCS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_STRUCTURE,
+                               ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_STRUCTURE,
+                               ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    // TODO -- will this be FEW?
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
+                               ARMPL_SPARSE_INVOCATIONS_MANY);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
+                               ARMPL_SPARSE_INVOCATIONS_MANY);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION,
+                               ARMPL_SPARSE_OPERATION_NOTRANS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION,
+                               ARMPL_SPARSE_OPERATION_NOTRANS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    // TODO -- investigate which is better here
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+                               ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+                               ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+//  TODO -- this is throwing an error -- couldn't immediately fix so come
+//   back to it
+//    /** provide hints for the optimisation of the spmm execution */
+//    status_ = armpl_spmm_optimize(ARMPL_SPARSE_OPERATION_NOTRANS,
+//                                  ARMPL_SPARSE_OPERATION_NOTRANS,
+//                                  ARMPL_SPARSE_SCALAR_ONE,
+//                                  A_armpl_, B_armpl_,
+//                                  ARMPL_SPARSE_SCALAR_ZERO,
+//                                  C_armpl_);
+//    if (status_ != ARMPL_STATUS_SUCCESS) {
+//      std::cout << "ERROR " << status_ << std::endl;
+//      exit(1);
+//    }
   /** Perform any required steps after calling the GEMM kernel that should
diff --git a/Makefile b/Makefile
index e5091e0..22d080c 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ CXX = $(CXX_$(COMPILER))
 CXXFLAGS_ARM     = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
 CXXFLAGS_CLANG   = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
-CXXFLAGS_GNU     = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
+CXXFLAGS_GNU     = -std=c++17 -Wall -Wno-deprecated-declarations -Ofast -$(ARCHFLAG)=native
 CXXFLAGS_INTEL   = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -Wno-tautological-constant-compare
 CXXFLAGS_NVIDIA  = -std=c++17 -Wall -O3 -fast -$(ARCHFLAG)=native
 CXXFLAGS_HIP     = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
diff --git a/include/doGemm.hh b/include/doGemm.hh
index a3e5e77..93cc058 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -392,8 +392,8 @@ class doGemm {
       cpuResult = gemmCpu_.compute();
       cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
       // Write result to CSV file
-      writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, iterations_,
-                     cpuResult.runtime, cpuResult.gflops);
+      writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize,
+                     0.0, iterations_, cpuResult.runtime, cpuResult.gflops);
@@ -422,13 +422,13 @@ class doGemm {
       // Write results to CSV file
       writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, K, probSize,
-                     iterations_, gpuResult_once.runtime,
+                     0.0, iterations_, gpuResult_once.runtime,
       writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, K,
-                     probSize, iterations_, gpuResult_always.runtime,
+                     probSize, 0.0, iterations_, gpuResult_always.runtime,
       writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, K, probSize,
-                     iterations_, gpuResult_unified.runtime,
+                     0.0, iterations_, gpuResult_unified.runtime,
@@ -578,8 +578,9 @@ class doGemm {
       spGemmCpu_.initialise(N, sparsity);
       time_checksum_gflop cpuResult = spGemmCpu_.compute();
       cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
-		  writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
-		               cpuResult.runtime, cpuResult.gflops);
+		  writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize,
+                     sparsity, iterations_, cpuResult.runtime,
+                     cpuResult.gflops);
@@ -607,13 +608,14 @@ class doGemm {
 		// Write lines to CSV file
 		  writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
-		               iterations_, gpuResult_once.runtime, gpuResult_once.gflops);
+		                sparsity, iterations_, gpuResult_once.runtime,
+                    gpuResult_once.gflops);
 		  writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
-		               iterations_, gpuResult_always.runtime,
-		               gpuResult_always.gflops);
+		                sparsity, iterations_, gpuResult_always.runtime,
+		                gpuResult_always.gflops);
 		  writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
-		               iterations_, gpuResult_unified.runtime,
-		               gpuResult_unified.gflops);
+		                sparsity, iterations_, gpuResult_unified.runtime,
+		                gpuResult_unified.gflops);
diff --git a/include/doGemv.hh b/include/doGemv.hh
index 12cd097..2ab5fb1 100644
--- a/include/doGemv.hh
+++ b/include/doGemv.hh
@@ -207,8 +207,8 @@ class doGemv {
       cpuResult = gemvCpu_.compute();
       cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
       // Write result to CSV file
-      writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, iterations_,
-                     cpuResult.runtime, cpuResult.gflops);
+      writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, 0.0,
+                     iterations_, cpuResult.runtime, cpuResult.gflops);
@@ -237,13 +237,13 @@ class doGemv {
       // Write results to CSV file
       writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, 0, probSize,
-                     iterations_, gpuResult_once.runtime,
+                     0.0, iterations_, gpuResult_once.runtime,
       writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, 0,
-                     probSize, iterations_, gpuResult_always.runtime,
+                     probSize, 0.0, iterations_, gpuResult_always.runtime,
       writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, 0, probSize,
-                     iterations_, gpuResult_unified.runtime,
+                     0.0, iterations_, gpuResult_unified.runtime,
@@ -500,8 +500,8 @@ class doGemv {
   const bool doGPU_ = true;
   /** Whether sparse and or dense kernels should be run. */
-  const bool doSparse_;
   const bool doDense_;
+  const bool doSparse_;
   /** The GEMV CPU kernel. */
diff --git a/include/helpers.hh b/include/helpers.hh
index 5618557..d760cd7 100644
--- a/include/helpers.hh
+++ b/include/helpers.hh
@@ -17,8 +17,8 @@ std::ofstream initCSVFile(const std::string filename) {
   std::ofstream newFile(filename);
-  newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total "
-             "Seconds,GFLOP/s"
+  newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),sparsity,Iterations,"
+             "Total Seconds,GFLOP/s"
           << std::endl;
   return newFile;
@@ -28,15 +28,17 @@ std::ofstream initCSVFile(const std::string filename) {
  * Function does not close the file. */
 void writeLineToCsv(std::ofstream& file, const std::string device,
                     const std::string kernel, const int M, const int N,
-                    const int K, const double totalProbSize, const int iters,
-                    const double totalTime, const double gflops) {
+                    const int K, const double totalProbSize, const float
+                    sparsity, const int iters, const double totalTime,
+                    const double gflops) {
   if (!file.is_open()) {
     std::cout << "ERROR - Attempted to write line to a closed CSV file."
               << std::endl;
   file << device << "," << kernel << "," << M << "," << N << "," << K << ","
-       << std::fixed << std::setprecision(3) << totalProbSize << "," << iters
+       << std::fixed << std::setprecision(3) << totalProbSize << ","
+       << std::fixed << std::setprecision(8) << sparsity << "," << iters
        << "," << std::fixed << std::setprecision(5) << totalTime << ","
        << std::fixed << std::setprecision(3) << gflops << std::endl;

From 4581637b57e14c92b4b4ca40c200565aae9e3d91 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Fri, 11 Oct 2024 15:12:42 +0100
Subject: [PATCH 29/38] Providing armpl with hints

 .idea/workspace.xml | 21 ++++++++++++---------
 ArmPL/sp_gemm.hh    |  1 +
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 2bb35d8..d791fa3 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -15,13 +15,8 @@
   <component name="ChangeListManager">
-    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding kernel selection for gemv">
-      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Providing armpl with hints">
       <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/Makefile" beforeDir="false" afterPath="$PROJECT_DIR$/Makefile" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/helpers.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/helpers.hh" afterDir="false" />
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -527,7 +522,15 @@
       <option name="project" value="LOCAL" />
-    <option name="localTasksCounter" value="44" />
+    <task id="LOCAL-00044" summary="Providing armpl with hints">
+      <option name="closed" value="true" />
+      <created>1728655865948</created>
+      <option name="number" value="00044" />
+      <option name="presentableId" value="LOCAL-00044" />
+      <option name="project" value="LOCAL" />
+      <updated>1728655865948</updated>
+    </task>
+    <option name="localTasksCounter" value="45" />
     <servers />
   <component name="TypeScriptGeneratedFilesManager">
@@ -545,7 +548,6 @@
   <component name="VcsManagerConfiguration">
-    <MESSAGE value="adding command line kernel selection" />
     <MESSAGE value="Adding basic sparse multiplication kernel for default CPU and GPU" />
     <MESSAGE value="Implementing cuSPARSE kernel" />
     <MESSAGE value="Trying to work out CSR malloc bug" />
@@ -570,6 +572,7 @@
     <MESSAGE value="working changes" />
     <MESSAGE value="No longer overwriting B_" />
     <MESSAGE value="Adding kernel selection for gemv" />
-    <option name="LAST_COMMIT_MESSAGE" value="Adding kernel selection for gemv" />
+    <MESSAGE value="Providing armpl with hints" />
+    <option name="LAST_COMMIT_MESSAGE" value="Providing armpl with hints" />
\ No newline at end of file
diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh
index 612f4f1..e8e28a5 100644
--- a/ArmPL/sp_gemm.hh
+++ b/ArmPL/sp_gemm.hh
@@ -355,6 +355,7 @@ class sp_gemm_cpu : public sp_gemm<T> {
         std::cout << "ERROR " << status_ << std::endl;
 //      std::cout << "Okay, all matrices made!!" << std::endl;

From 477b7a0a050caeeb86ff4776ab75cbe4982cf883 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Mon, 21 Oct 2024 15:14:42 +0100
Subject: [PATCH 30/38] Updating to show sparsity

 .idea/workspace.xml   | 6 ++++-- | 7 +++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index d791fa3..d27d844 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -15,8 +15,9 @@
   <component name="ChangeListManager">
-    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Providing armpl with hints">
-      <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" afterDir="false" />
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding AOCL files">
+      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/" beforeDir="false" afterPath="$PROJECT_DIR$/" afterDir="false" />
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -177,6 +178,7 @@
       <workItem from="1727941759103" duration="43000" />
       <workItem from="1727941814674" duration="165000" />
       <workItem from="1727941995420" duration="22747000" />
+      <workItem from="1729503392250" duration="1773000" />
     <task id="LOCAL-00001" summary="trivial changes">
       <option name="closed" value="true" />
diff --git a/ b/
index ee1a389..7739eeb 100644
--- a/
+++ b/
@@ -54,7 +54,8 @@
     # Get number of iterations performed and kernel name
     line1 = lines[0].split(',')
-    iters = int(line1[6])
+    sparsity = float(line1[6])
+    iters = int(line1[7])
     kernel = line1[1]
     # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types
@@ -143,7 +144,9 @@
     elif kernel == "dgemm":
         fp = "FP64"
     y_name = "{} GFLOP/s".format(fp)        
-    title = "{}GEMM Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters)
+    title = ("{}GEMM Performance for {} Problems (sparsity = {})- {} "
+             "iterations per problemize").format(kernel[0].upper(),
+                                                 inputTypeStr, sparsity, iters)
     # Make Graph
     fig1 = plt.figure(figsize=(28,16))

From 407c008a75384457002c105c71311461af48854e Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Mon, 21 Oct 2024 15:50:36 +0100
Subject: [PATCH 31/38] Beginning gemv ARMPL

 ArmPL/sp_gemv.hh | 406 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 406 insertions(+)
 create mode 100644 ArmPL/sp_gemv.hh

diff --git a/ArmPL/sp_gemv.hh b/ArmPL/sp_gemv.hh
new file mode 100644
index 0000000..818c95e
--- /dev/null
+++ b/ArmPL/sp_gemv.hh
@@ -0,0 +1,406 @@
+#pragma once
+#ifdef CPU_ARMPL
+#include <stdio.h>
+#include <stdlib.h>
+#include <armpl.h>
+#include <omp.h>
+#include <algorithm>
+#include "../include/kernels/CPU/sp_gemv.hh"
+#include "../include/utilities.hh"
+namespace cpu {
+/**
+ * ArmPL-backed CPU implementation of the `sp_gemv<T>` kernel interface.
+ *
+ * NOTE(review): despite the "gemv" name, the methods of this class currently
+ * drive ArmPL's armpl_spmm_exec_[sd] routine on three CSR matrices (A, B and
+ * C) -- confirm whether this is a placeholder for a matrix-vector version.
+ */
+template <typename T>
+class sp_gemv_cpu : public sp_gemv<T> {
+ public:
+  // Inherit the base-class constructors.
+  using sp_gemv<T>::sp_gemv;
+  // Re-export the base-class members used below so that they can be named
+  // without a `this->` prefix inside this class template.
+  using sp_gemv<T>::callConsume;
+  using sp_gemv<T>::m_;
+  using sp_gemv<T>::n_;
+  using sp_gemv<T>::k_;
+  using sp_gemv<T>::A_;
+  using sp_gemv<T>::B_;
+  using sp_gemv<T>::C_;
+  using sp_gemv<T>::nnz_;
+  using sp_gemv<T>::A_vals_;
+  using sp_gemv<T>::B_vals_;
+  using sp_gemv<T>::C_vals_;
+ private:
+  /**
+   * Execute one ArmPL sparse multiply on the handles A_armpl_, B_armpl_ and
+   * C_armpl_, then hand the result to callConsume().
+   *
+   * Flow of ARMPL Sparse LA:
+   *
+   * 1. Create sparse matrix objects: armpl_spmat_create_csr[sdcz]()
+   *
+   * 2. Supply hints on usage: armpl_spmat_hint()
+   *
+   * 3. Optimise for SpMV: armpl_spmv_optimize()
+   *
+   * 4. Solve SpMV case: armpl_spmv_exec_[sdcz]()
+   *
+   * 5. Destroy sparse matrix object: armpl_spmat_destroy()
+   *
+   * In addition, users can choose to update a set of non-zero values using
+   * armpl_spmat_update_[sdcz]()
+   *
+   * Any status other than ARMPL_STATUS_SUCCESS is printed and the process
+   * exits with code 1.
+   */
+  void callGemv() override {
+    // Todo -- See if using armpl_spmat_hint can improve performance here.
+    //  If so, follow with optimisation functions
+    constexpr bool isFloat = std::is_same_v<T, float>;
+    constexpr bool isDouble = std::is_same_v<T, double>;
+    if constexpr (isFloat) {
+      // Single-precision entry point.
+      status_ = armpl_spmm_exec_s(transA_, transB_, alpha, A_armpl_, B_armpl_,
+                                  beta, C_armpl_);
+    } else if constexpr (isDouble) {
+      // Double-precision entry point.
+      status_ = armpl_spmm_exec_d(transA_, transB_, alpha, A_armpl_, B_armpl_,
+                                  beta, C_armpl_);
+    } else {
+      // Un-specialised class will not do any work - print error and exit.
+      std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported."
+                << std::endl;
+      exit(1);
+    }
+    if (status_ == ARMPL_STATUS_SUCCESS) {
+      // Ensure compiler doesn't optimise away the work being done
+      callConsume();
+      return;
+    }
+    std::cout << "ERROR " << status_ << std::endl;
+    exit(1);
+  }
+  /** Set-up performed before the kernel call: build the CSR handles via
+   * toCSR_armpl() and register the usage hints on A_armpl_ and B_armpl_. */
+  void preLoopRequirements() override {
+    // Need to put A_ and B_ into A_armpl_ and B_armpl_
+    toCSR_armpl();
+    // Registers a single hint on one handle; a non-success status is printed
+    // and the process exits.
+    const auto hintOne = [this](armpl_spmat_t mat, auto hint, auto value) {
+      status_ = armpl_spmat_hint(mat, hint, value);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    };
+    // Registers the same hint on A first and then on B.
+    const auto hintBoth = [this, &hintOne](auto hint, auto value) {
+      hintOne(A_armpl_, hint, value);
+      hintOne(B_armpl_, hint, value);
+    };
+    /** providing hints to ARMPL and optimizing the matrix datastructures */
+    // TODO -- is noallocs best here?
+    hintBoth(ARMPL_SPARSE_HINT_MEMORY, ARMPL_SPARSE_MEMORY_NOALLOCS);
+    hintBoth(ARMPL_SPARSE_HINT_STRUCTURE, ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+    // TODO -- will this be FEW?
+    hintBoth(ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, ARMPL_SPARSE_INVOCATIONS_MANY);
+    hintBoth(ARMPL_SPARSE_HINT_SPMM_OPERATION, ARMPL_SPARSE_OPERATION_NOTRANS);
+    // TODO -- investigate which is better here
+    hintBoth(ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+             ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT);
+//  TODO -- this is throwing an error -- couldn't immediately fix so come
+//   back to
+//    /** provide hints for the optimisation of the spmm execution */
+//    status_ = armpl_spmm_optimize(ARMPL_SPARSE_OPERATION_NOTRANS,
+//                                  ARMPL_SPARSE_OPERATION_NOTRANS,
+//                                  ARMPL_SPARSE_SCALAR_ONE,
+//                                  A_armpl_, B_armpl_,
+//                                  ARMPL_SPARSE_SCALAR_ZERO,
+//                                  C_armpl_);
+//    if (status_ != ARMPL_STATUS_SUCCESS) {
+//      std::cout << "ERROR " << status_ << std::endl;
+//      exit(1);
+//    }
+  }
+  /** Tear-down performed after the kernel calls: destroy the three ArmPL
+   * handles, then free the CSR buffers allocated in toCSR_armpl(). */
+  void postLoopRequirements() override {
+    // Destroys one handle; a non-success status is printed and the process
+    // exits before any further clean-up.
+    const auto destroyOrExit = [this](armpl_spmat_t handle) {
+      status_ = armpl_spmat_destroy(handle);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    };
+    destroyOrExit(A_armpl_);
+    destroyOrExit(B_armpl_);
+    destroyOrExit(C_armpl_);
+    // Buffers for A.
+    delete [] A_armpl_row_ptr_;
+    delete [] A_armpl_col_index_;
+    delete [] A_vals_;
+    // Buffers for B.
+    delete [] B_armpl_row_ptr_;
+    delete [] B_armpl_col_index_;
+    delete [] B_vals_;
+    // Buffers for C.
+    delete [] C_armpl_row_ptr_;
+    delete [] C_armpl_col_index_;
+    delete [] C_vals_;
+  }
+  /** Scalar handed to the ArmPL exec call in callGemv() as its `alpha`
+   * argument; initialised from ALPHA, which is not defined in this file. */
+  const T alpha = ALPHA;
+  /** Scalar handed to the ArmPL exec call in callGemv() as its `beta`
+   * argument; initialised from BETA, which is not defined in this file. */
+  const T beta = BETA;
+  /**
+   * Convert the dense inputs to CSR and create the ArmPL handles A_armpl_,
+   * B_armpl_ and C_armpl_.
+   *
+   * Every matrix is read as n_ x n_ with row-major indexing
+   * (`dense[row * n_ + col]`). Each CSR buffer is allocated with `new[]`
+   * (n_ + 1 row pointers, nnz_ column indices, nnz_ values) and is released
+   * in postLoopRequirements().
+   *
+   * The row pointer for `row + 1` is written *after* the row has been
+   * scanned, so that it holds the running non-zero total including that row,
+   * as the CSR layout requires. Writing it before the scan (as was done
+   * previously) shifts every row by one and drops the last row's entries.
+   *
+   * For T other than float or double no handle is created. Any ArmPL status
+   * other than ARMPL_STATUS_SUCCESS is printed and the process exits.
+   *
+   * NOTE(review): the buffers are sized from nnz_; this assumes nnz_ is at
+   * least the number of non-zero entries found in each matrix -- confirm.
+   */
+  void toCSR_armpl() {
+    n_armpl_ = n_;
+    // ToDo -- check whether flags_ is correct!
+    flags_ = 0;
+    // Allocates the three CSR arrays for one matrix and fills them from the
+    // dense, row-major `dense` input.
+    const auto denseToCSR = [this](const auto& dense, armpl_int_t*& rowPtr,
+                                   armpl_int_t*& colIdx, T*& vals) {
+      rowPtr = new armpl_int_t[n_ + 1];
+      colIdx = new armpl_int_t[nnz_];
+      vals = new T[nnz_];
+      rowPtr[0] = 0;
+      int nnz_encountered = 0;
+      for (int row = 0; row < n_; row++) {
+        for (int col = 0; col < n_; col++) {
+          if (dense[(row * n_) + col] != 0.0) {
+            colIdx[nnz_encountered] = col;
+            vals[nnz_encountered] = static_cast<T>(dense[(row * n_) + col]);
+            nnz_encountered++;
+          }
+        }
+        // End offset of this row == start offset of the next one.
+        rowPtr[row + 1] = nnz_encountered;
+      }
+    };
+    // Move A to CSR
+    denseToCSR(A_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_);
+    // Move B to CSR
+    denseToCSR(B_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_);
+    // Move C to CSR
+    // NOTE(review): C is populated from B_, not C_, exactly as before --
+    // confirm whether this is intentional or a copy-paste slip.
+    denseToCSR(B_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_);
+    // Creates one ArmPL CSR handle with the precision-specific entry point
+    // and aborts on failure. printCSR(n_armpl_, rowPtr, colIdx, vals, nnz_,
+    // flags_) can be called here when debugging.
+    const auto createHandle = [this](armpl_spmat_t* handle,
+                                     armpl_int_t* rowPtr,
+                                     armpl_int_t* colIdx, T* vals) {
+      if constexpr (std::is_same_v<T, float>) {
+        status_ = armpl_spmat_create_csr_s(handle, n_armpl_, n_armpl_, rowPtr,
+                                           colIdx, vals, flags_);
+      } else if constexpr (std::is_same_v<T, double>) {
+        status_ = armpl_spmat_create_csr_d(handle, n_armpl_, n_armpl_, rowPtr,
+                                           colIdx, vals, flags_);
+      } else {
+        // Unsupported element type: nothing is created and nothing is
+        // checked, matching the previous behaviour.
+        return;
+      }
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    };
+    createHandle(&A_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_);
+    createHandle(&B_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_);
+    createHandle(&C_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_);
+  }
+  /**
+   * Debug helper: print a CSR description to stdout.
+   *
+   * The output is unchanged for nz >= 1. For nz == 0 the column-index and
+   * value lists are printed as "[]" instead of reading ci[0] / v[0], which
+   * would be past the end of a zero-length buffer.
+   *
+   * @param n  Number of rows; n + 1 entries of `rp` are printed.
+   * @param rp Row-pointer array.
+   * @param ci Column-index array; `nz` entries are printed.
+   * @param v  Value array; `nz` entries are printed.
+   * @param nz Number of stored entries.
+   * @param f  Flags value, printed verbatim.
+   */
+  void printCSR(armpl_int_t n, armpl_int_t* rp, armpl_int_t* ci, T* v,
+                armpl_int_t nz, armpl_int_t f) {
+    std::cout << "\tn = " << n << std::endl;
+    // "size" is the byte width of one element of the array being printed.
+    std::cout << "\trow ptr (size = " << sizeof(rp[0]) << ") = [" << rp[0];
+    for (armpl_int_t i = 1; i < (n + 1); i++) {
+      std::cout << ", " << rp[i];
+    }
+    std::cout << "]" << std::endl << "\tcol ind (size = " << sizeof(ci[0]) <<
+    ") = [";
+    for (armpl_int_t i = 0; i < nz; i++) {
+      if (i > 0) {
+        std::cout << ", ";
+      }
+      std::cout << ci[i];
+    }
+    std::cout << "]" << std::endl << "\tvals (size = " << sizeof(v[0]) <<
+    ") = [";
+    for (armpl_int_t i = 0; i < nz; i++) {
+      if (i > 0) {
+        std::cout << ", ";
+      }
+      std::cout << v[i];
+    }
+    std::cout << "]" << std::endl << "\tflags = " << f << std::endl;
+  }
+  /** Status returned by the most recent ArmPL call. */
+  armpl_status_t status_;
+  /** Flags argument passed to armpl_spmat_create_csr_[sd]; set to 0 in
+   * toCSR_armpl(). */
+  armpl_int_t flags_;
+  /** Row and column count given to ArmPL for every matrix; copied from n_
+   * in toCSR_armpl(). */
+  armpl_int_t n_armpl_;
+  // CSR row-pointer and column-index buffers for A, B and C. They are
+  // allocated in toCSR_armpl() and freed in postLoopRequirements().
+  armpl_int_t* A_armpl_row_ptr_;
+  armpl_int_t* A_armpl_col_index_;
+  armpl_int_t* B_armpl_row_ptr_;
+  armpl_int_t* B_armpl_col_index_;
+  armpl_int_t* C_armpl_row_ptr_;
+  armpl_int_t* C_armpl_col_index_;
+  // ArmPL sparse-matrix handles; created in toCSR_armpl() and destroyed in
+  // postLoopRequirements().
+  armpl_spmat_t A_armpl_;
+  armpl_spmat_t B_armpl_;
+  armpl_spmat_t C_armpl_;
+  // Operation arguments passed to the ArmPL exec call for A and B
+  // (both "no transpose").
+  armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS;
+  armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS;
+}  // namespace cpu
\ No newline at end of file

From 893458824dc6d343e34a66207a7ebbfc9d67f9b3 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Mon, 21 Oct 2024 15:50:43 +0100
Subject: [PATCH 32/38] Beginning gemv ARMPL

 .idea/workspace.xml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index d27d844..5a61e8c 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -16,8 +16,7 @@
   <component name="ChangeListManager">
     <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding AOCL files">
-      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/" beforeDir="false" afterPath="$PROJECT_DIR$/" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" afterDir="false" />
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -550,7 +549,6 @@
   <component name="VcsManagerConfiguration">
-    <MESSAGE value="Adding basic sparse multiplication kernel for default CPU and GPU" />
     <MESSAGE value="Implementing cuSPARSE kernel" />
     <MESSAGE value="Trying to work out CSR malloc bug" />
     <MESSAGE value="cuSPARSE unified memory implementation" />
@@ -575,6 +573,7 @@
     <MESSAGE value="No longer overwriting B_" />
     <MESSAGE value="Adding kernel selection for gemv" />
     <MESSAGE value="Providing armpl with hints" />
-    <option name="LAST_COMMIT_MESSAGE" value="Providing armpl with hints" />
+    <MESSAGE value="Updating to show sparsity" />
+    <option name="LAST_COMMIT_MESSAGE" value="Updating to show sparsity" />
\ No newline at end of file

From 2e61261a2ea804360db9bd4adbbb031198552f7d Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Thu, 2 Jan 2025 12:03:18 +0000
Subject: [PATCH 33/38] still trying to figure out segfault...

 .idea/workspace.xml            |  32 +-
 ArmPL/sp_gemv.hh               | 175 +------
 cuBLAS/sp_gemv.hh              | 885 +++++++++++++++++++++++----------
 include/doGemm.hh              |  28 +-
 include/doGemv.hh              | 279 +++++++----
 include/kernels/CPU/sp_gemv.hh |   9 +
 6 files changed, 864 insertions(+), 544 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 5a61e8c..9592790 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -15,8 +15,13 @@
   <component name="ChangeListManager">
-    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding AOCL files">
-      <change afterPath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" afterDir="false" />
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Beginning gemv ARMPL">
+      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" afterDir="false" />
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -178,6 +183,7 @@
       <workItem from="1727941814674" duration="165000" />
       <workItem from="1727941995420" duration="22747000" />
       <workItem from="1729503392250" duration="1773000" />
+      <workItem from="1730878516596" duration="9915000" />
     <task id="LOCAL-00001" summary="trivial changes">
       <option name="closed" value="true" />
@@ -531,7 +537,23 @@
       <option name="project" value="LOCAL" />
-    <option name="localTasksCounter" value="45" />
+    <task id="LOCAL-00045" summary="Beginning gemv ARMPL">
+      <option name="closed" value="true" />
+      <created>1729522236773</created>
+      <option name="number" value="00045" />
+      <option name="presentableId" value="LOCAL-00045" />
+      <option name="project" value="LOCAL" />
+      <updated>1729522236773</updated>
+    </task>
+    <task id="LOCAL-00046" summary="Beginning gemv ARMPL">
+      <option name="closed" value="true" />
+      <created>1729522244950</created>
+      <option name="number" value="00046" />
+      <option name="presentableId" value="LOCAL-00046" />
+      <option name="project" value="LOCAL" />
+      <updated>1729522244950</updated>
+    </task>
+    <option name="localTasksCounter" value="47" />
     <servers />
   <component name="TypeScriptGeneratedFilesManager">
@@ -549,7 +571,6 @@
   <component name="VcsManagerConfiguration">
-    <MESSAGE value="Implementing cuSPARSE kernel" />
     <MESSAGE value="Trying to work out CSR malloc bug" />
     <MESSAGE value="cuSPARSE unified memory implementation" />
     <MESSAGE value="Now compiles" />
@@ -574,6 +595,7 @@
     <MESSAGE value="Adding kernel selection for gemv" />
     <MESSAGE value="Providing armpl with hints" />
     <MESSAGE value="Updating to show sparsity" />
-    <option name="LAST_COMMIT_MESSAGE" value="Updating to show sparsity" />
+    <MESSAGE value="Beginning gemv ARMPL" />
+    <option name="LAST_COMMIT_MESSAGE" value="Beginning gemv ARMPL" />
\ No newline at end of file
diff --git a/ArmPL/sp_gemv.hh b/ArmPL/sp_gemv.hh
index 818c95e..f39a764 100644
--- a/ArmPL/sp_gemv.hh
+++ b/ArmPL/sp_gemv.hh
@@ -20,14 +20,10 @@ class sp_gemv_cpu : public sp_gemv<T> {
   using sp_gemv<T>::callConsume;
   using sp_gemv<T>::m_;
   using sp_gemv<T>::n_;
-  using sp_gemv<T>::k_;
   using sp_gemv<T>::A_;
-  using sp_gemv<T>::B_;
-  using sp_gemv<T>::C_;
+  using sp_gemv<T>::x_;
+  using sp_gemv<T>::y_;
   using sp_gemv<T>::nnz_;
-  using sp_gemv<T>::A_vals_;
-  using sp_gemv<T>::B_vals_;
-  using sp_gemv<T>::C_vals_;
   /** Make call to the GEMM kernel. */
@@ -50,25 +46,20 @@ class sp_gemv_cpu : public sp_gemv<T> {
      * armpl_spmat_update_[sdcz]()
-    // Todo -- See if using armpl_spmat_hint can improve performance here.
-    //  If so, follow with optimisation functions
     if constexpr (std::is_same_v<T, float>) {
-      status_ = armpl_spmm_exec_s(transA_,
-                                  transB_,
+      status_ = armpl_spmv_exec_s(trans_,
-                                  B_armpl_,
+                                  x_,
-                                  C_armpl_);
+                                  y_);
     } else if constexpr (std::is_same_v<T, double>) {
-      status_ = armpl_spmm_exec_d(transA_,
-                                  transB_,
+      status_ = armpl_spmv_exec_d(trans_,
-                                  B_armpl_,
+                                  x_,
-                                  C_armpl_);
+                                  y_);
     } else {
       // Un-specialised class will not do any work - print error and exit.
       std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported."
@@ -98,12 +89,6 @@ class sp_gemv_cpu : public sp_gemv<T> {
       std::cout << "ERROR " << status_ << std::endl;
-    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_MEMORY,
-                               ARMPL_SPARSE_MEMORY_NOALLOCS);
-    if (status_ != ARMPL_STATUS_SUCCESS) {
-      std::cout << "ERROR " << status_ << std::endl;
-      exit(1);
-    }
     status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_STRUCTURE,
@@ -111,12 +96,6 @@ class sp_gemv_cpu : public sp_gemv<T> {
       std::cout << "ERROR " << status_ << std::endl;
-    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_STRUCTURE,
-                               ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
-    if (status_ != ARMPL_STATUS_SUCCESS) {
-      std::cout << "ERROR " << status_ << std::endl;
-      exit(1);
-    }
     // TODO -- will this be FEW?
     status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
@@ -125,12 +104,6 @@ class sp_gemv_cpu : public sp_gemv<T> {
       std::cout << "ERROR " << status_ << std::endl;
-    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
-                               ARMPL_SPARSE_INVOCATIONS_MANY);
-    if (status_ != ARMPL_STATUS_SUCCESS) {
-      std::cout << "ERROR " << status_ << std::endl;
-      exit(1);
-    }
     status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION,
@@ -138,12 +111,6 @@ class sp_gemv_cpu : public sp_gemv<T> {
       std::cout << "ERROR " << status_ << std::endl;
-    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION,
-                               ARMPL_SPARSE_OPERATION_NOTRANS);
-    if (status_ != ARMPL_STATUS_SUCCESS) {
-      std::cout << "ERROR " << status_ << std::endl;
-      exit(1);
-    }
     // TODO -- investigate whch is better here
     status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY,
@@ -152,12 +119,6 @@ class sp_gemv_cpu : public sp_gemv<T> {
       std::cout << "ERROR " << status_ << std::endl;
-    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY,
-                               ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT);
-    if (status_ != ARMPL_STATUS_SUCCESS) {
-      std::cout << "ERROR " << status_ << std::endl;
-      exit(1);
-    }
 //  TODO -- this is thorwing an error -- couldn't immediately fix so come
 //   back to
@@ -183,27 +144,10 @@ class sp_gemv_cpu : public sp_gemv<T> {
       std::cout << "ERROR " << status_ << std::endl;
-    status_ = armpl_spmat_destroy(B_armpl_);
-    if (status_ != ARMPL_STATUS_SUCCESS) {
-      std::cout << "ERROR " << status_ << std::endl;
-      exit(1);
-    }
-    status_ = armpl_spmat_destroy(C_armpl_);
-    if (status_ != ARMPL_STATUS_SUCCESS) {
-      std::cout << "ERROR " << status_ << std::endl;
-      exit(1);
-    }
     delete [] A_armpl_row_ptr_;
     delete [] A_armpl_col_index_;
     delete [] A_vals_;
-    delete [] B_armpl_row_ptr_;
-    delete [] B_armpl_col_index_;
-    delete [] B_vals_;
-    delete [] C_armpl_row_ptr_;
-    delete [] C_armpl_col_index_;
-    delete [] C_vals_;
   /** The constant value Alpha. */
@@ -235,42 +179,6 @@ class sp_gemv_cpu : public sp_gemv<T> {
-    // Move B to CSR
-    B_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
-    B_armpl_col_index_ = new armpl_int_t[nnz_];
-    B_vals_ = new T[nnz_];
-    B_armpl_row_ptr_[0] = 0;
-    nnz_encountered = 0;
-    for (int row = 0; row < n_; row++) {
-      B_armpl_row_ptr_[row + 1] = nnz_encountered;
-      for (int col = 0; col < n_; col++) {
-        if (B_[(row * n_) + col] != 0.0) {
-          B_armpl_col_index_[nnz_encountered] = col;
-          B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]);
-          nnz_encountered++;
-        }
-      }
-    }
-    // Move C to CSR
-    C_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
-    C_armpl_col_index_ = new armpl_int_t[nnz_];
-    C_vals_ = new T[nnz_];
-    C_armpl_row_ptr_[0] = 0;
-    nnz_encountered = 0;
-    for (int row = 0; row < n_; row++) {
-      C_armpl_row_ptr_[row + 1] = nnz_encountered;
-      for (int col = 0; col < n_; col++) {
-        if (B_[(row * n_) + col] != 0.0) {
-          C_armpl_col_index_[nnz_encountered] = col;
-          C_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]);
-          nnz_encountered++;
-        }
-      }
-    }
     if constexpr (std::is_same_v<T, float>) {
 //      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
 //                nnz_, flags_);
@@ -285,34 +193,6 @@ class sp_gemv_cpu : public sp_gemv<T> {
         std::cout << "ERROR " << status_ << std::endl;
-//      printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_,
-//                nnz_, flags_);
-      status_ = armpl_spmat_create_csr_s(&B_armpl_,
-                                         n_armpl_,
-                                         n_armpl_,
-                                         B_armpl_row_ptr_,
-                                         B_armpl_col_index_,
-                                         B_vals_,
-                                         flags_);
-      if (status_ != ARMPL_STATUS_SUCCESS) {
-        std::cout << "ERROR " << status_ << std::endl;
-        exit(1);
-      }
-//      printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_,
-//                nnz_, flags_);
-      status_ = armpl_spmat_create_csr_s(&C_armpl_,
-                                         n_armpl_,
-                                         n_armpl_,
-                                         C_armpl_row_ptr_,
-                                         C_armpl_col_index_,
-                                         C_vals_,
-                                         flags_);
-      if (status_ != ARMPL_STATUS_SUCCESS) {
-        std::cout << "ERROR " << status_ << std::endl;
-        exit(1);
-      }
     } else if constexpr (std::is_same_v<T, double>) {
 //      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
 //                nnz_, flags_
@@ -328,34 +208,6 @@ class sp_gemv_cpu : public sp_gemv<T> {
-//      printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_,
-//                nnz_, flags_);
-      status_ = armpl_spmat_create_csr_d(&B_armpl_,
-                                         n_armpl_,
-                                         n_armpl_,
-                                         B_armpl_row_ptr_,
-                                         B_armpl_col_index_,
-                                         B_vals_,
-                                         flags_);
-      if (status_ != ARMPL_STATUS_SUCCESS) {
-        std::cout << "ERROR " << status_ << std::endl;
-        exit(1);
-      }
-//      printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_,
-//                nnz_, flags_);
-      status_ = armpl_spmat_create_csr_d(&C_armpl_,
-                                         n_armpl_,
-                                         n_armpl_,
-                                         C_armpl_row_ptr_,
-                                         C_armpl_col_index_,
-                                         C_vals_,
-                                         flags_);
-      if (status_ != ARMPL_STATUS_SUCCESS) {
-        std::cout << "ERROR " << status_ << std::endl;
-        exit(1);
-      }
 //      std::cout << "Okay, all matrices made!!" << std::endl;
@@ -381,25 +233,20 @@ class sp_gemv_cpu : public sp_gemv<T> {
     std::cout << "]" << std::endl << "\tflags = " << f << std::endl;
   armpl_status_t status_;
   armpl_int_t flags_;
   armpl_int_t n_armpl_;
+  T* A_vals_;
   armpl_int_t* A_armpl_row_ptr_;
   armpl_int_t* A_armpl_col_index_;
-  armpl_int_t* B_armpl_row_ptr_;
-  armpl_int_t* B_armpl_col_index_;
-  armpl_int_t* C_armpl_row_ptr_;
-  armpl_int_t* C_armpl_col_index_;
   armpl_spmat_t A_armpl_;
-  armpl_spmat_t B_armpl_;
-  armpl_spmat_t C_armpl_;
-  armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS;
-  armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS;
+  armpl_sparse_hint_value trans_ = ARMPL_SPARSE_OPERATION_NOTRANS;
 }  // namespace cpu
diff --git a/cuBLAS/sp_gemv.hh b/cuBLAS/sp_gemv.hh
index 8027746..f35a63a 100644
--- a/cuBLAS/sp_gemv.hh
+++ b/cuBLAS/sp_gemv.hh
@@ -1,261 +1,624 @@
-//#pragma once
-//#ifdef GPU_CUBLAS
-//#include <cusparse_v2.h>
-//#include <cuda.h>
-//#include <cublas_v2.h>
-//#include <cuda_runtime.h>
-//#include <type_traits>
-//#include <random>
-//#include <iostream>
-//#include "../include/kernels/GPU/sp_gemv.hh"
-//#include "../include/utilities.hh"
-//#include "common.hh"
-//namespace gpu {
-///** A class for sparse GEMV GPU BLAS kernels. */
-//template <typename T>
-//class gemv_gpu : public gemv<T> {
-// public:
-//  using gemv<T>::gemv;
-//  using gemv<T>::initInputMatrixVector;
-//  using gemv<T>::m_;
-//  using gemv<T>::n_;
-//  using gemv<T>::A_;
-//  using gemv<T>::x_;
-//  using gemv<T>::y_;
-//  using gemv<T>::offload_;
-//  using gemv<T>::vecIncrement_;
-//  ~gemv_gpu() {
-//    if (alreadyInitialised_) {
-//      // Destroy the handle
-//      cublasCheckError(cublasDestroy(handle_));
-//      // Destroy streams after use
-//      cudaCheckError(cudaStreamDestroy(s1_));
-//      cudaCheckError(cudaStreamDestroy(s2_));
-//      cudaCheckError(cudaStreamDestroy(s3_));
-//    }
-//  }
-//  /** Initialise the required data structures.
-//   * `offload` refers to the data offload type:
-//   *  - Once:    Move data from host to device before all iterations & move from
-//   *             device to host after all iterations
-//   *  - Always:  Move data from host to device and device to host each iteration
-//   *  - Unified: Initialise data as unified memory; no data movement semantics
-//   *             required */
-//  void initialise(gpuOffloadType offload, int m, int n) override {
-//    if (!alreadyInitialised_) {
-//      alreadyInitialised_ = true;
-//      // Perform set-up which doesn't need to happen every problem size change.
-//      // Create a handle for CUBLAS
-//      cublasCheckError(cublasCreate(&handle_));
-//      // Get device identifier
-//      cudaCheckError(cudaGetDevice(&gpuDevice_));
-//      // Initialise 3 streams to asynchronously move data between host and
-//      // device
-//      cudaCheckError(cudaStreamCreate(&s1_));
-//      cudaCheckError(cudaStreamCreate(&s2_));
-//      cudaCheckError(cudaStreamCreate(&s3_));
-//    }
-//    offload_ = offload;
-//    m_ = m;
-//    n_ = n;
-//    if (offload_ == gpuOffloadType::unified) {
-//      cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * n_));
-//      cudaCheckError(cudaMallocManaged(&x_, sizeof(T) * n_));
-//      cudaCheckError(cudaMallocManaged(&y_, sizeof(T) * m_));
-//    } else {
-//      // Allocate matrices on host
-//      cudaCheckError(cudaMallocHost((void**)&A_, sizeof(T) * m_ * n_));
-//      cudaCheckError(cudaMallocHost((void**)&x_, sizeof(T) * n_));
-//      cudaCheckError(cudaMallocHost((void**)&y_, sizeof(T) * m_));
-//      // Allocate matrices on device
-//      cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * n_));
-//      cudaCheckError(cudaMalloc((void**)&x_device_, sizeof(T) * n_));
-//      cudaCheckError(cudaMalloc((void**)&y_device_, sizeof(T) * m_));
-//    }
-//    // Initialise the host data structures
-//    initInputMatrixVector();
-//  }
-// private:
-//  /** Perform any required steps before calling the GEMV kernel that should
-//   * be timed. */
-//  void preLoopRequirements() override {
-//    switch (offload_) {
-//      case gpuOffloadType::always: {
-//        // Offload data each iteration - no requirements
-//        break;
-//      }
-//      case gpuOffloadType::once: {
-//        // Offload input data from host to the device.
-//        cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_,
-//                                       cudaMemcpyHostToDevice, s1_));
-//        cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_,
-//                                       cudaMemcpyHostToDevice, s2_));
-//        cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_,
-//                                       cudaMemcpyHostToDevice, s3_));
-//        break;
-//      }
-//      case gpuOffloadType::unified: {
-//        // Prefetch input data to device
-//        cudaCheckError(
-//            cudaMemPrefetchAsync(A_, sizeof(T) * m_ * n_, gpuDevice_, s1_));
-//        cudaCheckError(
-//            cudaMemPrefetchAsync(x_, sizeof(T) * n_, gpuDevice_, s2_));
-//        cudaCheckError(
-//            cudaMemPrefetchAsync(y_, sizeof(T) * m_, gpuDevice_, s3_));
-//        break;
-//      }
-//    }
-//  }
-//  /** Make a call to the BLAS Library Kernel. */
-//  void callGemv() override {
-//    switch (offload_) {
-//      case gpuOffloadType::always: {
-//        // Offload input data from host to the device.
-//        cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_,
-//                                       cudaMemcpyHostToDevice, s1_));
-//        cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_,
-//                                       cudaMemcpyHostToDevice, s2_));
-//        cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_,
-//                                       cudaMemcpyHostToDevice, s3_));
-//        // Call cuBLAS GEMV kernel
-//        if constexpr (std::is_same_v<T, float>) {
-//          cublasCheckError(cublasSgemv(
-//              handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_),
-//              x_device_, vecIncrement_, &beta, y_device_, vecIncrement_));
-//        } else if constexpr (std::is_same_v<T, double>) {
-//          cublasCheckError(cublasDgemv(
-//              handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_),
-//              x_device_, vecIncrement_, &beta, y_device_, vecIncrement_));
-//        }
-//        // Offload output data from device to host
-//        cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_,
-//                                       cudaMemcpyDeviceToHost, s3_));
-//        // Ensure device has finished all work.
-//        cudaCheckError(cudaDeviceSynchronize());
-//        break;
-//      }
-//      case gpuOffloadType::once: {
-//        // Call cuBLAS GEMV kernel
-//        if constexpr (std::is_same_v<T, float>) {
-//          cublasCheckError(cublasSgemv(
-//              handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_),
-//              x_device_, vecIncrement_, &beta, y_device_, vecIncrement_));
-//        } else if constexpr (std::is_same_v<T, double>) {
-//          cublasCheckError(cublasDgemv(
-//              handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_),
-//              x_device_, vecIncrement_, &beta, y_device_, vecIncrement_));
-//        }
-//        break;
-//      }
-//      case gpuOffloadType::unified: {
-//        // Call cuBLAS GEMV kernel
-//        if constexpr (std::is_same_v<T, float>) {
-//          cublasCheckError(cublasSgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_,
-//                                       std::max(1, m_), x_, vecIncrement_,
-//                                       &beta, y_, vecIncrement_));
-//        } else if constexpr (std::is_same_v<T, double>) {
-//          cublasCheckError(cublasDgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_,
-//                                       std::max(1, m_), x_, vecIncrement_,
-//                                       &beta, y_, vecIncrement_));
-//        }
-//        break;
-//      }
-//    }
-//  }
-//  /** Perform any required steps after calling the GEMV kernel that should
-//   * be timed. */
-//  void postLoopRequirements() override {
-//    switch (offload_) {
-//      case gpuOffloadType::always: {
-//        // Offload data each iteration - no requirements
-//        break;
-//      }
-//      case gpuOffloadType::once: {
-//        // Offload output data from device to host
-//        cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_,
-//                                       cudaMemcpyDeviceToHost, s3_));
-//        // Ensure device has finished all work.
-//        cudaCheckError(cudaDeviceSynchronize());
-//        break;
-//      }
-//      case gpuOffloadType::unified: {
-//        // Ensure all output data resides on host once work has completed
-//        cudaCheckError(
-//            cudaMemPrefetchAsync(y_, sizeof(T) * m_, cudaCpuDeviceId, s3_));
-//        // Ensure device has finished all work.
-//        cudaCheckError(cudaDeviceSynchronize());
-//        break;
-//      }
-//    }
-//  }
-//  /** Do any necessary cleanup (free pointers, close library handles, etc.)
-//   * after Kernel has been called. */
-//  void postCallKernelCleanup() override {
-//    if (offload_ == gpuOffloadType::unified) {
-//      cudaFree(A_);
-//      cudaFree(x_);
-//      cudaFree(y_);
-//    } else {
-//      // Free the memory held on host and device
-//      cudaFreeHost((void*)A_);
-//      cudaFreeHost((void*)x_);
-//      cudaFreeHost((void*)y_);
-//      cudaFree(A_device_);
-//      cudaFree(x_device_);
-//      cudaFree(y_device_);
-//    }
-//  }
-//  /** Whether the initialise function has been called before. */
-//  bool alreadyInitialised_ = false;
-//  /** Handle used when calling cuBLAS. */
-//  cublasHandle_t handle_;
-//  /** CUDA Stream 1 - used to asynchronously move data between host and device.
-//   */
-//  cudaStream_t s1_;
-//  /** CUDA Stream 2 - used to asynchronously move data between host and device.
-//   */
-//  cudaStream_t s2_;
-//  /** CUDA Stream 3 - used to asynchronously move data between host and device.
-//   */
-//  cudaStream_t s3_;
-//  /** The ID of the target GPU Device. */
-//  int gpuDevice_;
-//  /** Input matrix A, held on the device. */
-//  T* A_device_;
-//  /** Input vector x, held on the device. */
-//  T* x_device_;
-//  /** Input vector y, held on the device. */
-//  T* y_device_;
-//  /** The constant value Alpha. */
-//  const T alpha = ALPHA;
-//  /** The constant value Beta. */
-//  const T beta = BETA;
-//}  // namespace gpu
\ No newline at end of file
+#pragma once
+#ifdef GPU_CUBLAS
+#include <cusparse_v2.h>
+#include <cuda_runtime_api.h>
+#include <type_traits>
+#include <random>
+#include <iostream>
+#include "../include/kernels/GPU/sp_gemv.hh"
+#include "../include/utilities.hh"
+#include "common.hh"
+namespace gpu {
+/** A class for sparse GEMV GPU kernels implemented with cuSPARSE. */
+template <typename T>
+class sp_gemv_gpu : public sp_gemv<T> {
+ public:
+  using sp_gemv<T>::sp_gemv;
+  using sp_gemv<T>::initInputMatrixVectorSparse;
+//  using sp_gemv<T>::toCSR_int;
+  using sp_gemv<T>::m_;
+  using sp_gemv<T>::n_;
+  using sp_gemv<T>::A_;
+  using sp_gemv<T>::x_;
+  using sp_gemv<T>::y_;
+  using sp_gemv<T>::offload_;
+  using sp_gemv<T>::sparsity_;
+  /** Destructor.
+   * The cuSPARSE handle and the three CUDA streams are created by every
+   * initialise() call and destroyed again by postCallKernelCleanup(), so
+   * nothing is left for the destructor to release.  Destroying the streams
+   * here as well would pass already-destroyed (or, if initialise() was never
+   * called, uninitialised) stream handles to cudaStreamDestroy().
+   * NOTE(review): assumes the benchmark driver pairs every initialise() with
+   * a postCallKernelCleanup() -- confirm against the sp_gemv base class. */
+  ~sp_gemv_gpu() {}
+	// ToDo -- No checksum for sparse yet.  Need to do
+  /** Initialise the required data structures.
+   * `offload` refers to the data offload type:
+   *  - Once:    Move data from host to device before all iterations & move from
+   *             device to host after all iterations
+   *  - Always:  Move data from host to device and device to host each iteration
+   *  - Unified: Initialise data as unified memory; no data movement semantics
+   *             required
+   * `n` is the dimension of the square matrix A and the length of the vectors
+   * x and y.  The CSR buffers are sized for 1 + n * n * (1 - sparsity)
+   * non-zero entries.
+   * A cuSPARSE handle and three CUDA streams are created on every call; they
+   * are destroyed again in postCallKernelCleanup(). */
+  void initialise(gpuOffloadType offload, int n, float sparsity) override {
+    std::cout << std::endl << "##############################" << std::endl
+              << "\tCUSPARSE GEMV\t\tInitialising n = " << n << "\tOffload"
+              << " type = " <<
+              (((offload == gpuOffloadType::unified) ? "Unified" : (offload
+              == gpuOffloadType::always) ? "Always" : "Once"))
+              << std::endl
+              << "##############################" << std::endl;
+    offload_ = offload;
+    sparsity_ = sparsity;
+    // Create a handle for cuSPARSE
+    cusparseCheckError(cusparseCreate(&handle_));
+    // Get device identifier (used as the prefetch target in unified mode)
+    cudaCheckError(cudaGetDevice(&gpuDevice_));
+    // Select the cuSPARSE value/compute type that matches T
+    if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F;
+    else if (std::is_same_v<T, double>) cudaDataType_ = CUDA_R_64F;
+    else {
+      std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
+      exit(1);
+    }
+    n_ = n;
+    // Initialise 3 streams to asynchronously move data between host and device
+    cudaCheckError(cudaStreamCreate(&s1_));
+    cudaCheckError(cudaStreamCreate(&s2_));
+    cudaCheckError(cudaStreamCreate(&s3_));
+    std::cout << "\tcuda streams created" << std::endl;
+    // Work out the sizes (in bytes) of all the CSR arrays and vectors
+    A_nnz_ = 1 + (uint64_t)(n_ * n_ * (1 - sparsity));
+    vals_size_ = sizeof(T) * A_nnz_;
+    cols_size_ = sizeof(int) * A_nnz_;
+    rows_size_ = sizeof(int) * (n_ + 1);
+    x_size_ = sizeof(T) * n_;
+    y_size_ = sizeof(T) * n_;
+    if (offload_ == gpuOffloadType::unified) {
+      // Unified memory: a single managed allocation per array
+      cudaCheckError(cudaMallocManaged(&A_val_, vals_size_));
+      cudaCheckError(cudaMallocManaged(&A_col_, cols_size_));
+      cudaCheckError(cudaMallocManaged(&A_row_, rows_size_));
+      cudaCheckError(cudaMallocManaged(&x_, x_size_));
+      cudaCheckError(cudaMallocManaged(&y_, y_size_));
+    } else {
+      // Host copies of the CSR arrays and vectors
+      A_val_ = (T*)malloc(vals_size_);
+      A_col_ = (int*)malloc(cols_size_);
+      A_row_ = (int*)malloc(rows_size_);
+      std::cout << "\tA_ local csr arrays made" << std::endl;
+      x_ = (T*)malloc(x_size_);
+      y_ = (T*)malloc(y_size_);
+      std::cout << "\tx_ and y_ local arrays made" << std::endl;
+      // Device copies of the CSR arrays and vectors
+      cudaCheckError(cudaMalloc((void**)&A_val_dev_, vals_size_));
+      cudaCheckError(cudaMalloc((void**)&A_col_dev_, cols_size_));
+      cudaCheckError(cudaMalloc((void**)&A_row_dev_, rows_size_));
+      std::cout << "\tA_ dev csr arrays made" << std::endl;
+      cudaCheckError(cudaMalloc((void**)&x_dev_, x_size_));
+      cudaCheckError(cudaMalloc((void**)&y_dev_, y_size_));
+      std::cout << "\tx_ and y_ dev arrays made" << std::endl;
+    }
+    // The inputs are generated into a dense n x n host matrix first and then
+    // converted to the CSR arrays that are handed to cuSPARSE.
+    A_ = (T*)malloc(sizeof(T) * n_ * n_);
+    std::cout << "\tA_ dense array made" << std::endl;
+    initInputMatrixVectorSparse();
+    std::cout << "\tinputs made" << std::endl;
+    toCSR_int(A_, n_, n_, A_val_, A_col_, A_row_);
+    std::cout << "\tA_ moved to CSR" << std::endl;
+//    std::cout << "_____Matrix A_____" << std::endl;
+//    printDenseMatrix(A_, n_, n_);
+//    std::cout << std::endl << std::endl;
+//    printCSR(A_val_, A_col_, A_row_, nnz_, n_, n_);
+    std::cout << "\tInitialising done!" << std::endl;
+  }
+ private:
+  /** Perform any required steps before calling the GEMV kernel that should
+   * be timed.
+   *  - Always:  only the cuSPARSE descriptors are created (over the device
+   *             buffers); the data itself is copied in callGemv().
+   *  - Once:    inputs are copied to the device, then the descriptors are
+   *             created over the device buffers.
+   *  - Unified: managed buffers are prefetched to the device, then the
+   *             descriptors are created over the managed buffers.
+   * The descriptors are destroyed in postLoopRequirements(). */
+  void preLoopRequirements() override {
+    std::cout << std::endl << "##############################" << std::endl
+              << "\tPreloop Requirements" << std::endl
+              << "##############################" << std::endl;
+    switch(offload_) {
+      case gpuOffloadType::always: {
+        createDescriptors(A_row_dev_, A_col_dev_, A_val_dev_, x_dev_, y_dev_);
+        break;
+      }
+      case gpuOffloadType::once: {
+        cudaCheckError(cudaMemcpy(A_val_dev_, A_val_, vals_size_,
+                                  cudaMemcpyHostToDevice));
+        cudaCheckError(cudaMemcpy(A_col_dev_, A_col_, cols_size_,
+                                  cudaMemcpyHostToDevice));
+        cudaCheckError(cudaMemcpy(A_row_dev_, A_row_, rows_size_,
+                                  cudaMemcpyHostToDevice));
+        std::cout << "\tA_ csr dev arrays sunc" << std::endl;
+        cudaCheckError(cudaMemcpy(x_dev_, x_, x_size_,
+                                       cudaMemcpyHostToDevice));
+        std::cout << "\tx_ dev array sunc" << std::endl;
+        cudaCheckError(cudaMemcpy(y_dev_, y_, y_size_,
+                                       cudaMemcpyHostToDevice));
+        std::cout << "\ty_ dev array sunc" << std::endl;
+        createDescriptors(A_row_dev_, A_col_dev_, A_val_dev_, x_dev_, y_dev_);
+        break;
+      }
+      case gpuOffloadType::unified: {
+        // Prefetch memory to device
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, vals_size_, gpuDevice_,
+                                            s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, cols_size_, gpuDevice_,
+                                            s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_row_, rows_size_, gpuDevice_,
+                                            s1_));
+        std::cout << "\tA_ csr dev arrays sunc" << std::endl;
+        cudaCheckError(cudaMemPrefetchAsync(x_, x_size_, gpuDevice_, s2_));
+        std::cout << "\tx_ dev array sunc" << std::endl;
+        cudaCheckError(cudaMemPrefetchAsync(y_, y_size_, gpuDevice_, s3_));
+        std::cout << "\ty_ dev array sunc" << std::endl;
+        cudaCheckError(cudaDeviceSynchronize());
+        // callGemv() and postLoopRequirements() use and destroy the
+        // descriptors in unified mode too, so they must exist here as well.
+        createDescriptors(A_row_, A_col_, A_val_, x_, y_);
+        break;
+      }
+    }
+  }
+  /** Create the cuSPARSE descriptors for the n_ x n_ CSR matrix A and the
+   * dense vectors x and y over the given buffers.  The buffers must be
+   * accessible from the device (device or managed allocations).  In
+   * particular `y` must be a real buffer: SpMV writes its result through the
+   * y descriptor, so it cannot be created over a null pointer. */
+  void createDescriptors(int* rowPtr, int* colIdx, T* vals, T* x, T* y) {
+    // Make matrix descriptor
+    cusparseCheckError(
+            cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, rowPtr, colIdx, vals,
+                              rType_, cType_, indType_, cudaDataType_));
+    std::cout << "\tA_ description made" << std::endl;
+    // Create vector descriptors
+    cusparseCheckError(cusparseCreateDnVec(&descrx_, n_, x, cudaDataType_));
+    std::cout << "\tx_ description made" << std::endl;
+    cusparseCheckError(cusparseCreateDnVec(&descry_, n_, y, cudaDataType_));
+    std::cout << "\ty_ description made" << std::endl;
+  }
+  /** Make a call to the BLAS Library Kernel.
+   * Every offload type follows the same cuSPARSE workflow:
+   *    cusparseSpMV_bufferSize -> allocate workspace ->
+   *    cusparseSpMV_preprocess -> cusparseSpMV -> free workspace
+   * `Always` additionally copies the CSR arrays and both vectors to the device
+   * before, and back to the host after, the kernel. */
+  void callGemv() override {
+    std::cout << std::endl << "##############################" << std::endl
+              << "\tCalling GEMV" << std::endl
+              << "##############################" << std::endl;
+    if (offload_ == gpuOffloadType::always) {
+      // Host -> device
+      cudaCheckError(cudaMemcpy(A_val_dev_, A_val_, vals_size_,
+                                cudaMemcpyHostToDevice));
+      cudaCheckError(cudaMemcpy(A_col_dev_, A_col_, cols_size_,
+                                cudaMemcpyHostToDevice));
+      cudaCheckError(cudaMemcpy(A_row_dev_, A_row_, rows_size_,
+                                cudaMemcpyHostToDevice));
+      std::cout << "\tA_ csr dev arrays sunc" << std::endl;
+      cudaCheckError(cudaMemcpy(x_dev_, x_, x_size_, cudaMemcpyHostToDevice));
+      std::cout << "\tx_ dev array sunc" << std::endl;
+      cudaCheckError(cudaMemcpy(y_dev_, y_, y_size_, cudaMemcpyHostToDevice));
+      std::cout << "\ty_ dev array sunc" << std::endl;
+
+      querySpMVBufferSize();
+      cudaCheckError(cudaMalloc(&buffer_, buffer_size_));
+      std::cout << "\tbuffer allocated" << std::endl;
+      preprocessAndRunSpMV();
+
+      // Device -> host
+      cudaCheckError(cudaMemcpy(A_val_, A_val_dev_, vals_size_,
+                                cudaMemcpyDeviceToHost));
+      cudaCheckError(cudaMemcpy(A_col_, A_col_dev_, cols_size_,
+                                cudaMemcpyDeviceToHost));
+      cudaCheckError(cudaMemcpy(A_row_, A_row_dev_, rows_size_,
+                                cudaMemcpyDeviceToHost));
+      std::cout << "\tA_ csr host arrays sunc" << std::endl;
+      cudaCheckError(cudaMemcpy(x_, x_dev_, x_size_, cudaMemcpyDeviceToHost));
+      std::cout << "\tx_ host array sunc" << std::endl;
+      cudaCheckError(cudaMemcpy(y_, y_dev_, y_size_, cudaMemcpyDeviceToHost));
+      std::cout << "\ty_ host array sunc" << std::endl;
+
+      // Release the workspace
+      cudaCheckError(cudaFree(buffer_));
+      std::cout << "\tBuffer 1 freed" << std::endl;
+      buffer_size_ = 0;
+    } else if (offload_ == gpuOffloadType::once) {
+      querySpMVBufferSize();
+      cudaCheckError(cudaMalloc(&buffer_, buffer_size_));
+      std::cout << "\tbuffer allocated" << std::endl;
+      // ToDo -- only preprocess once?
+      preprocessAndRunSpMV();
+      // Release the workspace
+      cudaCheckError(cudaFree(buffer_));
+      std::cout << "\tBuffer 1 freed" << std::endl;
+    } else if (offload_ == gpuOffloadType::unified) {
+      querySpMVBufferSize();
+      cudaCheckError(cudaMallocManaged(&buffer_, buffer_size_));
+      std::cout << "\tbuffer allocated" << std::endl;
+      preprocessAndRunSpMV();
+      // Release the workspace
+      cudaCheckError(cudaFree(buffer_));
+      buffer_size_ = 0;
+    }
+  }
+  /** Ask cuSPARSE how large the SpMV workspace must be; the answer is stored
+   * in buffer_size_. */
+  void querySpMVBufferSize() {
+    cusparseCheckError(cusparseSpMV_bufferSize(
+        handle_, opA_, &alpha, descrA_, descrx_, &beta, descry_,
+        cudaDataType_, alg_, &buffer_size_));
+    std::cout << "\tbufferSize run" << std::endl;
+  }
+  /** Run the SpMV preprocessing step followed by the SpMV itself, both using
+   * the workspace currently held in buffer_. */
+  void preprocessAndRunSpMV() {
+    cusparseCheckError(cusparseSpMV_preprocess(
+        handle_, opA_, &alpha, descrA_, descrx_, &beta, descry_,
+        cudaDataType_, alg_, buffer_));
+    std::cout << "\tpreProcess run" << std::endl;
+    cusparseCheckError(cusparseSpMV(
+        handle_, opA_, &alpha, descrA_, descrx_, &beta, descry_,
+        cudaDataType_, alg_, buffer_));
+    std::cout << "\tSpMV run" << std::endl;
+  }
+  /** Perform any required steps after calling the GEMV kernel that should
+   * be timed.
+   *  - Always:  data was already copied back in callGemv(); nothing to move.
+   *  - Once:    copy the CSR arrays and both vectors back to the host.
+   *  - Unified: prefetch the managed buffers back to the host and wait for
+   *             the device to finish.
+   * In every mode the cuSPARSE descriptors created in preLoopRequirements()
+   * are destroyed here, so they are not leaked across problem sizes. */
+  void postLoopRequirements() override {
+    std::cout << std::endl << "##############################" << std::endl
+              << "\tpostloop Requirements" << std::endl
+              << "##############################" << std::endl;
+    switch(offload_) {
+      case gpuOffloadType::always: {
+        // Offload data each iteration - no data movement required
+        break;
+      }
+      case gpuOffloadType::once: {
+        cudaCheckError(cudaMemcpy(A_val_, A_val_dev_, vals_size_,
+                                  cudaMemcpyDeviceToHost));
+        cudaCheckError(cudaMemcpy(A_col_, A_col_dev_, cols_size_,
+                                  cudaMemcpyDeviceToHost));
+        cudaCheckError(cudaMemcpy(A_row_, A_row_dev_, rows_size_,
+                                  cudaMemcpyDeviceToHost));
+        std::cout << "\tA_ csr host arrays sunc" << std::endl;
+        cudaCheckError(cudaMemcpy(x_, x_dev_, x_size_, cudaMemcpyDeviceToHost));
+        std::cout << "\tx_ host array sunc" << std::endl;
+        cudaCheckError(cudaMemcpy(y_, y_dev_, y_size_,
+                                       cudaMemcpyDeviceToHost));
+        std::cout << "\ty_ host array sunc" << std::endl;
+        break;
+      }
+      case gpuOffloadType::unified: {
+        // Ensure all data resides on host once work has completed
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, vals_size_,
+                                            cudaCpuDeviceId, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, cols_size_,
+                                            cudaCpuDeviceId, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_row_, rows_size_,
+                                            cudaCpuDeviceId, s1_));
+        std::cout << "\tA_ csr arrays sunc" << std::endl;
+        cudaCheckError(cudaMemPrefetchAsync(x_, x_size_, cudaCpuDeviceId, s2_));
+        std::cout << "\tx_ array sunc" << std::endl;
+        cudaCheckError(cudaMemPrefetchAsync(y_, y_size_, cudaCpuDeviceId, s3_));
+        std::cout << "\ty_ array sunc" << std::endl;
+        // Ensure device has finished all work.
+        cudaCheckError(cudaDeviceSynchronize());
+        std::cout << "\tdevice and host sunc" << std::endl;
+        break;
+      }
+    }
+    // Release the descriptors for every offload type (previously the
+    // `always` path never destroyed them).
+    cusparseCheckError(cusparseDestroySpMat(descrA_));
+    cusparseCheckError(cusparseDestroyDnVec(descrx_));
+    cusparseCheckError(cusparseDestroyDnVec(descry_));
+  }
+  /** Do any necessary cleanup (free pointers, close library handles, etc.)
+   * after Kernel has been called.
+   * Releases everything initialise() allocated: the dense matrix, the CSR
+   * arrays, the x and y vectors (host and device copies, or the managed
+   * allocations in unified mode), the cuSPARSE handle and the three streams. */
+  void postCallKernelCleanup() override {
+    free(A_);
+    if (offload_ == gpuOffloadType::unified) {
+      // Managed allocations made with cudaMallocManaged
+      cudaCheckError(cudaFree(A_val_));
+      cudaCheckError(cudaFree(A_col_));
+      cudaCheckError(cudaFree(A_row_));
+      cudaCheckError(cudaFree(x_));
+      cudaCheckError(cudaFree(y_));
+    } else {
+      // Host allocations made with malloc
+      free(A_val_);
+      free(A_col_);
+      free(A_row_);
+      free(x_);
+      free(y_);
+      // Device allocations made with cudaMalloc
+      cudaCheckError(cudaFree(A_val_dev_));
+      cudaCheckError(cudaFree(A_col_dev_));
+      cudaCheckError(cudaFree(A_row_dev_));
+      cudaCheckError(cudaFree(x_dev_));
+      cudaCheckError(cudaFree(y_dev_));
+    }
+    // Destroy the handle
+    cusparseCheckError(cusparseDestroy(handle_));
+    // Destroy streams after use
+    cudaCheckError(cudaStreamDestroy(s1_));
+    cudaCheckError(cudaStreamDestroy(s2_));
+    cudaCheckError(cudaStreamDestroy(s3_));
+  }
+    /** Convert a dense row-major matrix into CSR form.
+     * `dense` holds n_row rows of n_col values each.  Every non-zero value is
+     * appended to `vals` with its column in `col_index`; `row_ptr[r]` is the
+     * offset of row r's first entry and `row_ptr[n_row]` is the total number
+     * of non-zeros, so `row_ptr` must have room for n_row + 1 entries.
+     * `vals` and `col_index` must be large enough for every non-zero found;
+     * no bounds are checked here. */
+    void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index,
+             int* row_ptr) {
+      int nnz_encountered = 0;
+      for (int row = 0; row < n_row; row++) {
+        row_ptr[row] = nnz_encountered;
+        for (int col = 0; col < n_col; col++) {
+          // Row stride is the number of columns of the matrix passed in
+          const T value = dense[(row * n_col) + col];
+          if (value != 0.0) {
+            col_index[nnz_encountered] = col;
+            vals[nnz_encountered] = value;
+            nnz_encountered++;
+          }
+        }
+      }
+      // CSR needs the closing offset: without it the last row has no end
+      // marker and row_ptr[n_row] is read uninitialised by the library.
+      row_ptr[n_row] = nnz_encountered;
+    }
+  // ToDo -- the two following functions are useful for debugging.  They are
+  //  kept for that purpose, though they are not used by the benchmark
+  //  itself
+  /** Debug helper: print a row-major `rows` x `cols` matrix to stdout, one
+   * matrix row per line with values separated by " | ". */
+  void printDenseMatrix(T* M, int rows, int cols) {
+    for (int r = 0; r < rows; ++r) {
+      const T* rowStart = M + (r * cols);
+      std::cout << "| ";
+      int c = 0;
+      while (c < cols) {
+        std::cout << rowStart[c] << " | ";
+        ++c;
+      }
+      std::cout << std::endl;
+    }
+  }
+  /** Debug helper: print the three CSR arrays to stdout.  `rows + 1` row
+   * pointers are printed, followed by `nnz` column indices and `nnz` values.
+   * `cols` is accepted for symmetry with the matrix shape but is not used. */
+  void printCSR(T* values, int* col_indices, int* row_pointers, int nnz,
+                int rows, int cols) {
+    // Prints `count` elements of `data`, each followed by ", "
+    auto dump = [](const auto* data, int count) {
+      for (int idx = 0; idx < count; ++idx) {
+        std::cout << data[idx] << ", ";
+      }
+    };
+    std::cout << "\tRow pointers__" << std::endl;
+    dump(row_pointers, rows + 1);
+    std::cout << std::endl << "\tColumn Indices__" << std::endl;
+    dump(col_indices, nnz);
+    std::cout << std::endl << "\tValues__" << std::endl;
+    dump(values, nnz);
+    std::cout << std::endl;
+  }
+  /**
+   * ################################
+   *        CUSPARSE STUFF
+   * ################################
+   */
+  /** Handle used when calling cuSPARSE.  Created in initialise() and
+   * destroyed in postCallKernelCleanup(). */
+  cusparseHandle_t handle_;
+  /** CUDA Streams - used for the asynchronous prefetches of managed memory in
+   * unified mode (s1_: CSR arrays of A, s2_: x, s3_: y).
+   */
+  cudaStream_t s1_;
+  cudaStream_t s2_;
+  cudaStream_t s3_;
+  /** The ID of the target GPU Device. */
+  int gpuDevice_;
+	// cuSPARSE descriptors for the sparse matrix A and the dense vectors x, y
+	cusparseSpMatDescr_t descrA_;
+  cusparseDnVecDescr_t descrx_, descry_;
+	// Data type depends on kernel being run (CUDA_R_32F or CUDA_R_64F, chosen
+	// from T in initialise())
+	cudaDataType_t cudaDataType_;
+	/** Size in bytes of the SpMV workspace, as reported by
+	 * cusparseSpMV_bufferSize. */
+	size_t buffer_size_ = 0;
+  /** SpMV workspace; allocated and freed inside callGemv(). */
+  void* buffer_ = NULL;
+  /** Operation applied to A in the SpMV calls. */
+  cusparseOperation_t opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  // NOTE(review): opB_ is not referenced by any method visible in this class.
+  cusparseOperation_t opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  /** SpMV algorithm passed to every cusparseSpMV* call. */
+  cusparseSpMVAlg_t alg_ = CUSPARSE_SPMV_CSR_ALG2;
+  /** Index types of the CSR row-offset and column-index arrays (32-bit). */
+  cusparseIndexType_t rType_ = CUSPARSE_INDEX_32I;
+  cusparseIndexType_t cType_ = CUSPARSE_INDEX_32I;
+  /** Index base of the CSR arrays (zero-based). */
+  cusparseIndexBase_t indType_ = CUSPARSE_INDEX_BASE_ZERO;
+  /** The constant value Alpha. */
+  const T alpha = ALPHA;
+  /** The constant value Beta. */
+  const T beta = BETA;
+  /**
+   * ################################
+   *        Matrix A parameters
+   * ################################
+   */
+	/** CSR format vectors on the host (also used for USM) */
+	T* A_val_;
+	int *A_col_, *A_row_;
+  /** CSR format vectors on the device. */
+	T* A_val_dev_;
+	int *A_col_dev_, *A_row_dev_;
+  /** Metadata: number of non-zeros the CSR buffers are sized for, and the
+   * sizes in bytes of the values, column-index and row-pointer arrays. */
+  uint64_t A_nnz_, vals_size_, cols_size_, rows_size_;
+  /**
+   * ################################
+   *    Vectors x and y parameters
+   * ################################
+   */
+  /** Vectors on the host (also used for USM).
+   * NOTE(review): the visible methods use the inherited x_ and y_ for the
+   * host/managed vectors; x_host_ and y_host_ are never referenced there. */
+  T * x_host_, *y_host_;
+  /** Vectors on the device */
+  T *x_dev_, *y_dev_;
+  /** Metadata: sizes in bytes of the x and y vectors. */
+  uint64_t x_size_, y_size_;
+}  // namespace gpu
\ No newline at end of file
diff --git a/include/doGemm.hh b/include/doGemm.hh
index 93cc058..23caa6f 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -292,14 +292,14 @@ class doGemm {
         callDenseKernels(csvFile, 32, dim, 32);
-    // Close file
-    csvFile.close();
     if (doCPU_ && doGPU_) {
       // Print offload results to stdout
       printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)");
+      // Close file
+      csvFile.close();
     if (doSparse_) {    // Square sparse matrix - sparse matrix multiplication
       cpuGpu_always_ = cpuGpu_offloadThreshold();
@@ -307,10 +307,8 @@ class doGemm {
       cpuGpu_unified_ = cpuGpu_offloadThreshold();
       std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
               getKernelName() + "_sparse_square_99.csv");
-      if (upperLimit_ >= 32) {
-        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-          callSparseKernels(csvFile, dim, 0.99);
-        }
+      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+        callSparseKernels(csvFile, dim, 0.99);
       // Close file
@@ -325,10 +323,8 @@ class doGemm {
       cpuGpu_unified_ = cpuGpu_offloadThreshold();
       csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
               getKernelName() + "_sparse_square_999.csv");
-      if (upperLimit_ >= 32) {
-        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-          callSparseKernels(csvFile, dim, 0.999);
-        }
+      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+        callSparseKernels(csvFile, dim, 0.999);
       if (doCPU_ && doGPU_) {
@@ -341,10 +337,8 @@ class doGemm {
       cpuGpu_unified_ = cpuGpu_offloadThreshold();
       csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
               getKernelName() + "_sparse_square_9999.csv");
-      if (upperLimit_ >= 32) {
-        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-          callSparseKernels(csvFile, dim, 0.9999);
-        }
+      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+        callSparseKernels(csvFile, dim, 0.9999);
       if (doCPU_ && doGPU_) {
@@ -358,10 +352,8 @@ class doGemm {
       csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
                                           getKernelName() +
-      if (upperLimit_ >= 32) {
-        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-          callSparseKernels(csvFile, dim, 0.99999);
-        }
+      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+        callSparseKernels(csvFile, dim, 0.99999);
       if (doCPU_ && doGPU_) {
diff --git a/include/doGemv.hh b/include/doGemv.hh
index 2ab5fb1..0ecd814 100644
--- a/include/doGemv.hh
+++ b/include/doGemv.hh
@@ -8,6 +8,7 @@
 #if defined CPU_ARMPL
 #include "../ArmPL/gemv.hh"
+#include "../ArmPL/sp_gemv.hh"
 #elif defined CPU_ONEMKL
 #include "../oneMKL/CPU/gemv.hh"
 #elif defined CPU_AOCL
@@ -20,6 +21,7 @@
 #if defined GPU_CUBLAS
 #include "../cuBLAS/gemv.hh"
+#include "../cuBLAS/sp_gemv.hh"
 #elif defined GPU_ONEMKL
 #include "../oneMKL/GPU/gemv.hh"
 #elif defined GPU_ROCBLAS
@@ -45,11 +47,13 @@ class doGemv {
-        gemvCpu_(iterations_)
+        gemvCpu_(iterations_),
+        spGemvCpu_(iterations_)
-        gemvGpu_(iterations_)
+        gemvGpu_(iterations_),
+        spGemvGpu_(iterations_)
     static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) &&
@@ -72,125 +76,148 @@ class doGemv {
           initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv");
       for (int dim = startDimention_; dim <= upperLimit_; dim++) {
         // M = dim, N = dim;
-        callKernels(csvFile, dim, dim);
+        callDenseKernels(csvFile, dim, dim);
       // Close file
       if (doCPU_ && doGPU_) {
         // Print offload results to stdout
         printOffloadThreshold("Square x Vector (M=N)");
-  #endif
-    // Rectangular Problem Sizes:
-    // Tall and thin x Vector
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_tall-thin_vector_M=16N.csv");
-    int N = startDimention_;
-    int M = 16 * N;
-    while (M <= upperLimit_) {
-      callKernels(csvFile, M, N);
-      M += 16;
-      N++;
-    }
-    // Close file
-    csvFile.close();
+      // Rectangular Problem Sizes:
+      // Tall and thin x Vector
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_tall-thin_vector_M=16N.csv");
+      int N = startDimention_;
+      int M = 16 * N;
+      while (M <= upperLimit_) {
+        callDenseKernels(csvFile, M, N);
+        M += 16;
+        N++;
+      }
+      // Close file
+      csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Tall-and-Thin x Vector (M=16N)");
-    }
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Tall-and-Thin x Vector (M=16N)");
+      }
-    // Tall and thin x Vector
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_tall-thin_vector_M_N=32.csv");
-    if (upperLimit_ >= 32) {
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        // M = dim, N = 32;
-        callKernels(csvFile, dim, 32);
+      // Tall and thin x Vector
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_tall-thin_vector_M_N=32.csv");
+      if (upperLimit_ >= 32) {
+        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+          // M = dim, N = 32;
+          callDenseKernels(csvFile, dim, 32);
+        }
-    }
-    // Close file
-    csvFile.close();
+      // Close file
+      csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Tall-and-Thin x Vector (M, N=32)");
-    }
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Tall-and-Thin x Vector (M, N=32)");
+      }
-    // Short and wide x Vector
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_short-wide_vector_N=16M.csv");
-    M = startDimention_;
-    N = 16 * M;
-    while (N <= upperLimit_) {
-      callKernels(csvFile, M, N);
-      M++;
-      N += 16;
-    }
-    // Close file
-    csvFile.close();
+      // Short and wide x Vector
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_short-wide_vector_N=16M.csv");
+      M = startDimention_;
+      N = 16 * M;
+      while (N <= upperLimit_) {
+        callDenseKernels(csvFile, M, N);
+        M++;
+        N += 16;
+      }
+      // Close file
+      csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Short-and-Wide x Vector (N=16M)");
-    }
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Short-and-Wide x Vector (N=16M)");
+      }
-    // Short and wide x Vector
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_short-wide_vector_M=32_N.csv");
-    if (upperLimit_ >= 32) {
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        // M = 32, N = dim;
-        callKernels(csvFile, 32, dim);
+      // Short and wide x Vector
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_short-wide_vector_M=32_N.csv");
+      if (upperLimit_ >= 32) {
+        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+          // M = 32, N = dim;
+          callDenseKernels(csvFile, 32, dim);
+        }
-    }
-    // Close file
-    csvFile.close();
+      // Close file
+      csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Short-and-Wide x Vector (M=32, N)");
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Short-and-Wide x Vector (M=32, N)");
+      }
+    if (doSparse_) {
+      // Sparse square matrix
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_sparse_square_9999.csv");
+      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+        callSparseKernels(csvFile, dim, 0.9999);
+      }
+      // Close file
+      csvFile.close();
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Sparse square // sparsity = 0.9999");
+      }
-  }
+      csvFile.close();
+    }
   /** Call the appropriate CPU and GPU GEMV kernels. */
-  void callKernels(std::ofstream& csvFile, const int M, const int N) {
+  void callDenseKernels(std::ofstream& csvFile, const int M, const int N) {
     const double probSize = calcKib(M, N);
     const uint64_t flops = calcFlops(M, N);
     std::string kernelName = getKernelName();
@@ -275,6 +302,64 @@ class doGemv {
+  /** Call the CPU and GPU sparse GEMV kernels for an N x N problem and write
+   * one CSV line per result.
+   *
+   * @param csvFile   Output stream that the result lines are written to.
+   * @param N         Problem dimension; passed as both M and N to the size,
+   *                  FLOP and CSV helpers.
+   * @param sparsity  Sparsity value forwarded to the kernels' initialise()
+   *                  calls and recorded in every CSV line. */
+  void callSparseKernels(std::ofstream& csvFile, const int N,
+                         const float sparsity) {
+    // Problem size and FLOP count come from the shared helpers, called with
+    // (N, N).
+    const double probSize = calcKib(N, N);
+    const uint64_t flops = calcFlops(N, N);
+    std::string kernelName = getKernelName();
+    time_checksum_gflop cpuResult;
+    time_checksum_gflop gpuResult_once;
+    time_checksum_gflop gpuResult_always;
+    time_checksum_gflop gpuResult_unified;
+    // NOTE(review): no checksum comparison or offload-threshold update is
+    // performed here, although the caller prints an offload threshold after
+    // the sparse sweep - confirm whether that is intended.
+    if (doCPU_) {
+      spGemvCpu_.initialise(N, sparsity);
+      // Assign to the function-scope result. Re-declaring the variable here
+      // would shadow it and leave the outer cpuResult default-constructed.
+      cpuResult = spGemvCpu_.compute();
+      cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
+      // Write result to CSV file
+      writeLineToCsv(csvFile, "cpu", kernelName, N, N, 0, probSize, sparsity,
+                     iterations_, cpuResult.runtime, cpuResult.gflops);
+    }
+    if (doGPU_) {
+      // - ONCE : Offload to/from GPU once before all iterations and once
+      // after
+      spGemvGpu_.initialise(gpuOffloadType::once, N, sparsity);
+      gpuResult_once = spGemvGpu_.compute();
+      gpuResult_once.gflops =
+          calcGflops(flops, iterations_, gpuResult_once.runtime);
+      // - ALWAYS: Offload to/from GPU every iteration
+      spGemvGpu_.initialise(gpuOffloadType::always, N, sparsity);
+      gpuResult_always = spGemvGpu_.compute();
+      gpuResult_always.gflops =
+          calcGflops(flops, iterations_, gpuResult_always.runtime);
+      // - UNIFIED : data passed from host to device (and device to host) as
+      //             needed
+      spGemvGpu_.initialise(gpuOffloadType::unified, N, sparsity);
+      gpuResult_unified = spGemvGpu_.compute();
+      gpuResult_unified.gflops =
+          calcGflops(flops, iterations_, gpuResult_unified.runtime);
+      // Write results to CSV file (one line per offload strategy)
+      writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, 0, probSize,
+                     sparsity, iterations_, gpuResult_once.runtime,
+                     gpuResult_once.gflops);
+      writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, 0,
+                     probSize, sparsity, iterations_, gpuResult_always.runtime,
+                     gpuResult_always.gflops);
+      writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, 0, probSize,
+                     sparsity, iterations_, gpuResult_unified.runtime,
+                     gpuResult_unified.gflops);
+    }
+  }
   /** Ensure all CPU and GPU checksums are within the permitted limit of
    * eachother. */
   void checkChecksums(time_checksum_gflop cpuResult,
@@ -506,11 +591,13 @@ class doGemv {
   /** The GEMV CPU kernel. */
   cpu::gemv_cpu<T> gemvCpu_;
+  cpu::sp_gemv_cpu<T> spGemvCpu_;
   /** The GEMV GPU kernel. */
   gpu::gemv_gpu<T> gemvGpu_;
+  gpu::sp_gemv_gpu<T> spGemvGpu_;
   /** The point at which offloading to GPU (offload once) becomes worthwhile. */
diff --git a/include/kernels/CPU/sp_gemv.hh b/include/kernels/CPU/sp_gemv.hh
index 0c84cb0..28b0caf 100644
--- a/include/kernels/CPU/sp_gemv.hh
+++ b/include/kernels/CPU/sp_gemv.hh
@@ -27,6 +27,11 @@ namespace cpu {
           n_ = n;
           sparsity_ = sparsity;
+          // Note that the below should be the same as the edges calculation
+          // used in the initInputMatricesSparse function.  If changed here,
+          // change there
+          nnz_ = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity_));
           A_ = (T*)malloc(sizeof(T) * m_ * n_);
           x_ = (T*)malloc(sizeof(T) * n_);
           y_ = (T*)malloc(sizeof(T) * m_);
@@ -35,6 +40,9 @@ namespace cpu {
+    protected:
+        uint64_t nnz_;
         /** Do any necessary cleanup (free pointers, close library handles, etc.)
          * after Kernel has been called. */
@@ -43,5 +51,6 @@ namespace cpu {
 }  // namespace cpu
\ No newline at end of file

From bc70814a714608e7f492d5e331150f8a68263ced Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Thu, 2 Jan 2025 13:11:51 +0000
Subject: [PATCH 34/38] Getting rid of old oneMKL sparse file

 oneMKL/CPU/sp_gemm.hh | 239 ------------------------------------------
 1 file changed, 239 deletions(-)
 delete mode 100644 oneMKL/CPU/sp_gemm.hh

diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh
deleted file mode 100644
index 0b4e32b..0000000
--- a/oneMKL/CPU/sp_gemm.hh
+++ /dev/null
@@ -1,239 +0,0 @@
-#pragma once
-#ifdef CPU_ONEMKL
-#include <mkl.h>
-#include <algorithm>
-#include "../../include/kernels/CPU/sp_gemm.hh"
-#include "../../include/utilities.hh"
-namespace cpu {
-/** A class for GEMM CPU BLAS kernels. */
-template <typename T>
-class sp_gemm_cpu : public sp_gemm<T> {
- public:
-  using sp_gemm<T>::sp_gemm;
-  using sp_gemm<T>::initInputMatricesSparse;
-  using sp_gemm<T>::toCSR;
-  using sp_gemm<T>::callConsume;
-  using sp_gemm<T>::n_;
-  using sp_gemm<T>::A_;
-  using sp_gemm<T>::B_;
-  using sp_gemm<T>::C_;
-  /** Initialise the required data structures. */
-  void initialise(int n, float sparsity) {
-    A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64);
-    B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64);
-    C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64);
-    n_ = n * 100;
-    nnz_ = (1 + (int)(n_ * n_ * (1 - sparsity)));
-    values_A_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN);
-    columns_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN);
-    rowIndex_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN);
-    values_B_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN);
-    columns_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN);
-    rowIndex_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN);
-    x_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);
-    y_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);
-    rslt_mv_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);
-    rslt_mv_trans_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);
-    // Initialise the matricies
-    initInputMatricesSparse(sparsity);
-    descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL;
-    // Transfer from dense to CSR format
-    toCSR_mkl(A_, n_, n_, values_A_, columns_A_, rowIndex_A_);
-    toCSR_mkl(B_, n_, n_, values_B_, columns_B_, rowIndex_B_);
-    // ToDo -- Set values for x and y (which are vectors of length n_?)
-    if constexpr (std::is_same_v<T, float>) {
-      CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrA_,
-                                                    SPARSE_INDEX_BASE_ZERO, n_,
-                                                    n_, rowIndex_A_,
-                                                    rowIndex_A_+1, columns_A_,
-                                                    values_A_),
-                            "Error after MKL_SPARSE_D_CREATE_CSR for csrA\n");
-      CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrB_,
-                                                    SPARSE_INDEX_BASE_ZERO, n_,
-                                                    n_, rowIndex_B_,
-                                                    rowIndex_B_+1, columns_B_,
-                                                    values_B_),
-                            "Error after MKL_SPARSE_D_CREATE_CSR for csrB\n");
-    } else if constexpr (std::is_same_v<T, double>) {
-      CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrA_,
-                                                    SPARSE_INDEX_BASE_ZERO, n_,
-                                                    n_, rowIndex_A_,
-                                                    rowIndex_A_+1, columns_A_,
-                                                    values_A_),
-                            "Error after MKL_SPARSE_D_CREATE_CSR for csrA\n");
-      CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrB_,
-                                                    SPARSE_INDEX_BASE_ZERO, n_,
-                                                    n_, rowIndex_B_,
-                                                    rowIndex_B_+1, columns_B_,
-                                                    values_B_),
-                            "Error after MKL_SPARSE_D_CREATE_CSR for csrB\n");
-    } else {
-      std::cout << "ERROR - Datatype for OneMKL CPU spGEMM kernel not "
-                   "supported." << std::endl;
-      exit(1)
-    };
-                                            csrA_, csrB_, &csrC_),
-                            "Error after MKL_SPARSE_SPMM\n");
-    // ToDo -- check that transpose is what I want here
-    CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrA_,
-                                                 SPARSE_OPERATION_TRANSPOSE,
-                                                 descr_type_gen_, 1),
-                          "Error after MKL_SPARSE_SET_MV_HINT with csrA_\n");
-    CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrB_,
-                                                 SPARSE_OPERATION_NON_TRANSPOSE,
-                                                 descr_type_gen_, 1),
-                          "Error after MKL_SPARSE_SET_MV_HINT with csrB_\n");
-    CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrC_,
-                                                 SPARSE_OPERATION_NON_TRANSPOSE,
-                                                 descr_type_gen_, 1),
-                          "Error after MKL_SPARSE_SET_MV_HINT with csrC_\n");
-    CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrA_),
-                          "Error after MKL_SPARSE_OPTIMIZE with csrA_\n");
-    CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrB_),
-                          "Error after MKL_SPARSE_OPTIMIZE with csrB_\n");
-    CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrC_),
-                          "Error after MKL_SPARSE_OPTIMIZE with csrC_\n");
-  }
- private:
-  /** Make call to the GEMM kernel. */
-  void callGemm() override {
-    if constexpr (std::is_same_v<T, float>) {
-      .0, csrC_, descr_type_gen_, x_, 0.0, rslt_mv_),
-                            "Error after MKL_SPARSE_S_MV for csrC_ * x_\n");
-      left_ = cblas_sdot(n_, rstl_mv_, 1, y_, 1);
-      .0, csrB_, descr_type_gen_, x, 0.0, trslt_mv_),
-                            "Error adter MKL_SPARSE_S_MV for csrB_ * x_\n");
-                                            csrA_, descr_type_gen_, y_, 0.0,
-                                            rslt_mv_trans_),
-                            "Error adter MKL_SPARSE_S_MV for csrA_ * y_\n");
-      right_ = cblas_sdot(n_, rslt_mv_, 1, rslt_mv_trans_, 1);
-      residual = fabs(left - right)/(fabs(left) + 1);
-      CALL_AND_CHECK_STATUS(mkl_sparse_s_export_csr(csrC_, &indexing_,
-                                                    &rows_, &cols_,
-                                                    &pointerB_C_,
-                                                    &pointerE_C_,
-                                                    &columns_C_, &values_C_),
-                            "Error after MKL_SPARSE_S_EXPORT_CSR\n");
-    } else if constexpr (std::is_same_v<T, double) {
-      .0, csrC_, descr_type_gen_, x_, 0.0, rslt_mv_),
-                            "Error after MKL_SPARSE_D_MV for csrC_ * x_\n");
-      left_ = cblas_ddot(n_, rstl_mv_, 1, y_, 1);
-      .0, csrB_, descr_type_gen_, x, 0.0, trslt_mv_),
-                            "Error adter MKL_SPARSE_D_MV for csrB_ * x_\n");
-                                            csrA_, descr_type_gen_, y_, 0.0,
-                                            rslt_mv_trans_),
-                            "Error adter MKL_SPARSE_D_MV for csrA_ * y_\n");
-      right_ = cblas_ddot(n_, rslt_mv_, 1, rslt_mv_trans_, 1);
-      residual = fabs(left - right)/(fabs(left) + 1);
-      CALL_AND_CHECK_STATUS(mkl_sparse_d_export_csr(csrC_, &indexing_,
-                                                    &rows_, &cols_,
-                                                    &pointerB_C_,
-                                                    &pointerE_C_,
-                                                    &columns_C_, &values_C_),
-                            "Error after MKL_SPARSE_D_EXPORT_CSR\n");
-    }
-    // Ensure compiler doesn't optimise away the work being done
-    callConsume();
-  }
-  /** Perform any required steps before calling the GEMM kernel that should
-   * be timed. */
-  void preLoopRequirements() override {}
-  /** Perform any required steps after calling the GEMM kernel that should
-   * be timed. */
-  void postLoopRequirements() override {}
-  /** Do any necessary cleanup (free pointers, close library handles, etc.)
-   * after Kernel has been called. */
-  void postCallKernelCleanup() override {
-    if (mkl_sparse_destroy(csrC_) != SPARSE_STATUS_SUCCESS) {
-      printf(" Error after MKL_SPARSE_DESTROY, csrC_\n");
-      fflush(0);
-      status = 1;
-    }
-    //Deallocate arrays for which we allocate memory ourselves.
-    mkl_free(rslt_mv_trans_);
-    mkl_free(rslt_mv-);
-    mkl_free(x_);
-    mkl_free(y_);
-    //Release matrix handle and deallocate arrays for which we allocate memory ourselves.
-    if (mkl_sparse_destroy(csrA_) != SPARSE_STATUS_SUCCESS) {
-      printf("Error after MKL_SPARSE_DESTROY, csrA_\n");
-      fflush(0);
-      status = 1;
-    }
-    mkl_free(values_A_);
-    mkl_free(columns_A_);
-    mkl_free(rowIndex_A_);
-    if (mkl_sparse_destroy(csrB_) != SPARSE_STATUS_SUCCESS) {
-      printf("Error after MKL_SPARSE_DESTROY, csrB_\n");
-      fflush(0);
-      status = 1;
-    }
-    mkl_free(values_B_);
-    mkl_free(columns_B_);
-    mkl_free(rowIndex_B_);
-  }
-  int nnz_;
-  MKL_INT* columns_A_;
-  MKL_INT* columns_B_;
-  MKL_INT* columns_C_;
-  MKL_INT* rowIndex_A_;
-  MKL_INT* rowIndex_B_;
-  MKL_INT* pointerB_C_;
-  MKL_INT* pointerE_C_;
-  T* rslt_mv_;
-  T* rslt_mv_trans_;
-  T* x_;
-  T* y_;
-  T left_, right_, residual_;
-  MKL_INT rows_, cols_, i_, j_, ii_, status_;
-  sparse_index_base_t indexing_;
-  struct matrix_descr descr_type_gen_;
-  sparse_matrix_t csrA_, csrB_, csrC_;
-}  // namespace cpu
\ No newline at end of file

From 52d5e913fe3715a5da5cb1f32f1b5740fc55ce1b Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Tue, 7 Jan 2025 16:52:51 +0000
Subject: [PATCH 35/38] Refactoring to make individual files relate to a single

 .idea/workspace.xml                           |  39 +-
 AOCL/sp_gemm.hh                               |  88 ---
 ArmPL/{sp_gemv.hh => spgemv.hh}               |   0
 ArmPL/{sp_gemm.hh => spmm.hh}                 | 327 ++++++-----
 cuBLAS/{sp_gemv.hh => spgemv.hh}              |   0
 cuBLAS/{sp_gemm.hh => spmm.hh}                | 264 +++++----
 include/doGemm.hh                             | 550 +++++++-----------
 include/doGemv.hh                             | 340 ++++-------
 include/doSpgemm.hh                           |   8 +
 include/doSpgemv.hh                           |   8 +
 include/doSpmm.hh                             | 445 ++++++++++++++
 include/kernels/CPU/sp_gemm.hh                | 108 ----
 include/kernels/CPU/{sp_gemv.hh => spgemv.hh} |   0
 include/kernels/CPU/spgmm.hh                  |   8 +
 include/kernels/CPU/spmm.hh                   |  60 ++
 include/kernels/GPU/sp_gemm.hh                |  28 -
 include/kernels/GPU/spgemm.hh                 |   8 +
 include/kernels/GPU/{sp_gemv.hh => spgemv.hh} |   0
 include/kernels/GPU/spmm.hh                   |  28 +
 include/kernels/gemm.hh                       | 128 ----
 include/kernels/spgemm.hh                     |   8 +
 include/kernels/spgemv.hh                     |   8 +
 include/kernels/spmm.hh                       | 168 ++++++
 include/main.hh                               |   3 +
 src/                                   | 192 +++---
 25 files changed, 1545 insertions(+), 1271 deletions(-)
 delete mode 100644 AOCL/sp_gemm.hh
 rename ArmPL/{sp_gemv.hh => spgemv.hh} (100%)
 rename ArmPL/{sp_gemm.hh => spmm.hh} (87%)
 rename cuBLAS/{sp_gemv.hh => spgemv.hh} (100%)
 rename cuBLAS/{sp_gemm.hh => spmm.hh} (80%)
 create mode 100644 include/doSpgemm.hh
 create mode 100644 include/doSpgemv.hh
 create mode 100644 include/doSpmm.hh
 delete mode 100644 include/kernels/CPU/sp_gemm.hh
 rename include/kernels/CPU/{sp_gemv.hh => spgemv.hh} (100%)
 create mode 100644 include/kernels/CPU/spgmm.hh
 create mode 100644 include/kernels/CPU/spmm.hh
 delete mode 100644 include/kernels/GPU/sp_gemm.hh
 create mode 100644 include/kernels/GPU/spgemm.hh
 rename include/kernels/GPU/{sp_gemv.hh => spgemv.hh} (100%)
 create mode 100644 include/kernels/GPU/spmm.hh
 create mode 100644 include/kernels/spgemm.hh
 create mode 100644 include/kernels/spgemv.hh
 create mode 100644 include/kernels/spmm.hh

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 9592790..84d08df 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -15,13 +15,30 @@
   <component name="ChangeListManager">
-    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Beginning gemv ARMPL">
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Getting rid of old oneMKL sparse file">
+      <change afterPath="$PROJECT_DIR$/include/doSpgemm.hh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/include/doSpgemv.hh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/include/doSpmm.hh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/include/kernels/CPU/spgmm.hh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/include/kernels/spgemm.hh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/include/kernels/spgemv.hh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/include/kernels/spmm.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/AOCL/sp_gemm.hh" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spmm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spmm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spgemv.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/GPU/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spmm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/GPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/main.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/main.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/src/" beforeDir="false" afterPath="$PROJECT_DIR$/src/" afterDir="false" />
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -553,7 +570,15 @@
       <option name="project" value="LOCAL" />
-    <option name="localTasksCounter" value="47" />
+    <task id="LOCAL-00047" summary="Getting rid of old oneMKL sparse file">
+      <option name="closed" value="true" />
+      <created>1735823512058</created>
+      <option name="number" value="00047" />
+      <option name="presentableId" value="LOCAL-00047" />
+      <option name="project" value="LOCAL" />
+      <updated>1735823512058</updated>
+    </task>
+    <option name="localTasksCounter" value="48" />
     <servers />
   <component name="TypeScriptGeneratedFilesManager">
@@ -571,7 +596,6 @@
   <component name="VcsManagerConfiguration">
-    <MESSAGE value="Trying to work out CSR malloc bug" />
     <MESSAGE value="cuSPARSE unified memory implementation" />
     <MESSAGE value="Now compiles" />
     <MESSAGE value="Now compiles with fewer runtime errors" />
@@ -596,6 +620,7 @@
     <MESSAGE value="Providing armpl with hints" />
     <MESSAGE value="Updating to show sparsity" />
     <MESSAGE value="Beginning gemv ARMPL" />
-    <option name="LAST_COMMIT_MESSAGE" value="Beginning gemv ARMPL" />
+    <MESSAGE value="Getting rid of old oneMKL sparse file" />
+    <option name="LAST_COMMIT_MESSAGE" value="Getting rid of old oneMKL sparse file" />
\ No newline at end of file
diff --git a/AOCL/sp_gemm.hh b/AOCL/sp_gemm.hh
deleted file mode 100644
index 4fc178b..0000000
--- a/AOCL/sp_gemm.hh
+++ /dev/null
@@ -1,88 +0,0 @@
-#pragma once
-#ifdef CPU_AOCL
-#include <blis.h>
-#include "../include/kernels/CPU/gemm.hh"
-#include "../include/utilities.hh"
-namespace cpu {
-/** A class for GEMM CPU BLAS kernels. */
-template <typename T>
-class gemm_cpu : public gemm<T> {
- public:
-  using gemm<T>::gemm;
-  using gemm<T>::callConsume;
-  using gemm<T>::m_;
-  using gemm<T>::n_;
-  using gemm<T>::k_;
-  using gemm<T>::A_;
-  using gemm<T>::B_;
-  using gemm<T>::C_;
- private:
-  /** Make call to the GEMM kernel. */
-  void callGemm() override {
-    if constexpr (std::is_same_v<T, float>) {
-      bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_,
-                rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_),
-                &beta, C_, rowStride, std::max(1, m_));
-    } else if constexpr (std::is_same_v<T, double>) {
-      // Todo -- base?
-      aoclsparse_create_dscr(&A_csr_, base, n_, n_, nnz_,
-      (),,;
-      aoclsparse_create_dscr(&B_csr_, base, n_, n_, nnz_,
-      (),,;
-      aoclsparse_spmm(aoclsparse_operation_none, A_csr_, B_csr_, &C_csr_);
-      aoclsparse_export_dcsr(C_csr_, &base, &C_M_, &C_N_, &nnz_C_,
-                             &csr_row_ptr_C_, &csr_col_ind_C_, (void**)
-                             &csr_val_C_);
-    } else {
-      // Un-specialised class will not do any work - print error and exit.
-      std::cout << "ERROR - Datatype for AOCL CPU GEMM kernel not supported."
-                << std::endl;
-      exit(1);
-    }
-    // Ensure compiler doesn't optimise away the work being done
-    callConsume();
-  }
-  /** Perform any required steps before calling the GEMM kernel that should
-   * be timed. */
-  void preLoopRequirements() override {}
-  /** Perform any required steps after calling the GEMM kernel that should
-   * be timed. */
-  void postLoopRequirements() override {}
-  /** The constant value Alpha. */
-  T alpha = ALPHA;
-  /** The constant value Beta. */
-  T beta = BETA;
-  /** The distance in elements to the next column. */
-  const int rowStride = 1;
-  aoclsparse_matrix A_csr_;
-  aoclsparse_int* csr_row_ptr_A_;
-  aoclsparse_int* csr_col_ind_A_;
-  T* csr_val_A_;
-  aoclsparse_matrix B_csr_;
-  aoclsparse_int* csr_row_ptr_B_;
-  aoclsparse_int* csr_col_ind_B_;
-  T* csr_val_B_;
-  aoclsparse_matrix C_csr_;
-  aoclsparse_int* csr_row_ptr_C_;
-  aoclsparse_int* csr_col_ind_C_;
-  T* csr_val_C_;
-  aoclsparse_int C_M_;
-  aoclsparse_int C_N_;
-  aoclsparse_status status;
-}  // namespace cpu
\ No newline at end of file
diff --git a/ArmPL/sp_gemv.hh b/ArmPL/spgemv.hh
similarity index 100%
rename from ArmPL/sp_gemv.hh
rename to ArmPL/spgemv.hh
diff --git a/ArmPL/sp_gemm.hh b/ArmPL/spmm.hh
similarity index 87%
rename from ArmPL/sp_gemm.hh
rename to ArmPL/spmm.hh
index e8e28a5..93ed4b5 100644
--- a/ArmPL/sp_gemm.hh
+++ b/ArmPL/spmm.hh
@@ -8,26 +8,177 @@
 #include <algorithm>
-#include "../include/kernels/CPU/sp_gemm.hh"
+#include "../include/kernels/CPU/spmm.hh"
 #include "../include/utilities.hh"
 namespace cpu {
 /** A class for GEMM CPU BLAS kernels. */
 template <typename T>
-class sp_gemm_cpu : public sp_gemm<T> {
+class spmm_cpu : public spmm<T> {
-  using sp_gemm<T>::sp_gemm;
-  using sp_gemm<T>::callConsume;
-  using sp_gemm<T>::m_;
-  using sp_gemm<T>::n_;
-  using sp_gemm<T>::k_;
-  using sp_gemm<T>::A_;
-  using sp_gemm<T>::B_;
-  using sp_gemm<T>::C_;
-  using sp_gemm<T>::nnz_;
-  using sp_gemm<T>::A_vals_;
-  using sp_gemm<T>::B_vals_;
-  using sp_gemm<T>::C_vals_;
+  using spmm<T>::spmm;
+  using spmm<T>::callConsume;
+  using spmm<T>::m_;
+  using spmm<T>::n_;
+  using spmm<T>::k_;
+  using spmm<T>::A_;
+  using spmm<T>::B_;
+  using spmm<T>::C_;
+  using spmm<T>::nnzA_;
+  using spmm<T>::nnzB_;
+ protected:
+  /** Convert the dense inputs into CSR arrays and create the ArmPL sparse
+   * matrix handles.
+   *  - A is read as an m x k row-major dense matrix, B as k x n.
+   *  - C is created as an m x n matrix with no stored entries.
+   * Exits the program if ArmPL reports a failure creating any handle. */
+  void toSparseFormat() override {
+    m_armpl_ = m_;
+    n_armpl_ = n_;
+    k_armpl_ = k_;
+    // ToDo -- check whether flags_ is correct!
+    flags_ = 0;
+    // Move A to CSR
+    A_armpl_row_ptr_ = new armpl_int_t[m_ + 1];
+    A_armpl_col_index_ = new armpl_int_t[nnzA_];
+    A_vals_ = new T[nnzA_];
+    A_armpl_row_ptr_[0] = 0;
+    int nnz_encountered = 0;
+    for (int row = 0; row < m_; row++) {
+      for (int col = 0; col < k_; col++) {
+        if (A_[(row * k_) + col] != 0.0) {
+          A_armpl_col_index_[nnz_encountered] = col;
+          A_vals_[nnz_encountered] = static_cast<T>(A_[(row * k_) + col]);
+          nnz_encountered++;
+        }
+      }
+      // row_ptr[row + 1] is one past the last entry of `row`, so it can only
+      // be recorded once the whole row has been scanned.
+      A_armpl_row_ptr_[row + 1] = nnz_encountered;
+    }
+    // Move B to CSR (sized by B's own non-zero count)
+    B_armpl_row_ptr_ = new armpl_int_t[k_ + 1];
+    B_armpl_col_index_ = new armpl_int_t[nnzB_];
+    B_vals_ = new T[nnzB_];
+    B_armpl_row_ptr_[0] = 0;
+    nnz_encountered = 0;
+    for (int row = 0; row < k_; row++) {
+      for (int col = 0; col < n_; col++) {
+        if (B_[(row * n_) + col] != 0.0) {
+          B_armpl_col_index_[nnz_encountered] = col;
+          B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]);
+          nnz_encountered++;
+        }
+      }
+      B_armpl_row_ptr_[row + 1] = nnz_encountered;
+    }
+    // C is m x n and starts with no stored entries: m + 1 zeroed row
+    // pointers and empty column/value arrays.
+    // ToDo -- confirm whether C needs to be pre-populated at all.
+    C_armpl_row_ptr_ = new armpl_int_t[m_ + 1]();
+    C_armpl_col_index_ = new armpl_int_t[0];
+    C_vals_ = new T[0];
+    if constexpr (std::is_same_v<T, float>) {
+      // Single precision handles: A (m x k), B (k x n), C (m x n)
+      status_ = armpl_spmat_create_csr_s(&A_armpl_,
+                                         m_armpl_,
+                                         k_armpl_,
+                                         A_armpl_row_ptr_,
+                                         A_armpl_col_index_,
+                                         A_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+      status_ = armpl_spmat_create_csr_s(&B_armpl_,
+                                         k_armpl_,
+                                         n_armpl_,
+                                         B_armpl_row_ptr_,
+                                         B_armpl_col_index_,
+                                         B_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+      status_ = armpl_spmat_create_csr_s(&C_armpl_,
+                                         m_armpl_,
+                                         n_armpl_,
+                                         C_armpl_row_ptr_,
+                                         C_armpl_col_index_,
+                                         C_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    } else if constexpr (std::is_same_v<T, double>) {
+      // Double precision handles: A (m x k), B (k x n), C (m x n)
+      status_ = armpl_spmat_create_csr_d(&A_armpl_,
+                                         m_armpl_,
+                                         k_armpl_,
+                                         A_armpl_row_ptr_,
+                                         A_armpl_col_index_,
+                                         A_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+      status_ = armpl_spmat_create_csr_d(&B_armpl_,
+                                         k_armpl_,
+                                         n_armpl_,
+                                         B_armpl_row_ptr_,
+                                         B_armpl_col_index_,
+                                         B_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+      status_ = armpl_spmat_create_csr_d(&C_armpl_,
+                                         m_armpl_,
+                                         n_armpl_,
+                                         C_armpl_row_ptr_,
+                                         C_armpl_col_index_,
+                                         C_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    }
+  }
   /** Make call to the GEMM kernel. */
@@ -213,152 +364,6 @@ class sp_gemm_cpu : public sp_gemm<T> {
   const T beta = BETA;
   void toCSR_armpl() {
-    n_armpl_ = n_;
-    // ToDo -- check whether flags_ is correct!
-    flags_ = 0;
-    // Move A to CSR
-    A_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
-    A_armpl_col_index_ = new armpl_int_t[nnz_];
-    A_vals_ = new T[nnz_];
-    A_armpl_row_ptr_[0] = 0;
-    int nnz_encountered = 0;
-    for (int row = 0; row < n_; row++) {
-      A_armpl_row_ptr_[row + 1] = nnz_encountered;
-      for (int col = 0; col < n_; col++) {
-        if (A_[(row * n_) + col] != 0.0) {
-          A_armpl_col_index_[nnz_encountered] = col;
-          A_vals_[nnz_encountered] = static_cast<T>(A_[(row * n_) + col]);
-          nnz_encountered++;
-        }
-      }
-    }
-    // Move B to CSR
-    B_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
-    B_armpl_col_index_ = new armpl_int_t[nnz_];
-    B_vals_ = new T[nnz_];
-    B_armpl_row_ptr_[0] = 0;
-    nnz_encountered = 0;
-    for (int row = 0; row < n_; row++) {
-      B_armpl_row_ptr_[row + 1] = nnz_encountered;
-      for (int col = 0; col < n_; col++) {
-        if (B_[(row * n_) + col] != 0.0) {
-          B_armpl_col_index_[nnz_encountered] = col;
-          B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]);
-          nnz_encountered++;
-        }
-      }
-    }
-    // Move C to CSR
-    C_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
-    C_armpl_col_index_ = new armpl_int_t[nnz_];
-    C_vals_ = new T[nnz_];
-    C_armpl_row_ptr_[0] = 0;
-    nnz_encountered = 0;
-    for (int row = 0; row < n_; row++) {
-      C_armpl_row_ptr_[row + 1] = nnz_encountered;
-      for (int col = 0; col < n_; col++) {
-        if (B_[(row * n_) + col] != 0.0) {
-          C_armpl_col_index_[nnz_encountered] = col;
-          C_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]);
-          nnz_encountered++;
-        }
-      }
-    }
-    if constexpr (std::is_same_v<T, float>) {
-//      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
-//                nnz_, flags_);
-      status_ = armpl_spmat_create_csr_s(&A_armpl_,
-                                         n_armpl_,
-                                         n_armpl_,
-                                         A_armpl_row_ptr_,
-                                         A_armpl_col_index_,
-                                         A_vals_,
-                                         flags_);
-      if (status_ != ARMPL_STATUS_SUCCESS) {
-        std::cout << "ERROR " << status_ << std::endl;
-        exit(1);
-      }
-//      printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_,
-//                nnz_, flags_);
-      status_ = armpl_spmat_create_csr_s(&B_armpl_,
-                                         n_armpl_,
-                                         n_armpl_,
-                                         B_armpl_row_ptr_,
-                                         B_armpl_col_index_,
-                                         B_vals_,
-                                         flags_);
-      if (status_ != ARMPL_STATUS_SUCCESS) {
-        std::cout << "ERROR " << status_ << std::endl;
-        exit(1);
-      }
-//      printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_,
-//                nnz_, flags_);
-      status_ = armpl_spmat_create_csr_s(&C_armpl_,
-                                         n_armpl_,
-                                         n_armpl_,
-                                         C_armpl_row_ptr_,
-                                         C_armpl_col_index_,
-                                         C_vals_,
-                                         flags_);
-      if (status_ != ARMPL_STATUS_SUCCESS) {
-        std::cout << "ERROR " << status_ << std::endl;
-        exit(1);
-      }
-    } else if constexpr (std::is_same_v<T, double>) {
-//      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
-//                nnz_, flags_
-      status_ = armpl_spmat_create_csr_d(&A_armpl_,
-                                         n_armpl_,
-                                         n_armpl_,
-                                         A_armpl_row_ptr_,
-                                         A_armpl_col_index_,
-                                         A_vals_,
-                                         flags_);
-      if (status_ != ARMPL_STATUS_SUCCESS) {
-        std::cout << "ERROR " << status_ << std::endl;
-        exit(1);
-      }
-//      printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_,
-//                nnz_, flags_);
-      status_ = armpl_spmat_create_csr_d(&B_armpl_,
-                                         n_armpl_,
-                                         n_armpl_,
-                                         B_armpl_row_ptr_,
-                                         B_armpl_col_index_,
-                                         B_vals_,
-                                         flags_);
-      if (status_ != ARMPL_STATUS_SUCCESS) {
-        std::cout << "ERROR " << status_ << std::endl;
-        exit(1);
-      }
-//      printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_,
-//                nnz_, flags_);
-      status_ = armpl_spmat_create_csr_d(&C_armpl_,
-                                         n_armpl_,
-                                         n_armpl_,
-                                         C_armpl_row_ptr_,
-                                         C_armpl_col_index_,
-                                         C_vals_,
-                                         flags_);
-      if (status_ != ARMPL_STATUS_SUCCESS) {
-        std::cout << "ERROR " << status_ << std::endl;
-        exit(1);
-      }
-//      std::cout << "Okay, all matrices made!!" << std::endl;
-    }
   void printCSR(armpl_int_t n, armpl_int_t* rp, armpl_int_t* ci, T* v,
@@ -385,7 +390,9 @@ class sp_gemm_cpu : public sp_gemm<T> {
   armpl_int_t flags_;
+  armpl_int_t m_armpl_;
   armpl_int_t n_armpl_;
+  armpl_int_t k_armpl_;
   armpl_int_t* A_armpl_row_ptr_;
   armpl_int_t* A_armpl_col_index_;
diff --git a/cuBLAS/sp_gemv.hh b/cuBLAS/spgemv.hh
similarity index 100%
rename from cuBLAS/sp_gemv.hh
rename to cuBLAS/spgemv.hh
diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/spmm.hh
similarity index 80%
rename from cuBLAS/sp_gemm.hh
rename to cuBLAS/spmm.hh
index b5e8d93..071c8c1 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/spmm.hh
@@ -7,23 +7,24 @@
 #include <random>
 #include <iostream>
-#include "../include/kernels/GPU/sp_gemm.hh"
+#include "../include/kernels/GPU/spmm.hh"
 #include "../include/utilities.hh"
 #include "common.hh"
 namespace gpu {
 /** A class for sparse GEMM GPU BLAS kernels. */
 template <typename T>
-class sp_gemm_gpu : public sp_gemm<T> {
+class spmm_gpu : public spmm<T> {
-  using sp_gemm<T>::sp_gemm;
-  using sp_gemm<T>::initInputMatricesSparse;
-  using sp_gemm<T>::toCSR_int;
-  using sp_gemm<T>::n_;
-  using sp_gemm<T>::A_;
-  using sp_gemm<T>::B_;
-  using sp_gemm<T>::C_;
-  using sp_gemm<T>::offload_;
+  // Names inherited from the dependent base spmm<T> are not found by
+  // unqualified lookup inside this template, so each one used below is
+  // brought into scope explicitly.
+  using spmm<T>::spmm;
+  using spmm<T>::initInputMatrices;
+  using spmm<T>::m_;
+  using spmm<T>::n_;
+  using spmm<T>::k_;
+  using spmm<T>::A_;
+  using spmm<T>::B_;
+  using spmm<T>::C_;
+  using spmm<T>::offload_;
 	// ToDo -- No checksum for sparse yet.  Need to do
@@ -34,7 +35,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
    *  - Always:  Move data from host to device and device to host each iteration
    *  - Unified: Initialise data as unified memory; no data movement semantics
    *             required */
-  void initialise(gpuOffloadType offload, int n, float sparsity) override {
+  void initialise(gpuOffloadType offload, int n, double sparsity) override {
     offload_ = offload;
     if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F;
@@ -43,7 +44,19 @@ class sp_gemm_gpu : public sp_gemm<T> {
       std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
+    // NOTE(review): `m`, `k` and `sparsity_` are not among the parameters of
+    // the initialise(offload, n, sparsity) signature shown earlier in this
+    // patch -- confirm they are declared elsewhere or update the signature.
+    m_ = m;
     n_ = n;
+    k_ = k;
+    // Dense host buffers: A holds m x k elements, B holds k x n, C holds
+    // m x n.
+    A_ = (T*)malloc(sizeof(T) * m_ * k_);
+    B_ = (T*)malloc(sizeof(T) * k_ * n_);
+    // calloc takes (element count, element size) and returns zeroed memory.
+    C_ = (T*)calloc((size_t)m_ * (size_t)n_, sizeof(T));
+    /** Determine the number of nnz elements in A and B */
+    nnzA_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
+    nnzB_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_));
+    initInputMatrices(sparsity_);
     // Get device identifier
@@ -53,42 +66,37 @@ class sp_gemm_gpu : public sp_gemm<T> {
-   // Work out number of edges needed to achieve target sparsity
-    A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity));
     if (offload_ == gpuOffloadType::unified) {
-      cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_));
-      cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_));
-      cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1)));
+      cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * nnzA_));
+      cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * nnzA_));
+      cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (m_ + 1)));
-      cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * B_nnz_));
-      cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * B_nnz_));
-      cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1)));
+      cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * nnzB_));
+      cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * nnzB_));
+      cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (k_ + 1)));
       cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1)));
       C_val_ = NULL;
       C_col_ = NULL;
     } else {
-      A_val_ = (T*)malloc(sizeof(T) * A_nnz_);
-      A_col_ = (int*)malloc(sizeof(int) * A_nnz_);
-      A_row_ = (int*)malloc(sizeof(int) * (n_ + 1));
+      A_val_ = (T*)malloc(sizeof(T) * nnzA_);
+      A_col_ = (int*)malloc(sizeof(int) * nnzA_);
+      A_row_ = (int*)malloc(sizeof(int) * (m_ + 1));
-      B_val_ = (T*)malloc(sizeof(T) * B_nnz_);
-      B_col_ = (int*)malloc(sizeof(int) * B_nnz_);
-      B_row_ = (int*)malloc(sizeof(int) * (n_ + 1));
+      B_val_ = (T*)malloc(sizeof(T) * nnzB_);
+      B_col_ = (int*)malloc(sizeof(int) * nnzB_);
+      B_row_ = (int*)malloc(sizeof(int) * (k_ + 1));
       C_row_ = (int*)malloc(sizeof(int) * (n_ + 1));
-      cudaCheckError(cudaMalloc((void**)&A_val_dev_, sizeof(T) * A_nnz_));
-      cudaCheckError(cudaMalloc((void**)&A_col_dev_, sizeof(int) * A_nnz_));
-      cudaCheckError(cudaMalloc((void**)&A_row_dev_, sizeof(int) * (n_ + 1)));
+      cudaCheckError(cudaMalloc((void**)&A_val_dev_, sizeof(T) * nnzA_));
+      cudaCheckError(cudaMalloc((void**)&A_col_dev_, sizeof(int) * nnzA_));
+      cudaCheckError(cudaMalloc((void**)&A_row_dev_, sizeof(int) * (m_ + 1)));
-      cudaCheckError(cudaMalloc((void**)&B_val_dev_, sizeof(T) * B_nnz_));
-      cudaCheckError(cudaMalloc((void**)&B_col_dev_, sizeof(int) * B_nnz_));
-      cudaCheckError(cudaMalloc((void**)&B_row_dev_, sizeof(int) * (n_ + 1)));
+      cudaCheckError(cudaMalloc((void**)&B_val_dev_, sizeof(T) * nnzB_));
+      cudaCheckError(cudaMalloc((void**)&B_col_dev_, sizeof(int) * nnzB_));
+      cudaCheckError(cudaMalloc((void**)&B_row_dev_, sizeof(int) * (k_ + 1)));
       cudaCheckError(cudaMalloc((void**)&C_row_dev_, sizeof(int) * (n_ + 1)));
@@ -97,22 +105,6 @@ class sp_gemm_gpu : public sp_gemm<T> {
     C_mem_allocated_once_ = false;
     C_mem_allocated_unified_ = false;
-		// Initialise the host matricies
-		// cusparseSpGEMM() works on CSR format only.  This helpfully makes our
-		// sparse matrix format decision for us!
-		// Initialise the matrices
-		// Set initial values to 0
-    A_ = (T*)malloc(sizeof(T) * n_ * n_);
-    B_ = (T*)malloc(sizeof(T) * n_ * n_);
-    initInputMatricesSparse(sparsity);
-    toCSR_int(A_, n_, n_, A_val_, A_col_, A_row_);
-    toCSR_int(B_, n_, n_, B_val_, B_col_, B_row_);
 //    std::cout << "_____Matrix A_____" << std::endl;
 //    printDenseMatrix(A_, n_, n_);
 //    std::cout << std::endl << std::endl;
@@ -128,6 +120,41 @@ class sp_gemm_gpu : public sp_gemm<T> {
+ protected:
+  /** Fill the host CSR arrays for A and B from the dense inputs.
+   *  - A is read as an m x k row-major dense matrix into A_row_ / A_col_ /
+   *    A_val_.
+   *  - B is read as a k x n row-major dense matrix into B_row_ / B_col_ /
+   *    B_val_.
+   * Entry `row` of each row array holds the offset of that row's first
+   * stored value; the final entry holds the total number of stored values. */
+  void toSparseFormat() override {
+    // Load A into CSR
+    int nnz_encountered = 0;
+    for (int row = 0; row < m_; row++) {
+      A_row_[row] = nnz_encountered;
+      for (int col = 0; col < k_; col++) {
+        // Test A itself (not B) so the stored pattern matches A's values.
+        if (A_[(row * k_) + col] != 0.0) {
+          A_col_[nnz_encountered] = col;
+          A_val_[nnz_encountered] = A_[(row * k_) + col];
+          nnz_encountered++;
+        }
+      }
+    }
+    A_row_[m_] = nnz_encountered;
+    // Load B into CSR (the counter is reused, not redeclared)
+    nnz_encountered = 0;
+    for (int row = 0; row < k_; row++) {
+      B_row_[row] = nnz_encountered;
+      for (int col = 0; col < n_; col++) {
+        if (B_[(row * n_) + col] != 0.0) {
+          B_col_[nnz_encountered] = col;
+          B_val_[nnz_encountered] = B_[(row * n_) + col];
+          nnz_encountered++;
+        }
+      }
+    }
+    B_row_[k_] = nnz_encountered;
+  }
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
@@ -137,31 +164,31 @@ class sp_gemm_gpu : public sp_gemm<T> {
       case gpuOffloadType::always: {
         // Make matrix descriptors
-                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+                cusparseCreateCsr(&descrA_, m_, k_, nnzA_, A_row_dev_,
                                   A_col_dev_, A_val_dev_, rType_, cType_,
                                   indType_, cudaDataType_));
-                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
+                cusparseCreateCsr(&descrB_, k_, n_, nnzB_, B_row_dev_,
                                   B_col_dev_, B_val_dev_, rType_, cType_,
                                   indType_, cudaDataType_));
-                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
+                cusparseCreateCsr(&descrC_, n_, m_, 0, C_row_dev_, NULL, NULL,
                                   rType_, cType_, indType_, cudaDataType_));
       case gpuOffloadType::once: {
         cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) *
-                                       A_nnz_, cudaMemcpyHostToDevice, s1_));
+                                       nnzA_, cudaMemcpyHostToDevice, s1_));
         cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) *
-                                       A_nnz_, cudaMemcpyHostToDevice, s1_));
-        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_
+                                       nnzA_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (m_
                                        + 1), cudaMemcpyHostToDevice, s1_));
         cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) *
-                                       B_nnz_, cudaMemcpyHostToDevice, s2_));
+                                       nnzB_, cudaMemcpyHostToDevice, s2_));
         cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) *
-                                       B_nnz_, cudaMemcpyHostToDevice, s2_));
-        cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_
+                                       nnzB_, cudaMemcpyHostToDevice, s2_));
+        cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (k_
                                        + 1), cudaMemcpyHostToDevice, s2_));
         cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_
@@ -169,45 +196,45 @@ class sp_gemm_gpu : public sp_gemm<T> {
         // Craete matrix descriptors
-                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+                cusparseCreateCsr(&descrA_, m_, k_, nnzA_, A_row_dev_,
                                   A_col_dev_, A_val_dev_, rType_, cType_,
                                   indType_, cudaDataType_));
-                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
+                cusparseCreateCsr(&descrB_, k_, n_, nnzB_, B_row_dev_,
                                   B_col_dev_, B_val_dev_, rType_, cType_,
                                   indType_, cudaDataType_));
-                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
+                cusparseCreateCsr(&descrC_, n_, m_, 0, C_row_dev_, NULL, NULL,
                                   rType_, cType_, indType_, cudaDataType_));
       case gpuOffloadType::unified: {
         // Prefetch memory to device
-        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * nnzA_,
                                             gpuDevice_, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * nnzA_,
                                             gpuDevice_, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1),
+        cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (m_ + 1),
                                             gpuDevice_, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * nnzB_,
                                             gpuDevice_, s2_));
-        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * nnzB_,
                                             gpuDevice_, s2_));
-        cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1),
+        cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (k_ + 1),
                                             gpuDevice_, s2_));
         // Make matrix descriptors
-                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_, A_col_,
+                cusparseCreateCsr(&descrA_, m_, k_, nnzA_, A_row_, A_col_,
                                   A_val_, rType_, cType_, indType_,
-                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_, B_col_,
+                cusparseCreateCsr(&descrB_, k_, n_, nnzB_, B_row_, B_col_,
                                   B_val_, rType_, cType_, indType_,
-                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_, NULL, NULL,
+                cusparseCreateCsr(&descrC_, n_, m_, 0, C_row_, NULL, NULL,
                                   rType_, cType_, indType_, cudaDataType_));
@@ -224,17 +251,17 @@ class sp_gemm_gpu : public sp_gemm<T> {
         cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) *
-        A_nnz_, cudaMemcpyHostToDevice, s1_));
+        nnzA_, cudaMemcpyHostToDevice, s1_));
         cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) *
-        A_nnz_, cudaMemcpyHostToDevice, s1_));
-        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_
+        nnzA_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (m_
                                        + 1), cudaMemcpyHostToDevice, s1_));
         cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) *
-        B_nnz_, cudaMemcpyHostToDevice, s2_));
+        nnzB_, cudaMemcpyHostToDevice, s2_));
         cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) *
-        B_nnz_, cudaMemcpyHostToDevice, s2_));
-        cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_
+        nnzB_, cudaMemcpyHostToDevice, s2_));
+        cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (k_
                                        + 1), cudaMemcpyHostToDevice, s2_));
         cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_
@@ -243,15 +270,15 @@ class sp_gemm_gpu : public sp_gemm<T> {
         // Make matrix descriptors
-                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+                cusparseCreateCsr(&descrA_, m_, k_, nnzA_, A_row_dev_,
                                   A_col_dev_, A_val_dev_, rType_, cType_,
                                   indType_, cudaDataType_));
-                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
+                cusparseCreateCsr(&descrB_, k_, n_, nnzB_, B_row_dev_,
                                   B_col_dev_, B_val_dev_, rType_, cType_,
                                   indType_, cudaDataType_));
-                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
+                cusparseCreateCsr(&descrC_, n_, m_, 0, C_row_dev_, NULL, NULL,
                                   rType_, cType_, indType_, cudaDataType_));
@@ -282,14 +309,14 @@ class sp_gemm_gpu : public sp_gemm<T> {
                 cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
-                                     &C_nnz_));
+                                     &nnzC_));
         if (C_mem_allocated_always_) {
-        cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_));
-        cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_));
+        cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * nnzC_));
+        cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * nnzC_));
                 cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_,
@@ -300,31 +327,31 @@ class sp_gemm_gpu : public sp_gemm<T> {
                                     alg_, spgemmDesc_));
         cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) *
-        A_nnz_, cudaMemcpyDeviceToHost, s1_));
+        nnzA_, cudaMemcpyDeviceToHost, s1_));
         cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) *
-        A_nnz_, cudaMemcpyDeviceToHost, s1_));
+        nnzA_, cudaMemcpyDeviceToHost, s1_));
         cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) *
-        (n_ + 1), cudaMemcpyDeviceToHost, s1_));
+        (m_ + 1), cudaMemcpyDeviceToHost, s1_));
         cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) *
-        B_nnz_, cudaMemcpyDeviceToHost, s2_));
+        nnzB_, cudaMemcpyDeviceToHost, s2_));
         cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) *
-        B_nnz_, cudaMemcpyDeviceToHost, s2_));
+        nnzB_, cudaMemcpyDeviceToHost, s2_));
         cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) *
-        (n_ + 1), cudaMemcpyDeviceToHost, s2_));
+        (k_ + 1), cudaMemcpyDeviceToHost, s2_));
         if (C_mem_allocated_always_) {
-        C_val_ = (T*)malloc(sizeof(T) * C_nnz_);
-        C_col_ = (int*)malloc(sizeof(int) * C_nnz_);
+        C_val_ = (T*)malloc(sizeof(T) * nnzC_);
+        C_col_ = (int*)malloc(sizeof(int) * nnzC_);
         C_mem_allocated_always_ = true;
         cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) *
-        C_nnz_, cudaMemcpyDeviceToHost, s3_));
+        nnzC_, cudaMemcpyDeviceToHost, s3_));
         cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) *
-        C_nnz_, cudaMemcpyDeviceToHost, s3_));
+        nnzC_, cudaMemcpyDeviceToHost, s3_));
         cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) *
         (n_ + 1), cudaMemcpyDeviceToHost, s3_));
@@ -364,14 +391,14 @@ class sp_gemm_gpu : public sp_gemm<T> {
                 cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
-                                     &C_nnz_));
+                                     &nnzC_));
         if (C_mem_allocated_once_) {
-        cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_));
-        cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_));
+        cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * nnzC_));
+        cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * nnzC_));
         C_mem_allocated_once_ = true;
@@ -417,15 +444,15 @@ class sp_gemm_gpu : public sp_gemm<T> {
                 cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_,
-                                     &C_nnz_));
+                                     &nnzC_));
         if (C_mem_allocated_unified_) {
-        cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_));
-        cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_));
+        cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnzC_));
+        cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnzC_));
         C_mem_allocated_unified_ = true;
@@ -455,25 +482,25 @@ class sp_gemm_gpu : public sp_gemm<T> {
       case gpuOffloadType::once: {
         cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) *
-        A_nnz_, cudaMemcpyDeviceToHost, s1_));
+        nnzA_, cudaMemcpyDeviceToHost, s1_));
         cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) *
-        A_nnz_, cudaMemcpyDeviceToHost, s1_));
+        nnzA_, cudaMemcpyDeviceToHost, s1_));
         cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) *
-        (n_ + 1), cudaMemcpyDeviceToHost, s1_));
+        (m_ + 1), cudaMemcpyDeviceToHost, s1_));
         cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) *
-        B_nnz_, cudaMemcpyDeviceToHost, s2_));
+        nnzB_, cudaMemcpyDeviceToHost, s2_));
         cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) *
-        B_nnz_, cudaMemcpyDeviceToHost, s2_));
+        nnzB_, cudaMemcpyDeviceToHost, s2_));
         cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) *
-        (n_ + 1), cudaMemcpyDeviceToHost, s2_));
+        (k_ + 1), cudaMemcpyDeviceToHost, s2_));
-        C_val_ = (T*)malloc(sizeof(T) * C_nnz_);
-        C_col_ = (int*)malloc(sizeof(int) * C_nnz_);
+        C_val_ = (T*)malloc(sizeof(T) * nnzC_);
+        C_col_ = (int*)malloc(sizeof(int) * nnzC_);
         cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) *
-        C_nnz_, cudaMemcpyDeviceToHost, s3_));
+        nnzC_, cudaMemcpyDeviceToHost, s3_));
         cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) *
-        C_nnz_, cudaMemcpyDeviceToHost, s3_));
+        nnzC_, cudaMemcpyDeviceToHost, s3_));
         cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) *
         (n_ + 1), cudaMemcpyDeviceToHost, s3_));
@@ -486,23 +513,23 @@ class sp_gemm_gpu : public sp_gemm<T> {
       case gpuOffloadType::unified: {
         // Ensure all data resides on host once work has completed
-        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * nnzA_,
                                             cudaCpuDeviceId, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * nnzA_,
                                             cudaCpuDeviceId, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1),
+        cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (m_ + 1),
                                             cudaCpuDeviceId, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * nnzB_,
                                             cudaCpuDeviceId, s2_));
-        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * nnzB_,
                                             cudaCpuDeviceId, s2_));
-        cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1),
+        cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (k_ + 1),
                                             cudaCpuDeviceId, s2_));
-//        cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_,
+//        cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * nnzC_,
 //                                            cudaCpuDeviceId, s3_));
-//        cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_,
+//        cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * nnzC_,
 //                                            cudaCpuDeviceId, s3_));
         cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1),
                                             cudaCpuDeviceId, s3_));
@@ -618,21 +645,18 @@ class sp_gemm_gpu : public sp_gemm<T> {
   int* A_row_;
   int64_t A_num_rows_;
   int64_t A_num_cols_;
-  int64_t A_nnz_;
   T* B_val_;
   int* B_col_;
   int* B_row_;
   int64_t B_num_rows_;
   int64_t B_num_cols_;
-  int64_t B_nnz_;
   T* C_val_ = NULL;
   int* C_col_ = NULL;
   int* C_row_;
   int64_t C_num_rows_;
   int64_t C_num_cols_;
-  int64_t C_nnz_;
   /** CSR format vectors for matrices A, B and C on the device. */
 	T* A_val_dev_;
diff --git a/include/doGemm.hh b/include/doGemm.hh
index 23caa6f..6a0de59 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -8,7 +8,6 @@
 #if defined CPU_ARMPL
 #include "../ArmPL/gemm.hh"
-#include "../ArmPL/sp_gemm.hh"
 #elif defined CPU_ONEMKL
 #include "../oneMKL/CPU/gemm.hh"
 #elif defined CPU_AOCL
@@ -21,7 +20,6 @@
 #if defined GPU_CUBLAS
 #include "../cuBLAS/gemm.hh"
-#include "../cuBLAS/sp_gemm.hh"
 #elif defined GPU_ONEMKL
 #include "../oneMKL/GPU/gemm.hh"
 #elif defined GPU_ROCBLAS
@@ -35,25 +33,20 @@ class doGemm {
   doGemm(const std::string csvDir, const int iters, const int startDim,
          const int upperLimit, const bool cpuEnabled = true,
-         const bool gpuEnabled = true, const bool doDense = true,
-         const bool doSparse = true)
+         const bool gpuEnabled = true)
       : CSV_DIR(csvDir),
-        doGPU_(gpuEnabled),
-        doDense_(doDense),
-        doSparse_(doSparse)
+        doGPU_(gpuEnabled)
-        gemmCpu_(iterations_),
-        spGemmCpu_(iterations_)
+        cpu_(iterations_)
-        gemmGpu_(iterations_),
-        spGemmGpu_(iterations_)
+        gpu_(iterations_)
     static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) &&
@@ -65,309 +58,247 @@ class doGemm {
   void collectData() {
     // ToDo -- I've hard coded false here as kernel selection was not working
     //  .  Needs to be fixed
-    if (doDense_) {
-      // Square Problem Sizes...
-      // Re-initialise offload threshold structures
-      cpuGpu_always_ = cpuGpu_offloadThreshold();
-      cpuGpu_once_ = cpuGpu_offloadThreshold();
-      cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      prev_gpuResult_always = time_checksum_gflop();
-      prev_gpuResult_once = time_checksum_gflop();
-      prev_gpuResult_unified = time_checksum_gflop();
-      std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                                          "_square_square_M=N=K.csv");
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        // M = dim, N = dim, K = dim;
-        callDenseKernels(csvFile, dim, dim, dim);
-      }
-      // Close file
-      csvFile.close();
-      if (doCPU_ && doGPU_) {
-        // Print offload results to stdout
-        printOffloadThreshold("Square x Square (M=N=K)");
-      }
-    // Rectangular Problem Sizes:
-    // Tall and thin x Short and wide
-    // Re-initialise offload threshold structures & previous results
+    // Square Problem Sizes...
+    // Re-initialise offload threshold structures
     cpuGpu_always_ = cpuGpu_offloadThreshold();
     cpuGpu_once_ = cpuGpu_offloadThreshold();
     cpuGpu_unified_ = cpuGpu_offloadThreshold();
     prev_gpuResult_always = time_checksum_gflop();
     prev_gpuResult_once = time_checksum_gflop();
     prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_tall-thin_short-wide_M=N_M=16K.csv");
-    int K = startDimention_;
-    int M = 16 * K;
-    int N = 16 * K;
-    while (M <= upperLimit_) {
-      callDenseKernels(csvFile, M, N, K);
-      M += 16;
-      N += 16;
-      K++;
+    std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                                        "_square_square_M=N=K.csv");
+    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+      // M = dim, N = dim, K = dim;
+      callKernels(csvFile, dim, dim, dim);
     // Close file
     if (doCPU_ && doGPU_) {
       // Print offload results to stdout
-      printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)");
+      printOffloadThreshold("Square x Square (M=N=K)");
-    // Tall and thin x Short and wide
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_tall-thin_short-wide_M=N_K=32.csv");
-    if (upperLimit_ >= 32) {
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        // M = dim, N = dim, K = 32;
-        callDenseKernels(csvFile, dim, dim, 32);
-      }
-    }
-    // Close file
-    csvFile.close();
+  // Rectangular Problem Sizes:
+  // Tall and thin x Short and wide
+  // Re-initialise offload threshold structures & previous results
+  cpuGpu_always_ = cpuGpu_offloadThreshold();
+  cpuGpu_once_ = cpuGpu_offloadThreshold();
+  cpuGpu_unified_ = cpuGpu_offloadThreshold();
+  prev_gpuResult_always = time_checksum_gflop();
+  prev_gpuResult_once = time_checksum_gflop();
+  prev_gpuResult_unified = time_checksum_gflop();
+  csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                        "_tall-thin_short-wide_M=N_M=16K.csv");
+  int K = startDimention_;
+  int M = 16 * K;
+  int N = 16 * K;
+  while (M <= upperLimit_) {
+    callKernels(csvFile, M, N, K);
+    M += 16;
+    N += 16;
+    K++;
+  }
+  // Close file
+  csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)");
-    }
+  if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)");
+  }
-    // Short and wide x Tall and thin
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_short-wide_tall-thin_M=N_K=16M.csv");
-    M = startDimention_;
-    N = startDimention_;
-    K = 16 * M;
-    while (K <= upperLimit_) {
-      callDenseKernels(csvFile, M, N, K);
-      M++;
-      N++;
-      K += 16;
+  // Tall and thin x Short and wide
+  // Re-initialise offload threshold structures & previous results
+  cpuGpu_always_ = cpuGpu_offloadThreshold();
+  cpuGpu_once_ = cpuGpu_offloadThreshold();
+  cpuGpu_unified_ = cpuGpu_offloadThreshold();
+  prev_gpuResult_always = time_checksum_gflop();
+  prev_gpuResult_once = time_checksum_gflop();
+  prev_gpuResult_unified = time_checksum_gflop();
+  csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                        "_tall-thin_short-wide_M=N_K=32.csv");
+  if (upperLimit_ >= 32) {
+    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+      // M = dim, N = dim, K = 32;
+      callKernels(csvFile, dim, dim, 32);
-    // Close file
-    csvFile.close();
+  }
+  // Close file
+  csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)");
-    }
+  if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)");
+  }
-    // Short and wide x Tall and thin
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_short-wide_tall-thin_M=N=32_K.csv");
-    if (upperLimit_ >= 32) {
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        // M = 32, N = 32, K = dim;
-        callDenseKernels(csvFile, 32, 32, dim);
-      }
-    }
-    // Close file
-    csvFile.close();
+  // Short and wide x Tall and thin
+  // Re-initialise offload threshold structures & previous results
+  cpuGpu_always_ = cpuGpu_offloadThreshold();
+  cpuGpu_once_ = cpuGpu_offloadThreshold();
+  cpuGpu_unified_ = cpuGpu_offloadThreshold();
+  prev_gpuResult_always = time_checksum_gflop();
+  prev_gpuResult_once = time_checksum_gflop();
+  prev_gpuResult_unified = time_checksum_gflop();
+  csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                        "_short-wide_tall-thin_M=N_K=16M.csv");
+  M = startDimention_;
+  N = startDimention_;
+  K = 16 * M;
+  while (K <= upperLimit_) {
+    callKernels(csvFile, M, N, K);
+    M++;
+    N++;
+    K += 16;
+  }
+  // Close file
+  csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)");
-    }
+  if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)");
+  }
-    // Tall and Thin x Square
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_tall-thin_square_K=N_M=16K.csv");
-    K = startDimention_;
-    N = startDimention_;
-    M = 16 * K;
-    while (M <= upperLimit_) {
-      callDenseKernels(csvFile, M, N, K);
-      M += 16;
-      N++;
-      K++;
+  // Short and wide x Tall and thin
+  // Re-initialise offload threshold structures & previous results
+  cpuGpu_always_ = cpuGpu_offloadThreshold();
+  cpuGpu_once_ = cpuGpu_offloadThreshold();
+  cpuGpu_unified_ = cpuGpu_offloadThreshold();
+  prev_gpuResult_always = time_checksum_gflop();
+  prev_gpuResult_once = time_checksum_gflop();
+  prev_gpuResult_unified = time_checksum_gflop();
+  csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                        "_short-wide_tall-thin_M=N=32_K.csv");
+  if (upperLimit_ >= 32) {
+    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+      // M = 32, N = 32, K = dim;
+      callKernels(csvFile, 32, 32, dim);
-    // Close file
-    csvFile.close();
+  }
+  // Close file
+  csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)");
-    }
+  if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)");
+  }
-    // Tall and Thin x Square
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_tall-thin_square_K=N=32_M.csv");
-    if (upperLimit_ >= 32) {
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        // M = dim, N = 32, K = 32;
-        callDenseKernels(csvFile, dim, 32, 32);
-      }
-    }
-    // Close file
-    csvFile.close();
+  // Tall and Thin x Square
+  // Re-initialise offload threshold structures & previous results
+  cpuGpu_always_ = cpuGpu_offloadThreshold();
+  cpuGpu_once_ = cpuGpu_offloadThreshold();
+  cpuGpu_unified_ = cpuGpu_offloadThreshold();
+  prev_gpuResult_always = time_checksum_gflop();
+  prev_gpuResult_once = time_checksum_gflop();
+  prev_gpuResult_unified = time_checksum_gflop();
+  csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                        "_tall-thin_square_K=N_M=16K.csv");
+  K = startDimention_;
+  N = startDimention_;
+  M = 16 * K;
+  while (M <= upperLimit_) {
+    callKernels(csvFile, M, N, K);
+    M += 16;
+    N++;
+    K++;
+  }
+  // Close file
+  csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)");
-    }
+  if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)");
+  }
-    // Square x Short and Wide
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_square_short-wide_M=K_N=16K.csv");
-    M = startDimention_;
-    K = startDimention_;
-    N = 16 * K;
-    while (N <= upperLimit_) {
-      callDenseKernels(csvFile, M, N, K);
-      M++;
-      N += 16;
-      K++;
+  // Tall and Thin x Square
+  // Re-initialise offload threshold structures & previous results
+  cpuGpu_always_ = cpuGpu_offloadThreshold();
+  cpuGpu_once_ = cpuGpu_offloadThreshold();
+  cpuGpu_unified_ = cpuGpu_offloadThreshold();
+  prev_gpuResult_always = time_checksum_gflop();
+  prev_gpuResult_once = time_checksum_gflop();
+  prev_gpuResult_unified = time_checksum_gflop();
+  csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                        "_tall-thin_square_K=N=32_M.csv");
+  if (upperLimit_ >= 32) {
+    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+      // M = dim, N = 32, K = 32;
+      callKernels(csvFile, dim, 32, 32);
-    // Close file
-    csvFile.close();
+  }
+  // Close file
+  csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)");
-    }
+  if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)");
+  }
-    // Square x Short and Wide
-    // Re-initialise offload threshold structures & previous results
-    cpuGpu_always_ = cpuGpu_offloadThreshold();
-    cpuGpu_once_ = cpuGpu_offloadThreshold();
-    cpuGpu_unified_ = cpuGpu_offloadThreshold();
-    prev_gpuResult_always = time_checksum_gflop();
-    prev_gpuResult_once = time_checksum_gflop();
-    prev_gpuResult_unified = time_checksum_gflop();
-    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                          "_square_short-wide_M=K=32_N.csv");
-    if (upperLimit_ >= 32) {
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        // M = 32, N = dim, K = 32;
-        callDenseKernels(csvFile, 32, dim, 32);
-      }
-    }
+  // Square x Short and Wide
+  // Re-initialise offload threshold structures & previous results
+  cpuGpu_always_ = cpuGpu_offloadThreshold();
+  cpuGpu_once_ = cpuGpu_offloadThreshold();
+  cpuGpu_unified_ = cpuGpu_offloadThreshold();
+  prev_gpuResult_always = time_checksum_gflop();
+  prev_gpuResult_once = time_checksum_gflop();
+  prev_gpuResult_unified = time_checksum_gflop();
+  csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                        "_square_short-wide_M=K_N=16K.csv");
+  M = startDimention_;
+  K = startDimention_;
+  N = 16 * K;
+  while (N <= upperLimit_) {
+    callKernels(csvFile, M, N, K);
+    M++;
+    N += 16;
+    K++;
+  }
+  // Close file
+  csvFile.close();
-    if (doCPU_ && doGPU_) {
-      // Print offload results to stdout
-      printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)");
-    }
+  if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)");
+  }
-      // Close file
-      csvFile.close();
+  // Square x Short and Wide
+  // Re-initialise offload threshold structures & previous results
+  cpuGpu_always_ = cpuGpu_offloadThreshold();
+  cpuGpu_once_ = cpuGpu_offloadThreshold();
+  cpuGpu_unified_ = cpuGpu_offloadThreshold();
+  prev_gpuResult_always = time_checksum_gflop();
+  prev_gpuResult_once = time_checksum_gflop();
+  prev_gpuResult_unified = time_checksum_gflop();
+  csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                        "_square_short-wide_M=K=32_N.csv");
+  if (upperLimit_ >= 32) {
+    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+      // M = 32, N = dim, K = 32;
+      callKernels(csvFile, 32, dim, 32);
-    if (doSparse_) {    // Square sparse matrix - sparse matrix multiplication
-      cpuGpu_always_ = cpuGpu_offloadThreshold();
-      cpuGpu_once_ = cpuGpu_offloadThreshold();
-      cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
-              getKernelName() + "_sparse_square_99.csv");
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        callSparseKernels(csvFile, dim, 0.99);
-      }
-      // Close file
-      csvFile.close();
-      if (doCPU_ && doGPU_) {
-        // Print offload results to stdout
-        printOffloadThreshold("Sparse Square 0.99");
-      }
-      cpuGpu_always_ = cpuGpu_offloadThreshold();
-      cpuGpu_once_ = cpuGpu_offloadThreshold();
-      cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
-              getKernelName() + "_sparse_square_999.csv");
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        callSparseKernels(csvFile, dim, 0.999);
-      }
-      if (doCPU_ && doGPU_) {
-        // Print offload results to stdout
-        printOffloadThreshold("Sparse Square 0.999");
-      }
-      cpuGpu_always_ = cpuGpu_offloadThreshold();
-      cpuGpu_once_ = cpuGpu_offloadThreshold();
-      cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
-              getKernelName() + "_sparse_square_9999.csv");
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        callSparseKernels(csvFile, dim, 0.9999);
-      }
-      if (doCPU_ && doGPU_) {
-        // Print offload results to stdout
-        printOffloadThreshold("Sparse Square 0.9999");
-      }
-      cpuGpu_always_ = cpuGpu_offloadThreshold();
-      cpuGpu_once_ = cpuGpu_offloadThreshold();
-      cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
-                                          getKernelName() +
-                                          "_sparse_square_99999.csv");
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        callSparseKernels(csvFile, dim, 0.99999);
-      }
+  }
-      if (doCPU_ && doGPU_) {
-        // Print offload results to stdout
-        printOffloadThreshold("Sparse Square 0.99999");
-      }
+  if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)");
+  }
-    }
+    // Close file
+    csvFile.close();
   /** Call the appropriate CPU and GPU GEMM kernels. */
-  void callDenseKernels(std::ofstream& csvFile, const int M, const int N,
-                        const int K) {
+  void callKernels(std::ofstream& csvFile, const int M, const int N,
+                   const int K) {
     const double probSize = calcKib(M, N, K);
     const uint64_t flops = calcFlops(M, N, K);
     std::string kernelName = getKernelName();
@@ -380,8 +311,8 @@ class doGemm {
 // Perform CPU kernel
     if (doCPU_) {
-      gemmCpu_.initialise(M, N, K);
-      cpuResult = gemmCpu_.compute();
+      cpu_.initialise(M, N, K);
+      cpuResult = cpu_.compute();
       cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
       // Write result to CSV file
       writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize,
@@ -394,21 +325,21 @@ class doGemm {
     if (doGPU_) {
       // - ONCE : Offload to/from GPU once before all iterations and once
       // after
-      gemmGpu_.initialise(gpuOffloadType::once, M, N, K);
-      gpuResult_once = gemmGpu_.compute();
+      gpu_.initialise(gpuOffloadType::once, M, N, K);
+      gpuResult_once = gpu_.compute();
       gpuResult_once.gflops =
           calcGflops(flops, iterations_, gpuResult_once.runtime);
       // - ALWAYS: Offload to/from GPU every iteration
-      gemmGpu_.initialise(gpuOffloadType::always, M, N, K);
-      gpuResult_always = gemmGpu_.compute();
+      gpu_.initialise(gpuOffloadType::always, M, N, K);
+      gpuResult_always = gpu_.compute();
       gpuResult_always.gflops =
           calcGflops(flops, iterations_, gpuResult_always.runtime);
       // - UNIFIED : data passed from host to device (and device to host) as
       //             needed
-      gemmGpu_.initialise(gpuOffloadType::unified, M, N, K);
-      gpuResult_unified = gemmGpu_.compute();
+      gpu_.initialise(gpuOffloadType::unified, M, N, K);
+      gpuResult_unified = gpu_.compute();
       gpuResult_unified.gflops =
           calcGflops(flops, iterations_, gpuResult_unified.runtime);
@@ -559,61 +490,6 @@ class doGemm {
-	void callSparseKernels(std::ofstream& csvFile, const int N, const float
-	sparsity) {
-		const double probSize = calcKib(N, N, N);
-		const uint64_t flops = calcFlops(N, N, N);
-		std::string kernelName = getKernelName();
-    if (doCPU_) {
-      spGemmCpu_.initialise(N, sparsity);
-      time_checksum_gflop cpuResult = spGemmCpu_.compute();
-      cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
-		  writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize,
-                     sparsity, iterations_, cpuResult.runtime,
-                     cpuResult.gflops);
-    }
-    // Perform the GPU kernels
-    // - UNIFIED : data passed from host to device (and device to host) as
-    //             needed
-    if (doGPU_) {
-      spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
-      time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
-      gpuResult_unified.gflops =
-      calcGflops(flops, iterations_, gpuResult_unified.runtime);
-    // - ALWAYS: Offload to/from GPU every iteration
-      spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity);
-      time_checksum_gflop gpuResult_always = spGemmGpu_.compute();
-      gpuResult_always.gflops =
-            calcGflops(flops, iterations_, gpuResult_always.runtime);
-		// - ONCE : Offload to/from GPU once before all iterations and once
-		// after
-      spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity);
-		  time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
-		  gpuResult_once.gflops =
-						calcGflops(flops, iterations_, gpuResult_once.runtime);
-		// ToDo -- non-default GPU operations
-		// Write lines to CSV file
-		  writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
-		                sparsity, iterations_, gpuResult_once.runtime,
-                    gpuResult_once.gflops);
-		  writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
-		                sparsity, iterations_, gpuResult_always.runtime,
-		                gpuResult_always.gflops);
-		  writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
-		                sparsity, iterations_, gpuResult_unified.runtime,
-		                gpuResult_unified.gflops);
-    }
-	}
   /** A function for calculating FLOPs performed by a GEMM.
    * C = alpha*AB + beta*C */
   constexpr uint64_t calcFlops(const int M, const int N, const int K) const {
@@ -744,20 +620,14 @@ class doGemm {
   /** Whether the GPU kernels should be run. */
   const bool doGPU_ = true;
-  /** Whether we should run dense and or sparse kernels */
-  const bool doDense_;
-  const bool doSparse_;
   /** The GEMM CPU kernel. */
-  cpu::gemm_cpu<T> gemmCpu_;
-  cpu::sp_gemm_cpu<T> spGemmCpu_;
+  cpu::gemm_cpu<T> cpu_;
   /** The GEMM GPU kernel. */
-  gpu::gemm_gpu<T> gemmGpu_;
-	gpu::sp_gemm_gpu<T> spGemmGpu_;
+  gpu::gemm_gpu<T> gpu_;
   /** The point at which offloading to GPU (offload once) becomes worthwhile. */
diff --git a/include/doGemv.hh b/include/doGemv.hh
index 0ecd814..ebc9262 100644
--- a/include/doGemv.hh
+++ b/include/doGemv.hh
@@ -8,7 +8,6 @@
 #if defined CPU_ARMPL
 #include "../ArmPL/gemv.hh"
-#include "../ArmPL/sp_gemv.hh"
 #elif defined CPU_ONEMKL
 #include "../oneMKL/CPU/gemv.hh"
 #elif defined CPU_AOCL
@@ -21,7 +20,6 @@
 #if defined GPU_CUBLAS
 #include "../cuBLAS/gemv.hh"
-#include "../cuBLAS/sp_gemv.hh"
 #elif defined GPU_ONEMKL
 #include "../oneMKL/GPU/gemv.hh"
 #elif defined GPU_ROCBLAS
@@ -35,25 +33,20 @@ class doGemv {
   doGemv(const std::string csvDir, const int iters, const int startDim,
          const int upperLimit, const bool cpuEnabled = true,
-         const bool gpuEnabled = true, const bool doDense = true, const bool
-         doSparse = true)
+         const bool gpuEnabled = true)
       : CSV_DIR(csvDir),
-        doGPU_(gpuEnabled),
-        doDense_(doDense),
-        doSparse_(doSparse)
+        doGPU_(gpuEnabled)
-        gemvCpu_(iterations_),
-        spGemvCpu_(iterations_)
+        cpu_(iterations_)
-        gemvGpu_(iterations_),
-        spGemvGpu_(iterations_)
+        gpu_(iterations_)
     static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) &&
@@ -63,156 +56,131 @@ class doGemv {
   /** Run all problem types and write data to CSV files. */
   void collectData() {
-    if (doDense_) {
-      // Square Problem Sizes...
-      // Re-initialise offload threshold structures & previous results
-      cpuGpu_always_ = cpuGpu_offloadThreshold();
-      cpuGpu_once_ = cpuGpu_offloadThreshold();
-      cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      prev_gpuResult_always = time_checksum_gflop();
-      prev_gpuResult_once = time_checksum_gflop();
-      prev_gpuResult_unified = time_checksum_gflop();
-      std::ofstream csvFile =
-          initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv");
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        // M = dim, N = dim;
-        callDenseKernels(csvFile, dim, dim);
-      }
-      // Close file
-      csvFile.close();
+    // Square Problem Sizes...
+    // Re-initialise offload threshold structures & previous results
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    prev_gpuResult_always = time_checksum_gflop();
+    prev_gpuResult_once = time_checksum_gflop();
+    prev_gpuResult_unified = time_checksum_gflop();
+    std::ofstream csvFile =
+        initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv");
+    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+      // M = dim, N = dim;
+      callDenseKernels(csvFile, dim, dim);
+    }
+    // Close file
+    csvFile.close();
-      if (doCPU_ && doGPU_) {
-        // Print offload results to stdout
-        printOffloadThreshold("Square x Vector (M=N)");
-      }
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Square x Vector (M=N)");
+    }
-      // Rectangular Problem Sizes:
-      // Tall and thin x Vector
-      // Re-initialise offload threshold structures & previous results
-      cpuGpu_always_ = cpuGpu_offloadThreshold();
-      cpuGpu_once_ = cpuGpu_offloadThreshold();
-      cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      prev_gpuResult_always = time_checksum_gflop();
-      prev_gpuResult_once = time_checksum_gflop();
-      prev_gpuResult_unified = time_checksum_gflop();
-      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                            "_tall-thin_vector_M=16N.csv");
-      int N = startDimention_;
-      int M = 16 * N;
-      while (M <= upperLimit_) {
-        callDenseKernels(csvFile, M, N);
-        M += 16;
-        N++;
-      }
-      // Close file
-      csvFile.close();
+    // Rectangular Problem Sizes:
+    // Tall and thin x Vector
+    // Re-initialise offload threshold structures & previous results
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    prev_gpuResult_always = time_checksum_gflop();
+    prev_gpuResult_once = time_checksum_gflop();
+    prev_gpuResult_unified = time_checksum_gflop();
+    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                          "_tall-thin_vector_M=16N.csv");
+    int N = startDimention_;
+    int M = 16 * N;
+    while (M <= upperLimit_) {
+      callDenseKernels(csvFile, M, N);
+      M += 16;
+      N++;
+    }
+    // Close file
+    csvFile.close();
-      if (doCPU_ && doGPU_) {
-        // Print offload results to stdout
-        printOffloadThreshold("Tall-and-Thin x Vector (M=16N)");
-      }
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Tall-and-Thin x Vector (M=16N)");
+    }
-      // Tall and thin x Vector
-      // Re-initialise offload threshold structures & previous results
-      cpuGpu_always_ = cpuGpu_offloadThreshold();
-      cpuGpu_once_ = cpuGpu_offloadThreshold();
-      cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      prev_gpuResult_always = time_checksum_gflop();
-      prev_gpuResult_once = time_checksum_gflop();
-      prev_gpuResult_unified = time_checksum_gflop();
-      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                            "_tall-thin_vector_M_N=32.csv");
-      if (upperLimit_ >= 32) {
-        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-          // M = dim, N = 32;
-          callDenseKernels(csvFile, dim, 32);
-        }
+    // Tall and thin x Vector
+    // Re-initialise offload threshold structures & previous results
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    prev_gpuResult_always = time_checksum_gflop();
+    prev_gpuResult_once = time_checksum_gflop();
+    prev_gpuResult_unified = time_checksum_gflop();
+    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                          "_tall-thin_vector_M_N=32.csv");
+    if (upperLimit_ >= 32) {
+      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+        // M = dim, N = 32;
+        callDenseKernels(csvFile, dim, 32);
-      // Close file
-      csvFile.close();
+    }
+    // Close file
+    csvFile.close();
-      if (doCPU_ && doGPU_) {
-        // Print offload results to stdout
-        printOffloadThreshold("Tall-and-Thin x Vector (M, N=32)");
-      }
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Tall-and-Thin x Vector (M, N=32)");
+    }
-      // Short and wide x Vector
-      // Re-initialise offload threshold structures & previous results
-      cpuGpu_always_ = cpuGpu_offloadThreshold();
-      cpuGpu_once_ = cpuGpu_offloadThreshold();
-      cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      prev_gpuResult_always = time_checksum_gflop();
-      prev_gpuResult_once = time_checksum_gflop();
-      prev_gpuResult_unified = time_checksum_gflop();
-      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                            "_short-wide_vector_N=16M.csv");
-      M = startDimention_;
-      N = 16 * M;
-      while (N <= upperLimit_) {
-        callDenseKernels(csvFile, M, N);
-        M++;
-        N += 16;
-      }
-      // Close file
-      csvFile.close();
+    // Short and wide x Vector
+    // Re-initialise offload threshold structures & previous results
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    prev_gpuResult_always = time_checksum_gflop();
+    prev_gpuResult_once = time_checksum_gflop();
+    prev_gpuResult_unified = time_checksum_gflop();
+    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                          "_short-wide_vector_N=16M.csv");
+    M = startDimention_;
+    N = 16 * M;
+    while (N <= upperLimit_) {
+      callDenseKernels(csvFile, M, N);
+      M++;
+      N += 16;
+    }
+    // Close file
+    csvFile.close();
-      if (doCPU_ && doGPU_) {
-        // Print offload results to stdout
-        printOffloadThreshold("Short-and-Wide x Vector (N=16M)");
-      }
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Short-and-Wide x Vector (N=16M)");
+    }
-      // Short and wide x Vector
-      // Re-initialise offload threshold structures & previous results
-      cpuGpu_always_ = cpuGpu_offloadThreshold();
-      cpuGpu_once_ = cpuGpu_offloadThreshold();
-      cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      prev_gpuResult_always = time_checksum_gflop();
-      prev_gpuResult_once = time_checksum_gflop();
-      prev_gpuResult_unified = time_checksum_gflop();
-      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                            "_short-wide_vector_M=32_N.csv");
-      if (upperLimit_ >= 32) {
-        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-          // M = 32, N = dim;
-          callDenseKernels(csvFile, 32, dim);
-        }
-      }
-      // Close file
-      csvFile.close();
-      if (doCPU_ && doGPU_) {
-        // Print offload results to stdout
-        printOffloadThreshold("Short-and-Wide x Vector (M=32, N)");
-      }
-    }
-    if (doSparse_) {
-      // Sparse square matrix
-      cpuGpu_always_ = cpuGpu_offloadThreshold();
-      cpuGpu_once_ = cpuGpu_offloadThreshold();
-      cpuGpu_unified_ = cpuGpu_offloadThreshold();
-      prev_gpuResult_always = time_checksum_gflop();
-      prev_gpuResult_once = time_checksum_gflop();
-      prev_gpuResult_unified = time_checksum_gflop();
-      std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
-                            "_sparse_square_9999.csv");
+    // Short and wide x Vector
+    // Re-initialise offload threshold structures & previous results
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    prev_gpuResult_always = time_checksum_gflop();
+    prev_gpuResult_once = time_checksum_gflop();
+    prev_gpuResult_unified = time_checksum_gflop();
+    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                          "_short-wide_vector_M=32_N.csv");
+    if (upperLimit_ >= 32) {
       for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        callSparseKernels(csvFile, dim, 0.9999);
+        // M = 32, N = dim;
+        callDenseKernels(csvFile, 32, dim);
-      // Close filex1
-      csvFile.close();
+    }
+    // Close file
+    csvFile.close();
-      if (doCPU_ && doGPU_) {
-        // Print offload results to stdout
-        printOffloadThreshold("Sparse square // sparsity = 0.9999");
-      }
-      csvFile.close();
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Short-and-Wide x Vector (M=32, N)");
@@ -230,8 +198,8 @@ class doGemv {
 // Perform CPU kernel
     if (doCPU_) {
-      gemvCpu_.initialise(M, N);
-      cpuResult = gemvCpu_.compute();
+      cpu_.initialise(M, N);
+      cpuResult = cpu_.compute();
       cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
       // Write result to CSV file
       writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, 0.0,
@@ -244,21 +212,21 @@ class doGemv {
     if (doGPU_) {
       // - ONCE : Offload to/from GPU once before all iterations and once
       // after
-      gemvGpu_.initialise(gpuOffloadType::once, M, N);
-      gpuResult_once = gemvGpu_.compute();
+      gpu_.initialise(gpuOffloadType::once, M, N);
+      gpuResult_once = gpu_.compute();
       gpuResult_once.gflops =
           calcGflops(flops, iterations_, gpuResult_once.runtime);
       // - ALWAYS: Offload to/from GPU every iteration
-      gemvGpu_.initialise(gpuOffloadType::always, M, N);
-      gpuResult_always = gemvGpu_.compute();
+      gpu_.initialise(gpuOffloadType::always, M, N);
+      gpuResult_always = gpu_.compute();
       gpuResult_always.gflops =
           calcGflops(flops, iterations_, gpuResult_always.runtime);
       // - UNIFIED : data passed from host to device (and device to host) as
       //             needed
-      gemvGpu_.initialise(gpuOffloadType::unified, M, N);
-      gpuResult_unified = gemvGpu_.compute();
+      gpu_.initialise(gpuOffloadType::unified, M, N);
+      gpuResult_unified = gpu_.compute();
       gpuResult_unified.gflops =
           calcGflops(flops, iterations_, gpuResult_unified.runtime);
@@ -302,64 +270,6 @@ class doGemv {
-  void callSparseKernels(std::ofstream& csvFile, const int N, const float
-  sparsity) {
-    const double probSize = calcKib(N, N);
-    const uint64_t flops = calcFlops(N, N);
-    std::string kernelName = getKernelName();
-    time_checksum_gflop cpuResult;
-    time_checksum_gflop gpuResult_once;
-    time_checksum_gflop gpuResult_always;
-    time_checksum_gflop gpuResult_unified;
-    if (doCPU_) {
-      spGemvCpu_.initialise(N, sparsity);
-      time_checksum_gflop cpuResult = spGemvCpu_.compute();
-      cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
-      // Write result to CSV file
-      writeLineToCsv(csvFile, "cpu", kernelName, N, N, 0, probSize, sparsity,
-                     iterations_, cpuResult.runtime, cpuResult.gflops);
-    }
-    if (doGPU_) {
-      // - ONCE : Offload to/from GPU once before all iterations and once
-      // after
-      spGemvGpu_.initialise(gpuOffloadType::once, N, sparsity);
-      gpuResult_once = spGemvGpu_.compute();
-      gpuResult_once.gflops =
-          calcGflops(flops, iterations_, gpuResult_once.runtime);
-      // - ALWAYS: Offload to/from GPU every iteration
-      spGemvGpu_.initialise(gpuOffloadType::always, N, sparsity);
-      gpuResult_always = spGemvGpu_.compute();
-      gpuResult_always.gflops =
-          calcGflops(flops, iterations_, gpuResult_always.runtime);
-      // - UNIFIED : data passed from host to device (and device to host) as
-      //             needed
-      spGemvGpu_.initialise(gpuOffloadType::unified, N, sparsity);
-      gpuResult_unified = spGemvGpu_.compute();
-      gpuResult_unified.gflops =
-          calcGflops(flops, iterations_, gpuResult_unified.runtime);
-      // Write results to CSV file
-      writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, 0, probSize,
-                     sparsity, iterations_, gpuResult_once.runtime,
-                     gpuResult_once.gflops);
-      writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, 0,
-                     probSize, sparsity, iterations_, gpuResult_always.runtime,
-                     gpuResult_always.gflops);
-      writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, 0, probSize,
-                     sparsity, iterations_, gpuResult_unified.runtime,
-                     gpuResult_unified.gflops);
-    }
-  }
   /** Ensure all CPU and GPU checksums are within the permitted limit of
    * eachother. */
   void checkChecksums(time_checksum_gflop cpuResult,
@@ -584,20 +494,14 @@ class doGemv {
   /** Whether the GPU kernels should be run. */
   const bool doGPU_ = true;
-  /** Whether sparse and or dense kernels should be run. */
-  const bool doDense_;
-  const bool doSparse_;
   /** The GEMV CPU kernel. */
-  cpu::gemv_cpu<T> gemvCpu_;
-  cpu::sp_gemv_cpu<T> spGemvCpu_;
+  cpu::gemv_cpu<T> cpu_;
   /** The GEMV GPU kernel. */
-  gpu::gemv_gpu<T> gemvGpu_;
-  gpu::sp_gemv_gpu<T> spGemvGpu_;
+  gpu::gemv_gpu<T> gpu_;
   /** The point at which offloading to GPU (offload once) becomes worthwhile. */
diff --git a/include/doSpgemm.hh b/include/doSpgemm.hh
new file mode 100644
index 0000000..2131a7d
--- /dev/null
+++ b/include/doSpgemm.hh
@@ -0,0 +1,8 @@
+// Created by Alexander Cockrean on 07/01/2025.
diff --git a/include/doSpgemv.hh b/include/doSpgemv.hh
new file mode 100644
index 0000000..cf315e0
--- /dev/null
+++ b/include/doSpgemv.hh
@@ -0,0 +1,8 @@
+// Created by Alexander Cockrean on 07/01/2025.
diff --git a/include/doSpmm.hh b/include/doSpmm.hh
new file mode 100644
index 0000000..2321636
--- /dev/null
+++ b/include/doSpmm.hh
@@ -0,0 +1,445 @@
+#pragma once
+#include <sstream>
+#include <type_traits>
+#include <cstdint>
+#include "helpers.hh"
+#include "tablePrinter.hh"
+#include "utilities.hh"
+#if defined CPU_ARMPL
+#include "../ArmPL/spmm.hh"
+#elif defined CPU_ONEMKL
+// Todo #include "../oneMKL/CPU/spmm.hh"
+#elif defined CPU_AOCL
+// Todo #include "../AOCL/gemm.hh"
+#elif defined CPU_NVPL
+ // Todo #include "../NVPL/gemm.hh"
+#elif defined CPU_OPENBLAS
+// Todo #include "../OpenBLAS/gemm.hh"
+#if defined GPU_CUBLAS
+#include "../cuBLAS/spmm.hh"
+#elif defined GPU_ONEMKL
+// Todo #include "../oneMKL/GPU/gemm.hh"
+#elif defined GPU_ROCBLAS
+// Todo #include "../rocBLAS/gemm.hh"
+/** `T` represents the type of kernel that will be run - i.e. T=float is for
+ *      SGEMM. */
+template <typename T>
+class doSpmm {
+    doSpmm(const std::string csvDir, const int iters, const int startDim,
+           const int upperLimit, const bool cpuEnabled = true,
+           const bool gpuEnabled = true)
+            : CSV_DIR(csvDir),            // directory the CSV files are written to
+              iterations_(iters),         // iterations performed per problem size
+              startDimention_(startDim),  // first dimension value of every sweep
+              upperLimit_(upperLimit),    // last dimension value of every sweep
+              doCPU_(cpuEnabled),         // whether the CPU kernels are run
+              doGPU_(gpuEnabled)          // whether the GPU kernels are run
+    ,
+        cpu_(iterations_)  // built from the iterations_ member, which is declared before cpu_
+    ,
+        gpu_(iterations_)  // built from the iterations_ member, which is declared before gpu_
+    {  // The body only performs a compile-time check on T; there is no runtime work.
+      static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) &&
+                    "ERROR - doSpmm can only be constructed using one of the "
+                    "following types: [float, double].");
+    }
+    /** Run all problem types and write data to CSV files.
+     *
+     * One sweep is performed per sparsity value in the table below. Every sweep:
+     *   - re-initialises the offload threshold structures and previous results,
+     *   - runs square problems for dim = startDimention_ .. upperLimit_
+     *     (inclusive) through callKernels(),
+     *   - closes its CSV file, and
+     *   - prints the offload threshold table if both CPU and GPU are enabled. */
+    void collectData() {
+      // ToDo -- I've hard coded false here as kernel selection was not working
+      //  .  Needs to be fixed
+      // NOTE(review): nothing is hard coded to false in this function as it
+      // stands - confirm whether the ToDo above is stale.
+
+      /** Description of a single sparsity sweep. */
+      struct sparsitySweep {
+        /** Sparsity value forwarded to callKernels(). */
+        double sparsity;
+        /** Text appended to the kernel name to form the CSV file name. */
+        const char* csvSuffix;
+        /** Problem name handed to printOffloadThreshold(). */
+        const char* problemName;
+      };
+      const sparsitySweep sweeps[] = {
+          {0.99, "_sparse_square_99.csv", "Sparse Square 0.99"},
+          {0.999, "_sparse_square_999.csv", "Sparse Square 0.999"},
+          {0.9999, "_sparse_square_9999.csv", "Sparse Square 0.9999"},
+          {0.99999, "_sparse_square_99999.csv", "Sparse Square 0.99999"},
+      };
+
+      for (const sparsitySweep& sweep : sweeps) {
+        // Re-initialise offload threshold structures & previous results so
+        // that no state leaks from one sweep into the next.
+        cpuGpu_always_ = cpuGpu_offloadThreshold();
+        cpuGpu_once_ = cpuGpu_offloadThreshold();
+        cpuGpu_unified_ = cpuGpu_offloadThreshold();
+        prev_gpuResult_always = time_checksum_gflop();
+        prev_gpuResult_once = time_checksum_gflop();
+        prev_gpuResult_unified = time_checksum_gflop();
+
+        std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
+                                            getKernelName() + sweep.csvSuffix);
+        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+          // callKernels() takes the sparsity as a float.
+          callKernels(csvFile, dim, static_cast<float>(sweep.sparsity));
+        }
+        // Close file (every sweep, not just the first one)
+        csvFile.close();
+
+        if (doCPU_ && doGPU_) {
+          // Print offload results to stdout
+          printOffloadThreshold(sweep.problemName);
+        }
+      }
+    }
+    /** Ensure all CPU and GPU checksums are within the permitted limit of
+     * each other. */
+    void checkChecksums(time_checksum_gflop cpuResult,
+                        time_checksum_gflop gpuResult_once,
+                        time_checksum_gflop gpuResult_always,
+                        time_checksum_gflop gpuResult_unified, const int M,
+                        const int N, const int K) {
+      // Every GPU checksum must be within 0.1% of the CPU checksum. If ANY of
+      // the three offload variants is outside that limit, all checksums are
+      // printed to stderr and the program exits with status 1.
+      // (Previously the three comparisons were joined with `&&`, so a mismatch
+      // was only reported when all three variants disagreed with the CPU.)
+      const double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum);
+      const auto exceedsLimit =
+          [&cpuResult, hundredOverChecksum](const time_checksum_gflop& gpuResult) {
+            // Relative difference to the CPU checksum, expressed in percent.
+            return (std::fabs(cpuResult.checksum - gpuResult.checksum) *
+                    hundredOverChecksum) > 0.1;
+          };
+
+      if (exceedsLimit(gpuResult_once) || exceedsLimit(gpuResult_always) ||
+          exceedsLimit(gpuResult_unified)) {
+        std::cerr << "ERROR - " << getKernelName()
+                  << " kernel checksums do not match:\n\tInput "
+                     "dimensions: M="
+                  << M << ", N=" << N << ", K=" << K << std::endl;
+        std::cerr << std::setprecision(10)
+                  << "\tCPU Checksum = " << cpuResult.checksum << std::endl;
+        std::cerr << std::setprecision(10)
+                  << "\tGPU (Once) Checksum = " << gpuResult_once.checksum
+                  << std::endl;
+        std::cerr << std::setprecision(10)
+                  << "\tGPU (Always) Checksum = " << gpuResult_always.checksum
+                  << std::endl;
+        std::cerr << std::setprecision(10)
+                  << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum
+                  << std::endl;
+        exit(1);
+      }
+    }
+    /** Clear any offload threshold that the latest results no longer support.
+     *
+     * A threshold that has been recorded (M != 0) is zeroed again when the CPU
+     * GFLOP/s is greater than or equal to the GPU GFLOP/s for both the current
+     * and the previous problem size, as the GPU may not actually have reached
+     * its offload threshold yet. Each offload type is handled independently. */
+    void checkOffloadStructReset(time_checksum_gflop cpuResult,
+                                 time_checksum_gflop gpuResult_once,
+                                 time_checksum_gflop gpuResult_always,
+                                 time_checksum_gflop gpuResult_unified) {
+      const auto clearIfCpuStillAhead =
+          [&cpuResult](cpuGpu_offloadThreshold& threshold,
+                       const time_checksum_gflop& current,
+                       const time_checksum_gflop& previous) {
+            const bool cpuAheadTwice = (cpuResult.gflops >= current.gflops) &&
+                                       (cpuResult.gflops >= previous.gflops);
+            if ((threshold.M == 0) || !cpuAheadTwice) {
+              // Nothing recorded, or the GPU won at least one of the two sizes.
+              return;
+            }
+            threshold.cpuGflops = 0.0;
+            threshold.gpuGflops = 0.0;
+            threshold.probSize_kib = 0.0;
+            threshold.M = 0;
+            threshold.N = 0;
+            threshold.K = 0;
+          };
+
+      clearIfCpuStillAhead(cpuGpu_once_, gpuResult_once, prev_gpuResult_once);
+      clearIfCpuStillAhead(cpuGpu_always_, gpuResult_always,
+                           prev_gpuResult_always);
+      clearIfCpuStillAhead(cpuGpu_unified_, gpuResult_unified,
+                           prev_gpuResult_unified);
+    }
+    /** Record an offload threshold for every offload type that has none yet
+     * (M == 0) and whose GPU GFLOP/s exceeds the CPU GFLOP/s for this problem.
+     *
+     * The recorded entry holds both GFLOP/s figures, the problem size in KiB
+     * and the dimensions M, N and K. */
+    void updateOffloadStructs(time_checksum_gflop cpuResult,
+                              time_checksum_gflop gpuResult_once,
+                              time_checksum_gflop gpuResult_always,
+                              time_checksum_gflop gpuResult_unified, const int M,
+                              const int N, const int K, const double probSize) {
+      const auto recordIfGpuFaster =
+          [&cpuResult, M, N, K, probSize](cpuGpu_offloadThreshold& threshold,
+                                          const time_checksum_gflop& gpuResult) {
+            const bool unset = (threshold.M == 0);
+            if (unset && cpuResult.gflops < gpuResult.gflops) {
+              threshold.cpuGflops = cpuResult.gflops;
+              threshold.gpuGflops = gpuResult.gflops;
+              threshold.probSize_kib = probSize;
+              threshold.M = M;
+              threshold.N = N;
+              threshold.K = K;
+            }
+          };
+
+      recordIfGpuFaster(cpuGpu_once_, gpuResult_once);
+      recordIfGpuFaster(cpuGpu_always_, gpuResult_always);
+      recordIfGpuFaster(cpuGpu_unified_, gpuResult_unified);
+    }
+    /** Run the enabled kernels for one square problem (M = N = K = `N`) at the
+     * given sparsity and write one CSV line per kernel variant.
+     *
+     * When both CPU and GPU kernels are enabled, the offload threshold
+     * structures and the previous GPU results are updated as well, so that
+     * printOffloadThreshold() has real data to report. (Previously they were
+     * never updated, so every sweep reported "N/A".)
+     *
+     * @param csvFile  Open CSV stream the result lines are written to.
+     * @param N        Dimension used for M, N and K.
+     * @param sparsity Sparsity value forwarded to the kernels' initialise(). */
+    void callKernels(std::ofstream& csvFile, const int N, const float
+    sparsity) {
+      const double probSize = calcKib(N, N, N);
+      const uint64_t flops = calcFlops(N, N, N);
+      std::string kernelName = getKernelName();
+
+      // Results live at function scope so they can be compared once all
+      // kernels have run.
+      time_checksum_gflop cpuResult;
+      time_checksum_gflop gpuResult_once;
+      time_checksum_gflop gpuResult_always;
+      time_checksum_gflop gpuResult_unified;
+
+      // Perform the CPU kernel
+      if (doCPU_) {
+        cpu_.initialise(N, sparsity);
+        cpuResult = cpu_.compute();
+        cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
+        writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize,
+                       sparsity, iterations_, cpuResult.runtime,
+                       cpuResult.gflops);
+      }
+
+      // Perform the GPU kernels
+      if (doGPU_) {
+        // - UNIFIED : data passed from host to device (and device to host) as
+        //             needed
+        gpu_.initialise(gpuOffloadType::unified, N, sparsity);
+        gpuResult_unified = gpu_.compute();
+        gpuResult_unified.gflops =
+            calcGflops(flops, iterations_, gpuResult_unified.runtime);
+
+        // - ALWAYS: Offload to/from GPU every iteration
+        gpu_.initialise(gpuOffloadType::always, N, sparsity);
+        gpuResult_always = gpu_.compute();
+        gpuResult_always.gflops =
+            calcGflops(flops, iterations_, gpuResult_always.runtime);
+
+        // - ONCE : Offload to/from GPU once before all iterations and once
+        // after
+        gpu_.initialise(gpuOffloadType::once, N, sparsity);
+        gpuResult_once = gpu_.compute();
+        gpuResult_once.gflops =
+            calcGflops(flops, iterations_, gpuResult_once.runtime);
+
+        // ToDo -- non-default GPU operations
+
+        // Write lines to CSV file
+        writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N,
+                       probSize, sparsity, iterations_, gpuResult_once.runtime,
+                       gpuResult_once.gflops);
+        writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N,
+                       probSize, sparsity, iterations_,
+                       gpuResult_always.runtime, gpuResult_always.gflops);
+        writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
+                       sparsity, iterations_, gpuResult_unified.runtime,
+                       gpuResult_unified.gflops);
+      }
+
+      if (doCPU_ && doGPU_) {
+        // NOTE(review): checkChecksums() is intentionally not called here, as
+        // it terminates the program on a mismatch and it has not been
+        // confirmed that the CPU and GPU sparse kernels produce comparable
+        // checksums. Enable it once that is verified.
+
+        // Check if offload structs should be reset
+        checkOffloadStructReset(cpuResult, gpuResult_once, gpuResult_always,
+                                gpuResult_unified);
+        // Update offload structs
+        updateOffloadStructs(cpuResult, gpuResult_once, gpuResult_always,
+                             gpuResult_unified, N, N, N, probSize);
+        // Remember these results for the next problem size
+        prev_gpuResult_once = gpuResult_once;
+        prev_gpuResult_always = gpuResult_always;
+        prev_gpuResult_unified = gpuResult_unified;
+      }
+    }
+    /** Calculate the FLOPs attributed to one GEMM-style operation,
+     * C = alpha*AB + beta*C, using the dense formula:
+     *   - A * B      : 2*M*N*K (FMA)
+     *   - alpha * AB : M*N (multiplication, always done)
+     *   - beta * C   : M*N (multiplication, only if beta != 0)
+     *   - AB + C     : M*N (addition, only if beta != 0)
+     * i.e. 2MNK + MN when BETA == 0, otherwise 2MNK + 3MN. */
+    constexpr uint64_t calcFlops(const int M, const int N, const int K) const {
+      const uint64_t rows = static_cast<uint64_t>(M);
+      const uint64_t cols = static_cast<uint64_t>(N);
+      const uint64_t inner = static_cast<uint64_t>(K);
+      // Number of elements in the output matrix.
+      const uint64_t outputElems = rows * cols;
+      // Scalar operations applied to every output element.
+      uint64_t opsPerOutputElem = 1;
+      if (BETA != 0) {
+        opsPerOutputElem = 3;
+      }
+      return (2 * outputElems * inner) + (opsPerOutputElem * outputElems);
+    }
+    /** Calculate the total problem size in KiB: the element counts of an MxK,
+     * a KxN and an MxN matrix, multiplied by sizeof(T) and divided by 1024. */
+    constexpr double calcKib(const int M, const int N, const int K) const {
+      const uint64_t rows = static_cast<uint64_t>(M);
+      const uint64_t cols = static_cast<uint64_t>(N);
+      const uint64_t inner = static_cast<uint64_t>(K);
+      const uint64_t elementCount =
+          (rows * inner) + (inner * cols) + (rows * cols);
+      const uint64_t byteCount = elementCount * sizeof(T);
+      return static_cast<double>(byteCount) / 1024;
+    }
+    /** Get the name of the kernel being run, chosen by the size of T:
+     * 4 bytes -> "sgemm", 8 bytes -> "dgemm", anything else -> "unknown". */
+    std::string getKernelName() const {
+      constexpr std::size_t elementBytes = sizeof(T);
+      if (elementBytes == 4) {
+        return "sgemm";
+      }
+      if (elementBytes == 8) {
+        return "dgemm";
+      }
+      return "unknown";
+    }
+    /** Print the offload thresholds as a table via tablePrinter.
+     *
+     * One row is produced per offload type (once, always, unified). A row whose
+     * threshold has M == 0 is reported with zero dimensions and "N/A" in both
+     * GFLOP/s columns.
+     *
+     * @param problemName Prefix of the table title. */
+    void printOffloadThreshold(const std::string& problemName) const {
+      std::vector<std::string> header = {
+              "Device",  "M",          "N", "K", "Total Prob. Size (KiB)",
+              "GFLOP/s", "CPU GFLOP/s"};
+
+      // Format a number in fixed notation with two decimal places.
+      const auto twoDecimals = [](const auto& value) {
+        std::stringstream formatted;
+        formatted << std::fixed << std::setprecision(2) << value;
+        return formatted.str();
+      };
+
+      // Build the table row for one offload type.
+      const auto makeRow =
+          [&twoDecimals](const std::string& device,
+                         const cpuGpu_offloadThreshold& threshold)
+          -> std::vector<std::string> {
+        const std::string probSize = twoDecimals(threshold.probSize_kib);
+        if (threshold.M == 0) {
+          // No offload threshold found
+          return {device,   std::to_string(0), std::to_string(0),
+                  std::to_string(0), probSize, "N/A", "N/A"};
+        }
+        return {device,
+                std::to_string(threshold.M),
+                std::to_string(threshold.N),
+                std::to_string(threshold.K),
+                probSize,
+                twoDecimals(threshold.gpuGflops),
+                twoDecimals(threshold.cpuGflops)};
+      };
+
+      std::vector<std::vector<std::string>> rows;
+      rows.push_back(makeRow("GPU (Offload Once)", cpuGpu_once_));
+      rows.push_back(makeRow("GPU (Offload Always)", cpuGpu_always_));
+      rows.push_back(makeRow("GPU (Unified Memory)", cpuGpu_unified_));
+
+      // Print table
+      tablePrinter tPrinter(
+              problemName + " Problem Domian GPU Offload Thresholds:", header, rows);
+      tPrinter.print(1);
+    }
+    /** The output directory that CSV files are saved to. */
+    const std::string CSV_DIR;
+    /** The number of iterations to perform per problem size. */
+    const int iterations_;
+    /** The value of the first problem size dimension run. */
+    const int startDimention_;
+    /** The maximum value of the largest problem size dimension. */
+    const int upperLimit_;
+    /** Whether the CPU kernels should be run. */
+    const bool doCPU_ = true;
+    /** Whether the GPU kernels should be run. */
+    const bool doGPU_ = true;
+    /** The CPU kernel object; constructed with iterations_. */
+  cpu::spmm_cpu<T> cpu_;
+    /** The GPU kernel object; constructed with iterations_. */
+	gpu::spmm_gpu<T> gpu_;
+    /** The point at which offloading to GPU (offload once) becomes worthwhile. */
+    cpuGpu_offloadThreshold cpuGpu_once_;
+    /** The point at which offloading to GPU (offload always) becomes worthwhile.
+     */
+    cpuGpu_offloadThreshold cpuGpu_always_;
+    /** The point at which offloading to GPU (unified memory) becomes worthwhile.
+     */
+    cpuGpu_offloadThreshold cpuGpu_unified_;
+    /** The previous problem size's GPU (offload once) performance results. */
+    time_checksum_gflop prev_gpuResult_once;
+    /** The previous problem size's GPU (offload always) performance results. */
+    time_checksum_gflop prev_gpuResult_always;
+    /** The previous problem size's GPU (unified memory) performance results. */
+    time_checksum_gflop prev_gpuResult_unified;
\ No newline at end of file
diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
deleted file mode 100644
index c431d4d..0000000
--- a/include/kernels/CPU/sp_gemm.hh
+++ /dev/null
@@ -1,108 +0,0 @@
-#pragma once
-#include "../gemm.hh"
-#include <random>
-#include <memory>
-#include <iostream>
-namespace cpu {
-/** An abstract class for GEMM BLAS kernels. */
-		template <typename T>
-		class sp_gemm : public ::gemm<T> {
-		public:
-        using ::gemm<T>::gemm;
-        using ::gemm<T>::initInputMatricesSparse;
-        using ::gemm<T>::toCSR_int;
-				using ::gemm<T>::iterations_;
-        using ::gemm<T>::m_;
-				using ::gemm<T>::n_;
-				using ::gemm<T>::k_;
-				using ::gemm<T>::A_;
-				using ::gemm<T>::B_;
-				using ::gemm<T>::C_;
-		public:
-			/** Initialise the required data structures. */
-			virtual void initialise(int n, double sparsity, bool binary = false) {
-				n_ = n;
-        sparsity_ = sparsity;
-        // Note that the below should be the same as the edges calculation
-        // used in the initInputMatricesSparse function.  If changed here,
-        // change there
-        nnz_ = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity_));
-//        std::cout << "\t____About to malloc()____" << std::endl;
-				A_ = (T*)malloc(sizeof(T) * n_ * n_);
-				B_ = (T*)malloc(sizeof(T) * n_ * n_);
-				C_ = (T*)malloc(sizeof(T) * n_ * n_);
-				initInputMatricesSparse(sparsity);
-        toCSR_int();
-			}
-      uint64_t nnz_;
-    protected:
-        T* A_vals_;
-        T* B_vals_;
-        T* C_vals_;
-    private:
-				/** Do any necessary cleanup (free pointers, close library handles, etc.)
-				 * after Kernel has been called. */
-      void postCallKernelCleanup() {
-        free(A_);
-        free(B_);
-        free(C_);
-      }
-      void toCSR_int() {
-        // Move A to CSR
-        A_row_ptr_ = new int[n_ + 1];
-        A_col_index_ = new int[nnz_];
-        A_vals_ = new T[nnz_];
-        int nnz_encountered = 0;
-        for (int row = 0; row < n_; row++) {
-          A_row_ptr_[row] = nnz_encountered;
-          for (int col = 0; col < n_; col++) {
-            if (A_[(row * n_) + col] != 0.0) {
-              A_col_index_[nnz_encountered] = col;
-              A_vals_[nnz_encountered] = A_[(row * n_) + col];
-              nnz_encountered++;
-            }
-          }
-        }
-        // Move B to CSR
-        B_row_ptr_ = new int[n_ + 1];
-        B_col_index_ = new int[nnz_];
-        B_vals_ = new T[nnz_];
-        nnz_encountered = 0;
-        for (int row = 0; row < n_; row++) {
-          B_row_ptr_[row] = nnz_encountered;
-          for (int col = 0; col < n_; col++) {
-            if (B_[(row * n_) + col] != 0.0) {
-              B_col_index_[nnz_encountered] = col;
-              B_vals_[nnz_encountered] = B_[(row * n_) + col];
-              nnz_encountered++;
-            }
-          }
-        }
-      }
-      double sparsity_;
-      int* A_row_ptr_;
-      int* A_col_index_;
-      int* B_row_ptr_;
-      int* B_col_index_;
-      int* C_row_ptr_;
-      int* C_col_index_;
-		};
-}  // namespace cpu
diff --git a/include/kernels/CPU/sp_gemv.hh b/include/kernels/CPU/spgemv.hh
similarity index 100%
rename from include/kernels/CPU/sp_gemv.hh
rename to include/kernels/CPU/spgemv.hh
diff --git a/include/kernels/CPU/spgmm.hh b/include/kernels/CPU/spgmm.hh
new file mode 100644
index 0000000..59856ed
--- /dev/null
+++ b/include/kernels/CPU/spgmm.hh
@@ -0,0 +1,8 @@
+// Created by Alexander Cockrean on 07/01/2025.
diff --git a/include/kernels/CPU/spmm.hh b/include/kernels/CPU/spmm.hh
new file mode 100644
index 0000000..7d19f5d
--- /dev/null
+++ b/include/kernels/CPU/spmm.hh
@@ -0,0 +1,60 @@
+#pragma once
+#include "../spmm.hh"
+#include <random>
+#include <memory>
+#include <iostream>
+namespace cpu {
+/** An abstract class for sparse matrix-sparse matrix BLAS kernels. */
+template <typename T>
+class spmm : public ::spmm<T> {
+  // Re-export the base-class constructor, helpers and state so that library
+  // specific CPU kernels can refer to them without `this->` qualification.
+  // Only names that the generic ::spmm<T> actually declares are listed: a
+  // using-declaration for a missing member fails when the template is
+  // instantiated.
+  using ::spmm<T>::spmm;
+  using ::spmm<T>::initInputMatrices;
+  using ::spmm<T>::iterations_;
+  using ::spmm<T>::nnzA_;
+  using ::spmm<T>::nnzB_;
+  using ::spmm<T>::m_;
+  using ::spmm<T>::n_;
+  using ::spmm<T>::k_;
+  using ::spmm<T>::A_;
+  using ::spmm<T>::B_;
+  using ::spmm<T>::C_;
+  /** Initialise the required data structures.
+   *
+   * Stores the problem dimensions, derives the number of non-zero elements
+   * wanted in A (m x k) and B (k x n) from `sparsity`, allocates the dense
+   * host buffers and asks the base class to populate the inputs.
+   *
+   * NOTE: the parameter order here is (n, m, k).
+   *
+   * @param n        Number of columns of B and C.
+   * @param m        Number of rows of A and C.
+   * @param k        Number of columns of A / rows of B.
+   * @param sparsity Fraction of each input that should be zero; the non-zero
+   *                 count is 1 + floor(rows * cols * (1 - sparsity)).
+   * @param binary   Currently unused by this function.
+   */
+  void initialise(int n, int m, int k, double sparsity,
+                          bool binary = false) {
+    n_ = n;
+    m_ = m;
+    k_ = k;
+    sparsity_ = sparsity;
+    /** Determine the number of nnz elements in A and B */
+    nnzA_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
+    nnzB_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_));
+    A_ = (T*)malloc(sizeof(T) * m_ * k_);
+    B_ = (T*)malloc(sizeof(T) * k_ * n_);
+    // calloc takes (element count, element size); the count is formed in
+    // size_t so that m_ * n_ cannot overflow int.
+    C_ = (T*)calloc(static_cast<size_t>(m_) * static_cast<size_t>(n_),
+                    sizeof(T));
+    // The base-class initialiser takes no arguments: it reads m_, n_, k_,
+    // nnzA_ and nnzB_, which were all set above.
+    initInputMatrices();
+  }
+    /** Release the dense host buffers A_, B_ and C_ once the kernel run has
+     * finished. */
+  void postCallKernelCleanup() {
+    T* const denseBuffers[] = {A_, B_, C_};
+    for (T* buffer : denseBuffers) {
+      free(buffer);
+    }
+  }
+  double sparsity_;
+}  // namespace cpu
diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh
deleted file mode 100644
index 52a5494..0000000
--- a/include/kernels/GPU/sp_gemm.hh
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-#include "../gemm.hh"
-namespace gpu {
-/** An abstract class for GEMM BLAS kernels. */
-		template <typename T>
-		class sp_gemm : public ::gemm<T> {
-		public:
-				using ::gemm<T>::gemm;
-				/** Initialise the required data structures.
-				 * `offload` refers to the data offload type:
-				 *  - Once:    Move data from host to device before all iterations & move from
-				 *             device to host after all iterations
-				 *  - Always:  Move data from host to device and device to host each iteration
-				 *  - Unified: Initialise data as unified memory; no data movement semantics
-				 *             required */
-				virtual void initialise(gpuOffloadType offload, int n, float sparsity)
-        = 0;
-		protected:
-				/** Whether data should be offloaded to/from the GPU each iteration, or just
-				 * before & after. */
-				gpuOffloadType offload_ = gpuOffloadType::always;
-		};
-}  // namespace gpu
\ No newline at end of file
diff --git a/include/kernels/GPU/spgemm.hh b/include/kernels/GPU/spgemm.hh
new file mode 100644
index 0000000..917469b
--- /dev/null
+++ b/include/kernels/GPU/spgemm.hh
@@ -0,0 +1,8 @@
+// Created by Alexander Cockrean on 07/01/2025.
diff --git a/include/kernels/GPU/sp_gemv.hh b/include/kernels/GPU/spgemv.hh
similarity index 100%
rename from include/kernels/GPU/sp_gemv.hh
rename to include/kernels/GPU/spgemv.hh
diff --git a/include/kernels/GPU/spmm.hh b/include/kernels/GPU/spmm.hh
new file mode 100644
index 0000000..3f5002e
--- /dev/null
+++ b/include/kernels/GPU/spmm.hh
@@ -0,0 +1,28 @@
+#pragma once
+#include "../spmm.hh"
+namespace gpu {
+/** An abstract class for sparse matrix-sparse matrix BLAS kernels. */
+template <typename T>
+class spmm : public ::spmm<T> {
+    using ::spmm<T>::spmm;
+    /** Initialise the required data structures.
+     *
+     * `offload` refers to the data offload type:
+     *  - Once:    Move data from host to device before all iterations & move
+     *             from device to host after all iterations
+     *  - Always:  Move data from host to device and device to host each
+     *             iteration
+     *  - Unified: Initialise data as unified memory; no data movement
+     *             semantics required
+     *
+     * `m`, `n` and `k` are the problem dimensions (presumably A is m x k and
+     * B is k x n, as on the CPU side -- confirm against the implementations).
+     * `sparsity` is presumably the fraction of zero entries in the inputs and
+     * `binary` presumably selects 1.0-valued non-zeros; both are interpreted
+     * by the implementing class -- TODO confirm.
+     *
+     * NOTE(review): the dimension order here is (m, n, k). */
+    virtual void initialise(gpuOffloadType offload, int m, int n, int k,
+                            double sparsity, bool binary = false) = 0;
+    /** Whether data should be offloaded to/from the GPU each iteration, or just
+     * before & after. */
+    gpuOffloadType offload_ = gpuOffloadType::always;
+}  // namespace gpu
\ No newline at end of file
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index 6e1328e..3f0aece 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -92,137 +92,9 @@ class gemm {
-  // Note that the below should be the same as the nnz calculation
-  // used in the cpu initialise functions.  If changed here,
-  // change there
-  void initInputMatricesSparse(float sparsity) {
-    for (int i = 0; i < (n_ * n_); i++) {
-      A_[i] = 0.0;
-      B_[i] = 0.0;
-    }
-    // Random number generator objects for use in descent
-    std::default_random_engine gen;
-    gen.seed(std::chrono::system_clock::now()
-                     .time_since_epoch().count());
-    std::uniform_real_distribution<double> dist(0.0, 1.0);
-    int edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity));
-    // Using a=0.45 and b=c=0.22 as default probabilities
-    for (int i = 0; i < edges; i++) {
-      while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist,
-                   false)) {}
-      while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist,
-                   false)) {}
-    }
-  }
   /** Call the extern consume() function. */
   void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }
-  /** Recursive function to populate sparse matrices */
-  bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b,
-            float c, std::default_random_engine* gen,
-            std::uniform_real_distribution<double> dist, bool bin) {
-    // If a 1x1 submatrix, then add an edge and return out
-    if (x1 >= x2 && y1 >= y2) {
-      // Needed to avoid overfloe segfaults with large problem sizes
-      uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1);
-      if (abs(M[index]) > 0.1) {
-        return false;
-      } else {
-        // Add 1.0 if this is a binary graph, and a random real number otherwise
-        M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0);
-        return true;
-      }
-    } else {
-      // Divide up the matrix
-      int xMidPoint = x1 + floor((x2 - x1) / 2);
-      int yMidPoint = y1 + floor((y2 - y1) / 2);
-      // ToDo -- add some noise to these values between iterations
-      float newA = a;
-      float newB = b;
-      float newC = c;
-      // Work out which quarter to recurse into
-      // There are some ugly ternary operators here to avoid going out of bounds in the edge case
-      // that we are already at 1 width or 1 height
-      float randomNum = dist(*gen);
-      if (randomNum < a) {
-        return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
-                    newA, newB, newC, gen, dist, bin);
-      } else if (randomNum < (a + b)) {
-        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
-                    newA, newB, newC, gen, dist, bin);
-      } else if (randomNum < (a + b + c)) {
-        return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
-                    newA, newB, newC, gen, dist, bin);
-      } else {
-        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
-                    ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC,
-                    gen, dist, bin);
-      }
-    }
-  }
-  void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index,
-             int* row_ptr) {
-    int nnz_encountered = 0;
-    for (int row = 0; row < n_row; row++) {
-      row_ptr[row] = nnz_encountered;
-      int nnz_row = 0;
-      for (int col = 0; col < n_col; col++) {
-        if (dense[(row * n_col) + col] != 0.0) {
-          nnz_row++;
-          col_index[nnz_encountered] = col;
-          vals[nnz_encountered] = dense[(row * n_col) + col];
-          nnz_encountered++;
-        }
-      }
-    }
-    row_ptr[n_row] = nnz_encountered;
-  }
-#ifdef CPU_ONEMKL
-  void toCSR_mkl(T* dense, int n_col, int n_row, T* vals, MKL_INT* col_index,
-                 MKL_INT* row_ptr) {
-    int nnz_encountered = 0;
-    for (int row = 0; row < n_row; row++) {
-      row_ptr[row] = (MKL_INT)nnz_encountered;
-      int nnz_row = 0;
-      for (int col = 0; col < n_col; col++) {
-        if (dense[(row * n_col) + col] != 0.0) {
-          nnz_row++;
-          col_index[nnz_encountered] = (MKL_INT)col;
-          vals[nnz_encountered] = dense[(row * n_col) + col];
-          nnz_encountered++;
-        }
-      }
-    }
-    row_ptr[n_row] = (MKL_INT)nnz_encountered;
-  }
-#ifdef CPU_AOCL
-    void toCSR_aocl(T* dense, int n_col, int n_row, T* vals, aoclsparse_int*
-    col_index, aoclsparse_int* row_ptr) {
-    int nnz_encountered = 0;
-    for (int row = 0; row < n_row; row++) {
-      row_ptr[row] = (aoclsparse_int)nnz_encountered;
-      int nnz_row = 0;
-      for (int col = 0; col < n_col; col++) {
-        if (dense[(row * n_col) + col] != 0.0) {
-          nnz_row++;
-          col_index[nnz_encountered] = (aoclsparse_int)col;
-          vals[nnz_encountered] = dense[(row * n_col) + col];
-          nnz_encountered++;
-        }
-      }
-    }
-    row_ptr[n_row] = (MKL_INT)nnz_encountered;
-  }
   /** The number of iterations to perform per problem size. */
   const int iterations_;
diff --git a/include/kernels/spgemm.hh b/include/kernels/spgemm.hh
new file mode 100644
index 0000000..917469b
--- /dev/null
+++ b/include/kernels/spgemm.hh
@@ -0,0 +1,8 @@
+// Created by Alexander Cockrean on 07/01/2025.
diff --git a/include/kernels/spgemv.hh b/include/kernels/spgemv.hh
new file mode 100644
index 0000000..9e7d953
--- /dev/null
+++ b/include/kernels/spgemv.hh
@@ -0,0 +1,8 @@
+// Created by Alexander Cockrean on 07/01/2025.
diff --git a/include/kernels/spmm.hh b/include/kernels/spmm.hh
new file mode 100644
index 0000000..37de9cf
--- /dev/null
+++ b/include/kernels/spmm.hh
@@ -0,0 +1,168 @@
+#pragma once
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <limits>
+#include <random>
+#include <iostream>
+#include "../utilities.hh"
+/** A generic abstract class defining the operation of timing a SPMM BLAS
+ * kernel for n iterations */
+template <typename T>
+class spmm {
+    /** Create a kernel runner; `iters` is the number of kernel calls that
+     * compute() performs per invocation. */
+    spmm(const int iters) : iterations_(iters) {}
+    /** Run the SPMM kernel `iterations_` times and time the whole batch.
+     * The timed region covers preLoopRequirements(), every callSpmm() and
+     * postLoopRequirements(); the checksum and the untimed cleanup follow.
+     * Returns {elapsed seconds, checksum, 0.0}. */
+    time_checksum_gflop compute() {
+      using timer_clock = std::chrono::high_resolution_clock;
+      // Timed region begins
+      const timer_clock::time_point timerStart = timer_clock::now();
+      preLoopRequirements();
+      for (int remaining = iterations_; remaining > 0; --remaining) {
+        callSpmm();
+      }
+      postLoopRequirements();
+      // Timed region ends
+      const timer_clock::time_point timerStop = timer_clock::now();
+      const std::chrono::duration<double> elapsed = timerStop - timerStart;
+      const double checksum = calcChecksum();
+      postCallKernelCleanup();
+      return {elapsed.count(), checksum, 0.0};
+    }
+    int64_t nnzA_ = 0;
+    int64_t nnzB_ = 0;
+    int64_t nnzC_ = 0;
+    /** Performs the steps required before calling the SPMM kernel that
+     * should be timed */
+    virtual void preLoopRequirements() = 0;
+    /** Perform the SPMM kernel. */
+    virtual void callSpmm() = 0;
+    /** Perform any steps required after calling the SPMM kernel that should
+     * be timed */
+    virtual void postLoopRequirements() = 0;
+    /** Do the necessary cleanup after the kernel has been finished that
+     * should not be timed */
+    virtual void postCallKernelCleanup() = 0;
+    /** Checksum of the result matrix C.
+     * Placeholder: no checksum is defined for SPMM yet, so this always
+     * returns 0.0 and does not read C_. */
+    constexpr double calcChecksum() {
+      // Todo -- think about how this can sensibly be done for SPMM
+      return 0.0;
+    }
+    /** Set up the starting matrices.
+     *
+     * Zeroes the dense buffers A_ (m_ x k_) and B_ (k_ x n_), places nnzA_
+     * and nnzB_ entries into them with rMat(), then calls toSparseFormat().
+     * m_, n_, k_, nnzA_, nnzB_, A_ and B_ must have been set by the caller.
+     */
+    void initInputMatrices() {
+      // Element counts are formed in size_t: the int products m_ * k_ and
+      // k_ * n_ overflow for large problem sizes.
+      const size_t elemsA = static_cast<size_t>(m_) * static_cast<size_t>(k_);
+      const size_t elemsB = static_cast<size_t>(k_) * static_cast<size_t>(n_);
+      std::fill_n(A_, elemsA, static_cast<T>(0.0));
+      std::fill_n(B_, elemsB, static_cast<T>(0.0));
+
+      // Random number generator objects for use in descent
+      std::default_random_engine gen;
+      gen.seed(std::chrono::system_clock::now()
+                       .time_since_epoch().count());
+      std::uniform_real_distribution<double> dist(0.0, 1.0);
+
+      // Using a=0.45 and b=c=0.22 as default probabilities.
+      // The counters are int64_t to match nnzA_ / nnzB_ and avoid a
+      // signed/unsigned comparison.  rMat() returns false when it lands on
+      // an occupied cell, so each entry is retried until it is placed.
+      for (int64_t i = 0; i < nnzA_; i++) {
+        while (!rMat(A_, k_, 0, k_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
+                     false)) {}
+      }
+      for (int64_t i = 0; i < nnzB_; i++) {
+        while (!rMat(B_, n_, 0, n_ - 1, 0, k_ - 1, 0.45, 0.22, 0.22, &gen, dist,
+                     false)) {}
+      }
+      toSparseFormat();
+    }
+    /** Move matrices into the sparse representation required by the given library */
+    virtual void toSparseFormat() = 0;
+    /** Call the external consume() function on the matrices */
+    void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }
+    /** Recursive function to populate sparse matrices.
+     *
+     * Repeatedly picks one quadrant of the sub-matrix [x1, x2] x [y1, y2] at
+     * random until a single cell remains, then writes a value into it.
+     *
+     * @param M    Dense row-major matrix to write into.
+     * @param n    Row stride of M (number of columns); on the first call
+     *             this should be x2 + 1.
+     * @param x1   First column of the current sub-matrix (inclusive).
+     * @param x2   Last column of the current sub-matrix (inclusive).
+     * @param y1   First row of the current sub-matrix (inclusive).
+     * @param y2   Last row of the current sub-matrix (inclusive).
+     * @param a    Probability of descending into the low-x / low-y quadrant.
+     * @param b    Probability of descending into the high-x / low-y quadrant.
+     * @param c    Probability of descending into the low-x / high-y quadrant;
+     *             the high-x / high-y quadrant takes the remainder.
+     * @param gen  Random engine used to pick quadrants.
+     * @param dist Distribution sampled with `gen` and compared against a, b
+     *             and c.
+     * @param bin  If true the cell is set to 1.0, otherwise to a pseudo-random
+     *             value in [-50.0, 50.0).
+     * @return true if a value was written; false if the selected cell already
+     *         held a value of magnitude greater than 0.1.
+     */
+    bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b,
+              float c, std::default_random_engine* gen,
+              std::uniform_real_distribution<double> dist, bool bin) {
+      // If a 1x1 submatrix, then add an edge and return out
+      if (x1 >= x2 && y1 >= y2) {
+        // Needed to avoid overflow segfaults with large problem sizes
+        const uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1);
+        // std::fabs rather than unqualified abs: the latter can resolve to
+        // the C int abs(int), truncating the value before the comparison.
+        if (std::fabs(M[index]) > 0.1) {
+          return false;
+        }
+        // Add 1.0 if this is a binary graph, and a random real number otherwise
+        M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0);
+        return true;
+      }
+      // Divide up the matrix (integer division already rounds down here)
+      const int xMidPoint = x1 + (x2 - x1) / 2;
+      const int yMidPoint = y1 + (y2 - y1) / 2;
+      // Start of the upper half in each direction.  If the range is already
+      // one wide / one high, stay on the midpoint to avoid going out of
+      // bounds.
+      const int xUpperStart = (xMidPoint < x2) ? xMidPoint + 1 : xMidPoint;
+      const int yUpperStart = (yMidPoint < y2) ? yMidPoint + 1 : yMidPoint;
+      // Work out which quarter to recurse into
+      const float randomNum = dist(*gen);
+      if (randomNum < a) {
+        return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
+                    a, b, c, gen, dist, bin);
+      }
+      if (randomNum < (a + b)) {
+        return rMat(M, n, xUpperStart, x2, y1, yMidPoint,
+                    a, b, c, gen, dist, bin);
+      }
+      if (randomNum < (a + b + c)) {
+        return rMat(M, n, x1, xMidPoint, yUpperStart, y2,
+                    a, b, c, gen, dist, bin);
+      }
+      return rMat(M, n, xUpperStart, x2, yUpperStart, y2,
+                  a, b, c, gen, dist, bin);
+    }
+    /** The number of iterations to perform per problem size. */
+    const int iterations_;
+    /** Matrix dimension M. */
+    int m_ = 0;
+    /** Matrix dimension N. */
+    int n_ = 0;
+    /** Matrix dimension K. */
+    int k_ = 0;
+    /** Dense representation of input matrix A. */
+    T* A_;
+    /** Dense representation of input matrix B. */
+    T* B_;
+    /** Dense representation of output matrix C. */
+    T* C_;
\ No newline at end of file
diff --git a/include/main.hh b/include/main.hh
index f12ebcb..f639407 100644
--- a/include/main.hh
+++ b/include/main.hh
@@ -5,7 +5,10 @@
 #include <string>
 #include "doGemm.hh"
+#include "doSpgemm.hh"
+#include "doSpmm.hh"
 #include "doGemv.hh"
+#include "doSpgemv.hh"
 #include "utilities.hh"
 /** A function which prints standard configuration information to stdout. */
diff --git a/src/ b/src/
index bdc1db2..8bb7412 100644
--- a/src/
+++ b/src/
@@ -3,14 +3,21 @@
 int iters = 10;
 int startDim = 1;
 int upperLimit = 128;
+// GEMM kernels
 bool doSgemm = true;
 bool doDgemm = true;
-bool doSp_sgemm = true;
-bool doSp_dgemm = true;
+// Sparse GEMM kernels
+bool doSspgemm = true;
+bool doDspgemm = true;
+// GEMV kernels
 bool doSgemv = true;
 bool doDgemv = true;
-bool doSp_sgemv = true;
-bool doSp_dgemv = true;
+// Sparse GEMV kernels
+bool doSspgemv = true;
+bool doDspgemv = true;
+// Sparse-sparse matrix multiplication kernels
+bool doSspmm = true;
+bool doDspmm = true;
 bool doCpu = CPU_ENABLED;
 bool doGpu = GPU_ENABLED;
@@ -39,33 +46,101 @@ int main(int argc, char** argv) {
   // -------- GEMM --------
   // SGEMM Comparison
-  std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl;
-  doGemm<float> sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                      doGpu, doSgemm, doSp_sgemm);
-  sgemm.collectData();
-  std::cout << "Finished!" << std::endl;
+  if (doSgemm) {
+    std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl;
+    doGemm<float> sgemm(std::string(absPath), iters, startDim, upperLimit,
+                        doCpu,
+                        doGpu);
+    sgemm.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+  // DGEMM Comparison
+  if (doDgemm) {
+    std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl;
+    doGemm<double> dgemm(std::string(absPath), iters, startDim, upperLimit,
+                         doCpu,
+                         doGpu);
+    dgemm.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+  // -------- SPGEMM --------
+  // SPGEMM Comparison
+  if (doSspgemm) {
+    std::cout << std::endl << "Comparing SSpGEMM Kernels:" << std::endl;
+    doSpgemm<float> sspgemm(std::string(absPath), iters, startDim, upperLimit,
+                            doCpu, doGpu);
+    sspgemm.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
   // DGEMM Comparison
-  std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl;
-  doGemm<double> dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                       doGpu, doDgemm, doSp_dgemm);
-  dgemm.collectData();
-  std::cout << "Finished!" << std::endl;
+  // Double-precision SpGEMM comparison; the banner names the kernel that
+  // doSpgemm<double> actually runs.
+  if (doDspgemm) {
+    std::cout << std::endl << "Comparing DSpGEMM Kernels:" << std::endl;
+    doSpgemm<double> dspgemm(std::string(absPath), iters, startDim, upperLimit,
+                             doCpu, doGpu);
+    dspgemm.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+  // -------- SPMM --------
+  // SSPMM comparison
+  if (doSspmm) {
+    std::cout << std::endl << "Comparing SSpMM Kernels:" << std::endl;
+    doSpmm<float> sspmm(std::string(absPath), iters, startDim, upperLimit,
+                            doCpu, doGpu);
+    sspmm.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+  // DSPMM Comparison
+  if (doDspmm) {
+    std::cout << std::endl << "Comparing DSpMM Kernels:" << std::endl;
+    doSpmm<double> dspmm(std::string(absPath), iters, startDim, upperLimit,
+                             doCpu, doGpu);
+    dspmm.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
   // -------- GEMV --------
   // SGEMV Comparison
-  std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl;
-  doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                      doGpu, doSgemv, doSp_sgemv);
-  sgemv.collectData();
-  std::cout << "Finished!" << std::endl;
+  if (doSgemv) {
+    std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl;
+    doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit,
+                        doCpu, doGpu);
+    sgemv.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
   // DGEMV Comparison
-  std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl;
-  doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                       doGpu, doDgemv, doSp_dgemv);
-  dgemv.collectData();
-  std::cout << "Finished!" << std::endl;
+  if (doDgemv) {
+    std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl;
+    doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit,
+                         doCpu, doGpu);
+    dgemv.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+  // -------- SPGEMV --------
+  // SSPGEMV Comparison
+  if (doSspgemv) {
+    std::cout << std::endl << "Comparing SSpGEMV Kernels:" << std::endl;
+    doSpgemv<float> sspgemv(std::string(absPath), iters, startDim, upperLimit,
+                        doCpu, doGpu);
+    sspgemv.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+  // DSPGEMV Comparison
+  // Gated on its own flag: doDgemv controls the dense DGEMV run above, so
+  // reusing it here would ignore a `-k dspgemv` selection.
+  if (doDspgemv) {
+    std::cout << std::endl << "Comparing DSpGEMV Kernels:" << std::endl;
+    doSpgemv<double> dspgemv(std::string(absPath), iters, startDim, upperLimit,
+                         doCpu, doGpu);
+    dspgemv.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
   return 0;
@@ -150,49 +225,20 @@ void getParameters(int argc, char** argv) {
     } else if (!strcmp(argv[i], "--no_gpu")) {
       doGpu = false;
     } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) {
-      doSgemm = doDgemm = doSp_sgemm = doSp_dgemm =
-      doSgemv = doDgemv = doSp_sgemv = doSp_dgemv = false;
       std::string kernelList = argv[++i];
-      if (kernelList.find("sp-sgemm") != std::string::npos) {
-        doSp_sgemm = true;
-        if (kernelList.find("sgemm") != std::string::npos &&
-            kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) {
-          doSgemm = true;
-        }
-      } else if (kernelList.find("sgemm") != std::string::npos) {
-        doSgemm = true;
-      }
-      if (kernelList.find("sp-dgemm") != std::string::npos) {
-        doSp_dgemm = true;
-        if (kernelList.find("dgemm") != std::string::npos &&
-            kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) {
-          doDgemm = true;
-        }
-      } else if (kernelList.find("dgemm") != std::string::npos) {
-        doDgemm = true;
-      }
+      doSgemm = (kernelList.find("sgemm") != std::string::npos);
+      doDgemm = (kernelList.find("dgemm") != std::string::npos);
+      doSspgemm = (kernelList.find("sspgemm") != std::string::npos);
+      doDspgemm = (kernelList.find("dspgemm") != std::string::npos);
+      doSspmm = (kernelList.find("sspmm") != std::string::npos);
+      doDspmm = (kernelList.find("dspmm") != std::string::npos);
+      doSgemv = (kernelList.find("sgemv") != std::string::npos);
+      doDgemv = (kernelList.find("dgemv") != std::string::npos);
+      doSspgemv = (kernelList.find("sspgemv") != std::string::npos);
+      doDspgemv = (kernelList.find("dspgemv") != std::string::npos);
-      if (kernelList.find("sp-sgemv") != std::string::npos) {
-        doSp_sgemv = true;
-        if (kernelList.find("sgemv") != std::string::npos &&
-            kernelList.find("sgemv") != kernelList.find("sp-sgemv") + 3) {
-          doSgemv = true;
-        }
-      } else if (kernelList.find("sgemv") != std::string::npos) {
-        doSgemv = true;
-      }
-      if (kernelList.find("sp-dgemv") != std::string::npos) {
-        doSp_dgemv = true;
-        if (kernelList.find("dgemv") != std::string::npos &&
-            kernelList.find("dgemv") != kernelList.find("sp-dgemv") + 3) {
-          doDgemv = true;
-        }
-      } else if (kernelList.find("dgemv") != std::string::npos) {
-        doDgemv = true;
-      }
-      if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm &&
-          !doSgemv && !doDgemv && !doSp_sgemv && !doSp_dgemv) {
+      // Every selectable kernel flag must appear here, otherwise a list that
+      // names only that kernel (e.g. `-k sspmm`) is reported as empty.
+      if (!doSgemm && !doDgemm && !doSspgemm && !doDspgemm &&
+          !doSspmm && !doDspmm &&
+          !doSgemv && !doDgemv && !doSspgemv && !doDspgemv) {
         std::cout << "ERROR - no implemented kernels in list" << std::endl;
       } else {
@@ -212,18 +258,16 @@ void getParameters(int argc, char** argv) {
           << "  -o  --output_dir             The CSV file output directory"
           << std::endl;
       std::cout << "  -i  --iterations I           Repeat each kernel I times "
-                   "(default: "
-                << iters << ")" << std::endl;
+                   "(default: " << iters << ")" << std::endl;
       std::cout << "  -s  --start_dimension S      First value of M, N, K is S "
-                   "(default: "
-                << startDim << ")" << std::endl;
+                   "(default: " << startDim << ")" << std::endl;
       std::cout << "  -d  --dimension_limit D      Max value of M, N, K is D "
-                   "(default: "
-                << upperLimit << ")" << std::endl;
+                   "(default: " << upperLimit << ")" << std::endl;
       std::cout << "  -k  --kernels <kernels>      Comma-separated list of "
-                   "kernels to be run.  Options are sgemm, dgemm, sp-sgemm, "
-                   "sp-dgemm (default: sgemm,dgemm,sp-gemm,sp-dgemm)" <<
-                   std::endl;
+                   "kernels to be run.  Options are sgemm, dgemm, sspgemm, "
+                   "dspgemm, sspmm, dspmm, sgemv, dgemv, sspgemv, dspgemv "
+                   "(default: `-k sgemm,dgemm,sspgemm,dspgemm,sspmm,dspmm,"
+                   "sgemv,dgemv,sspgemv,dspgemv`)" << std::endl;
       std::cout << std::endl;
     } else {

From 7819f6f6f1ea1f7849f274bc4b66f81d8d026ba2 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Wed, 8 Jan 2025 14:04:30 +0000
Subject: [PATCH 36/38] Moving spgemv into new format

 .idea/workspace.xml           |  46 ++--
 ArmPL/spgemv.hh               |  31 +--
 cuBLAS/spgemv.hh              |  90 ++++---
 include/doGemv.hh             |  12 +-
 include/doSpgemv.hh           | 429 +++++++++++++++++++++++++++++++++-
 include/kernels/CPU/spgemv.hh |  34 ++-
 include/kernels/CPU/spmm.hh   |   5 +-
 include/kernels/GPU/spgemv.hh |  10 +-
 include/kernels/spgemv.hh     | 135 ++++++++++-
 include/kernels/spmm.hh       |  46 +---
 include/utilities.hh          | 110 ++++++++-
 11 files changed, 772 insertions(+), 176 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 84d08df..3d4f373 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -15,30 +15,18 @@
   <component name="ChangeListManager">
-    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Getting rid of old oneMKL sparse file">
-      <change afterPath="$PROJECT_DIR$/include/doSpgemm.hh" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/include/doSpgemv.hh" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/include/doSpmm.hh" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/include/kernels/CPU/spgmm.hh" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/include/kernels/spgemm.hh" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/include/kernels/spgemv.hh" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/include/kernels/spmm.hh" afterDir="false" />
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Refactoring to make individual files relate to a single kernel">
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/AOCL/sp_gemm.hh" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spmm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spmm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spgemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/ArmPL/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/cuBLAS/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spgemv.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/GPU/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spmm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/GPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/main.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/main.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/src/" beforeDir="false" afterPath="$PROJECT_DIR$/src/" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/doSpgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpgemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spgemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spmm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/utilities.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/utilities.hh" afterDir="false" />
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -578,7 +566,15 @@
       <option name="project" value="LOCAL" />
-    <option name="localTasksCounter" value="48" />
+    <task id="LOCAL-00048" summary="Refactoring to make individual files relate to a single kernel">
+      <option name="closed" value="true" />
+      <created>1736268772766</created>
+      <option name="number" value="00048" />
+      <option name="presentableId" value="LOCAL-00048" />
+      <option name="project" value="LOCAL" />
+      <updated>1736268772766</updated>
+    </task>
+    <option name="localTasksCounter" value="49" />
     <servers />
   <component name="TypeScriptGeneratedFilesManager">
@@ -596,7 +592,6 @@
   <component name="VcsManagerConfiguration">
-    <MESSAGE value="cuSPARSE unified memory implementation" />
     <MESSAGE value="Now compiles" />
     <MESSAGE value="Now compiles with fewer runtime errors" />
     <MESSAGE value="Implementing other offload types - still some runtime errors" />
@@ -621,6 +616,7 @@
     <MESSAGE value="Updating to show sparsity" />
     <MESSAGE value="Beginning gemv ARMPL" />
     <MESSAGE value="Getting rid of old oneMKL sparse file" />
-    <option name="LAST_COMMIT_MESSAGE" value="Getting rid of old oneMKL sparse file" />
+    <MESSAGE value="Refactoring to make individual files relate to a single kernel" />
+    <option name="LAST_COMMIT_MESSAGE" value="Refactoring to make individual files relate to a single kernel" />
\ No newline at end of file
diff --git a/ArmPL/spgemv.hh b/ArmPL/spgemv.hh
index f39a764..5045062 100644
--- a/ArmPL/spgemv.hh
+++ b/ArmPL/spgemv.hh
@@ -8,22 +8,22 @@
 #include <algorithm>
-#include "../include/kernels/CPU/sp_gemv.hh"
+#include "../include/kernels/CPU/spgemv.hh"
 #include "../include/utilities.hh"
 namespace cpu {
 /** A class for GEMM CPU BLAS kernels. */
 template <typename T>
-class sp_gemv_cpu : public sp_gemv<T> {
+class spgemv_cpu : public spgemv<T> {
-  using sp_gemv<T>::sp_gemv;
-  using sp_gemv<T>::callConsume;
-  using sp_gemv<T>::m_;
-  using sp_gemv<T>::n_;
-  using sp_gemv<T>::A_;
-  using sp_gemv<T>::x_;
-  using sp_gemv<T>::y_;
-  using sp_gemv<T>::nnz_;
+  using spgemv<T>::spgemv;
+  using spgemv<T>::callConsume;
+  using spgemv<T>::m_;
+  using spgemv<T>::n_;
+  using spgemv<T>::A_;
+  using spgemv<T>::x_;
+  using spgemv<T>::y_;
+  using spgemv<T>::nnz_;
   /** Make call to the GEMM kernel. */
@@ -62,7 +62,7 @@ class sp_gemv_cpu : public sp_gemv<T> {
     } else {
       // Un-specialised class will not do any work - print error and exit.
-      std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported."
+      std::cout << "ERROR - Datatype for ArmPL CPU SPGEMM kernel not supported."
                 << std::endl;
@@ -156,7 +156,7 @@ class sp_gemv_cpu : public sp_gemv<T> {
   /** The constant value Beta. */
   const T beta = BETA;
-  void toCSR_armpl() {
+  void toSparseFormat() {
     n_armpl_ = n_;
     // ToDo -- check whether flags_ is correct!
     flags_ = 0;
@@ -168,7 +168,7 @@ class sp_gemv_cpu : public sp_gemv<T> {
     A_armpl_row_ptr_[0] = 0;
     int nnz_encountered = 0;
-    for (int row = 0; row < n_; row++) {
+    for (int row = 0; row < m_; row++) {
       A_armpl_row_ptr_[row + 1] = nnz_encountered;
       for (int col = 0; col < n_; col++) {
         if (A_[(row * n_) + col] != 0.0) {
@@ -183,7 +183,7 @@ class sp_gemv_cpu : public sp_gemv<T> {
 //      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
 //                nnz_, flags_);
       status_ = armpl_spmat_create_csr_s(&A_armpl_,
-                                         n_armpl_,
+                                         m_armpl_,
@@ -197,7 +197,7 @@ class sp_gemv_cpu : public sp_gemv<T> {
 //      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
 //                nnz_, flags_
       status_ = armpl_spmat_create_csr_d(&A_armpl_,
-                                         n_armpl_,
+                                         m_armpl_,
@@ -239,6 +239,7 @@ class sp_gemv_cpu : public sp_gemv<T> {
   armpl_int_t flags_;
   armpl_int_t n_armpl_;
+  armpl_int_t m_armpl;
   T* A_vals_;
   armpl_int_t* A_armpl_row_ptr_;
diff --git a/cuBLAS/spgemv.hh b/cuBLAS/spgemv.hh
index f35a63a..2076488 100644
--- a/cuBLAS/spgemv.hh
+++ b/cuBLAS/spgemv.hh
@@ -7,27 +7,27 @@
 #include <random>
 #include <iostream>
-#include "../include/kernels/GPU/sp_gemv.hh"
+#include "../include/kernels/GPU/spgemv.hh"
 #include "../include/utilities.hh"
 #include "common.hh"
 namespace gpu {
 /** A class for sparse GEMM GPU BLAS kernels. */
 template <typename T>
-class sp_gemv_gpu : public sp_gemv<T> {
+class spgemv_gpu : public spgemv<T> {
-  using sp_gemv<T>::sp_gemv;
-  using sp_gemv<T>::initInputMatrixVectorSparse;
-//  using sp_gemv<T>::toCSR_int;
-  using sp_gemv<T>::m_;
-  using sp_gemv<T>::n_;
-  using sp_gemv<T>::A_;
-  using sp_gemv<T>::x_;
-  using sp_gemv<T>::y_;
-  using sp_gemv<T>::offload_;
-  using sp_gemv<T>::sparsity_;
-  ~sp_gemv_gpu() {
+  using spgemv<T>::spgemv;
+  using spgemv<T>::initInputMatrixVector;
+  using spgemv<T>::nnz_;
+  using spgemv<T>::m_;
+  using spgemv<T>::n_;
+  using spgemv<T>::A_;
+  using spgemv<T>::x_;
+  using spgemv<T>::y_;
+  using spgemv<T>::offload_;
+  using spgemv<T>::sparsity_;
+  ~spgemv_gpu() {
     // ToDo -- destroy the handle
     // Destroy streams after use
@@ -45,14 +45,15 @@ class sp_gemv_gpu : public sp_gemv<T> {
    *  - Always:  Move data from host to device and device to host each iteration
    *  - Unified: Initialise data as unified memory; no data movement semantics
    *             required */
-  void initialise(gpuOffloadType offload, int n, float sparsity) override {
-    std::cout << std::endl << "##############################" << std::endl
-              << "\tCUSPARSE GEMV\t\tInitialising n = " << n << "\tOffload"
-              << " type = " <<
-              (((offload == gpuOffloadType::unified) ? "Unified" : (offload
-              == gpuOffloadType::always) ? "Always" : "Once"))
-              << std::endl
-              << "##############################" << std::endl;
+  void initialise(gpuOffloadType offload, int m, int n, float sparsity)
+  override {
+//    std::cout << std::endl << "##############################" << std::endl
+//              << "\tCUSPARSE GEMV\t\tInitialising n = " << n << "\tOffload"
+//              << " type = " <<
+//              (((offload == gpuOffloadType::unified) ? "Unified" : (offload
+//              == gpuOffloadType::always) ? "Always" : "Once"))
+//              << std::endl
+//              << "##############################" << std::endl;
     offload_ = offload;
     sparsity_ = sparsity;
@@ -83,6 +84,7 @@ class sp_gemv_gpu : public sp_gemv<T> {
       std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
+    m_ = m;
     n_ = n;
     // Initialise 3 streams to asynchronously move data between host and device
@@ -93,13 +95,11 @@ class sp_gemv_gpu : public sp_gemv<T> {
     std::cout << "\tcuda streams created" << std::endl;
-   // Work out the sizes of all the vectors
-    A_nnz_ = 1 + (uint64_t)(n_ * n_ * (1 - sparsity));
-    vals_size_ = sizeof(T) * A_nnz_;
-    cols_size_ = sizeof(int) * A_nnz_;
-    rows_size_ = sizeof(int) * (n_ + 1);
+    vals_size_ = sizeof(T) * nnz_;
+    cols_size_ = sizeof(int) * nnz_;
+    rows_size_ = sizeof(int) * (m_ + 1);
     x_size_ = sizeof(T) * n_;
-    y_size_ = sizeof(T) * n_;
+    y_size_ = sizeof(T) * m_;
     if (offload_ == gpuOffloadType::unified) {
       // Get device identifier
@@ -141,18 +141,14 @@ class sp_gemv_gpu : public sp_gemv<T> {
     // Initialise the matrices
     // Set initial values to 0
-    A_ = (T*)malloc(sizeof(T) * n_ * n_);
+    A_ = (T*)malloc(sizeof(T) * m_ * n_);
     std::cout << "\tA_ dense array made" << std::endl;
-    initInputMatrixVectorSparse();git branc
+    initInputMatrixVector();
     std::cout << "\tinputs made" << std::endl;
-    toCSR_int(A_, n_, n_, A_val_, A_col_, A_row_);
-    std::cout << "\tA_ moved to CSR" << std::endl;
 //    std::cout << "_____Matrix A_____" << std::endl;
 //    printDenseMatrix(A_, n_, n_);
 //    std::cout << std::endl << std::endl;
@@ -172,7 +168,7 @@ class sp_gemv_gpu : public sp_gemv<T> {
       case gpuOffloadType::always: {
         // Make matrix descriptor
-                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+                cusparseCreateCsr(&descrA_, m_, n_, nnz_, A_row_dev_,
                                   A_col_dev_, A_val_dev_, rType_, cType_,
                                   indType_, cudaDataType_));
         std::cout << "\tA_ description made" << std::endl;
@@ -180,7 +176,7 @@ class sp_gemv_gpu : public sp_gemv<T> {
         cusparseCheckError(cusparseCreateDnVec(&descrx_, n_, x_dev_,
         std::cout << "\tx_ description made" << std::endl;
-        cusparseCheckError(cusparseCreateDnVec(&descry_, n_, NULL,
+        cusparseCheckError(cusparseCreateDnVec(&descry_, m_, NULL,
         std::cout << "\ty_ description made" << std::endl;
@@ -204,7 +200,7 @@ class sp_gemv_gpu : public sp_gemv<T> {
         // Create matrix descriptor
-                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+                cusparseCreateCsr(&descrA_, m_, n_, nnz_, A_row_dev_,
                                   A_col_dev_, A_val_dev_, rType_, cType_,
                                   indType_, cudaDataType_));
         std::cout << "\tA_ description made" << std::endl;
@@ -212,7 +208,7 @@ class sp_gemv_gpu : public sp_gemv<T> {
         cusparseCheckError(cusparseCreateDnVec(&descrx_, n_, x_dev_,
         std::cout << "\tx_ description made" << std::endl;
-        cusparseCheckError(cusparseCreateDnVec(&descry_, n_, NULL,
+        cusparseCheckError(cusparseCreateDnVec(&descry_, m_, NULL,
         std::cout << "\ty_ description made" << std::endl;
@@ -508,16 +504,14 @@ class sp_gemv_gpu : public sp_gemv<T> {
-    void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index,
-             int* row_ptr) {
+    void toSparseFormat() {
       int nnz_encountered = 0;
-      for (int row = 0; row < n_row; row++) {
-        row_ptr[row] = nnz_encountered;
-        for (int col = 0; col < n_col; col++) {
-          if (dense[(row * n_) + col] != 0.0) {
-            col_index[nnz_encountered] = col;
-            vals[nnz_encountered] = dense[(row * n_) + col];
+      for (int row = 0; row < m_; row++) {
+        A_row_[row] = nnz_encountered;
+        for (int col = 0; col < n_; col++) {
+          if (A_[(row * n_) + col] != 0.0) {
+            A_col_[nnz_encountered] = col;
+            A_val_[nnz_encountered] = A_[(row * n_) + col];
@@ -606,7 +600,7 @@ class sp_gemv_gpu : public sp_gemv<T> {
 	T* A_val_dev_;
 	int *A_col_dev_, *A_row_dev_;
   /** Metadata */
-  uint64_t A_nnz_, vals_size_, cols_size_, rows_size_;
+  uint64_t vals_size_, cols_size_, rows_size_;
    * ################################
diff --git a/include/doGemv.hh b/include/doGemv.hh
index ebc9262..0068a1c 100644
--- a/include/doGemv.hh
+++ b/include/doGemv.hh
@@ -68,7 +68,7 @@ class doGemv {
         initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv");
     for (int dim = startDimention_; dim <= upperLimit_; dim++) {
       // M = dim, N = dim;
-      callDenseKernels(csvFile, dim, dim);
+      callKernels(csvFile, dim, dim);
     // Close file
@@ -93,7 +93,7 @@ class doGemv {
     int N = startDimention_;
     int M = 16 * N;
     while (M <= upperLimit_) {
-      callDenseKernels(csvFile, M, N);
+      callKernels(csvFile, M, N);
       M += 16;
@@ -119,7 +119,7 @@ class doGemv {
     if (upperLimit_ >= 32) {
       for (int dim = startDimention_; dim <= upperLimit_; dim++) {
         // M = dim, N = 32;
-        callDenseKernels(csvFile, dim, 32);
+        callKernels(csvFile, dim, 32);
     // Close file
@@ -144,7 +144,7 @@ class doGemv {
     M = startDimention_;
     N = 16 * M;
     while (N <= upperLimit_) {
-      callDenseKernels(csvFile, M, N);
+      callKernels(csvFile, M, N);
       N += 16;
@@ -170,7 +170,7 @@ class doGemv {
     if (upperLimit_ >= 32) {
       for (int dim = startDimention_; dim <= upperLimit_; dim++) {
         // M = 32, N = dim;
-        callDenseKernels(csvFile, 32, dim);
+        callKernels(csvFile, 32, dim);
     // Close file
@@ -185,7 +185,7 @@ class doGemv {
   /** Call the appropriate CPU and GPU GEMV kernels. */
-  void callDenseKernels(std::ofstream& csvFile, const int M, const int N) {
+  void callKernels(std::ofstream& csvFile, const int M, const int N) {
     const double probSize = calcKib(M, N);
     const uint64_t flops = calcFlops(M, N);
     std::string kernelName = getKernelName();
diff --git a/include/doSpgemv.hh b/include/doSpgemv.hh
index cf315e0..c2c6a3d 100644
--- a/include/doSpgemv.hh
+++ b/include/doSpgemv.hh
@@ -1,8 +1,425 @@
-// Created by Alexander Cockrean on 07/01/2025.
+#pragma once
+#include <sstream>
+#include <type_traits>
+#include "helpers.hh"
+#include "tablePrinter.hh"
+#include "utilities.hh"
+#if defined CPU_ARMPL
+#include "../ArmPL/spgemv.hh"
+#elif defined CPU_ONEMKL
+// Todo #include "../oneMKL/CPU/spgemv.hh"
+#elif defined CPU_AOCL
+// Todo #include "../AOCL/spgemv.hh"
+#elif defined CPU_NVPL
+// Todo #include "../NVPL/spgemv.hh"
+#elif defined CPU_OPENBLAS
+// Todo #include "../OpenBLAS/spgemv.hh"
+#if defined GPU_CUBLAS
+#include "../cuBLAS/spgemv.hh"
+#elif defined GPU_ONEMKL
+// Todo #include "../oneMKL/GPU/spgemv.hh"
+#elif defined GPU_ROCBLAS
+// Todo #include "../rocBLAS/spgemv.hh"
+/** `T` represents the type of kernel that will be run - i.e. T=float is for
+ *      SSPGEMV. */
+template <typename T>
+class doSpgemv {
+    doSpgemv(const std::string csvDir, const int iters, const int startDim,
+           const int upperLimit, const bool cpuEnabled = true,
+           const bool gpuEnabled = true)
+            : CSV_DIR(csvDir),
+              iterations_(iters),
+              startDimention_(startDim),
+              upperLimit_(upperLimit),
+              doCPU_(cpuEnabled),
+              doGPU_(gpuEnabled)
+    ,
+        cpu_(iterations_)
+    ,
+        gpu_(iterations_)
+    {
+      static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) &&
+                    "ERROR - doSpgemv can only be constructed using one of the "
+                    "following types: [float, double].");
+    }
+    /** Run all problem types and write data to CSV files. */
+    void collectData() {
+      // Square Problem Sizes...
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      std::ofstream csvFile =
+              initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv");
+      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+        // M = dim, N = dim;
+        callKernels(csvFile, dim, dim);
+      }
+      // Close file
+      csvFile.close();
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Square x Vector (M=N)");
+      }
+    }
+    /** Call the appropriate CPU and GPU SPGEMV kernels. */
+    void callKernels(std::ofstream& csvFile, const int M, const int N) {
+      const double probSize = calcKib(M, N);
+      const uint64_t flops = calcFlops(M, N);
+      std::string kernelName = getKernelName();
+      time_checksum_gflop cpuResult;
+      time_checksum_gflop gpuResult_once;
+      time_checksum_gflop gpuResult_always;
+      time_checksum_gflop gpuResult_unified;
+// Perform CPU kernel
+    if (doCPU_) {
+      cpu_.initialise(M, N);
+      cpuResult = cpu_.compute();
+      cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
+      // Write result to CSV file
+      writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, 0.0,
+                     iterations_, cpuResult.runtime, cpuResult.gflops);
+    }
+// Perform the GPU kernels
+    if (doGPU_) {
+      // - ONCE : Offload to/from GPU once before all iterations and once
+      // after
+      gpu_.initialise(gpuOffloadType::once, M, N);
+      gpuResult_once = gpu_.compute();
+      gpuResult_once.gflops =
+          calcGflops(flops, iterations_, gpuResult_once.runtime);
+      // - ALWAYS: Offload to/from GPU every iteration
+      gpu_.initialise(gpuOffloadType::always, M, N);
+      gpuResult_always = gpu_.compute();
+      gpuResult_always.gflops =
+          calcGflops(flops, iterations_, gpuResult_always.runtime);
+      // - UNIFIED : data passed from host to device (and device to host) as
+      //             needed
+      gpu_.initialise(gpuOffloadType::unified, M, N);
+      gpuResult_unified = gpu_.compute();
+      gpuResult_unified.gflops =
+          calcGflops(flops, iterations_, gpuResult_unified.runtime);
+      // Write results to CSV file
+      writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, 0, probSize,
+                     0.0, iterations_, gpuResult_once.runtime,
+                     gpuResult_once.gflops);
+      writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, 0,
+                     probSize, 0.0, iterations_, gpuResult_always.runtime,
+                     gpuResult_always.gflops);
+      writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, 0, probSize,
+                     0.0, iterations_, gpuResult_unified.runtime,
+                     gpuResult_unified.gflops);
+    }
+    if (doCPU_ && doGPU_) {
+      // Make sure all checksums match if CPU and GPU kernels are run.
+      //  - The majority of BLAS Libraries guarantee the same result if a
+      //  function
+      //    is called multiple times. Given all input matrices are identical for
+      //    each GPU offload type, we need only to compare the CPU and GPU
+      //    checksums.
+      checkChecksums(cpuResult, gpuResult_once, gpuResult_always,
+                     gpuResult_unified, M, N);
+      // Check if offload structs should be reset
+      checkOffloadStructReset(cpuResult, gpuResult_once, gpuResult_always,
+                              gpuResult_unified);
+      // Check if offload threshold has been achieved for each GPU offload type.
+      updateOffloadStructs(cpuResult, gpuResult_once, gpuResult_always,
+                           gpuResult_unified, M, N, probSize);
+      // Update previous results
+      prev_gpuResult_once = gpuResult_once;
+      prev_gpuResult_always = gpuResult_always;
+      prev_gpuResult_unified = gpuResult_unified;
+    }
+    }
+    /** Todo -- find a sensible way to do this for sparse */
+    void checkChecksums(time_checksum_gflop cpuResult,
+                        time_checksum_gflop gpuResult_once,
+                        time_checksum_gflop gpuResult_always,
+                        time_checksum_gflop gpuResult_unified, const int M,
+                        const int N) {
+      // Ensure that each checksum difference is less than 0.1%
+//      double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum);
+//      if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) *
+//            hundredOverChecksum)) > 0.1 &&
+//          ((std::fabs(cpuResult.checksum - gpuResult_always.checksum) *
+//            hundredOverChecksum)) > 0.1 &&
+//          ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) *
+//            hundredOverChecksum)) > 0.1) {
+//        std::cerr << "ERROR - " << getKernelName()
+//                  << " kernel checksums do not match:\n\tInput "
+//                     "dimensions: M="
+//                  << M << ", N=" << N << std::endl;
+//        std::cerr << std::setprecision(10)
+//                  << "\tCPU Checksum = " << cpuResult.checksum << std::endl;
+//        std::cerr << std::setprecision(10)
+//                  << "\tGPU (Once) Checksum = " << gpuResult_once.checksum
+//                  << std::endl;
+//        std::cerr << std::setprecision(10)
+//                  << "\tGPU (Always) Checksum = " << gpuResult_always.checksum
+//                  << std::endl;
+//        std::cerr << std::setprecision(10)
+//                  << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum
+//                  << std::endl;
+//        exit(1);
+//      }
+    }
+    /** Check whether the offload structures need to be reset, and do so if
+     * required.
+     *   - If CPU.gflops >= GPU.gflops for last two problem sizes, then reset
+     * offload structures as GPU may not necessarily have reached the offload
+     * threshold.
+     */
+    void checkOffloadStructReset(time_checksum_gflop cpuResult,
+                                 time_checksum_gflop gpuResult_once,
+                                 time_checksum_gflop gpuResult_always,
+                                 time_checksum_gflop gpuResult_unified) {
+      if ((cpuGpu_once_.M != 0) && (cpuResult.gflops >= gpuResult_once.gflops) &&
+          (cpuResult.gflops >= prev_gpuResult_once.gflops)) {
+        cpuGpu_once_.cpuGflops = 0.0;
+        cpuGpu_once_.gpuGflops = 0.0;
+        cpuGpu_once_.probSize_kib = 0.0;
+        cpuGpu_once_.M = 0;
+        cpuGpu_once_.N = 0;
+      }
+      if ((cpuGpu_always_.M != 0) &&
+          (cpuResult.gflops >= gpuResult_always.gflops) &&
+          (cpuResult.gflops >= prev_gpuResult_always.gflops)) {
+        cpuGpu_always_.cpuGflops = 0.0;
+        cpuGpu_always_.gpuGflops = 0.0;
+        cpuGpu_always_.probSize_kib = 0.0;
+        cpuGpu_always_.M = 0;
+        cpuGpu_always_.N = 0;
+      }
+      if ((cpuGpu_unified_.M != 0) &&
+          (cpuResult.gflops >= gpuResult_unified.gflops) &&
+          (cpuResult.gflops >= prev_gpuResult_unified.gflops)) {
+        cpuGpu_unified_.cpuGflops = 0.0;
+        cpuGpu_unified_.gpuGflops = 0.0;
+        cpuGpu_unified_.probSize_kib = 0.0;
+        cpuGpu_unified_.M = 0;
+        cpuGpu_unified_.N = 0;
+      }
+    }
+    /** Update the offload threshold structs if GPU.gflops > CPU.gflops. */
+    void updateOffloadStructs(time_checksum_gflop cpuResult,
+                              time_checksum_gflop gpuResult_once,
+                              time_checksum_gflop gpuResult_always,
+                              time_checksum_gflop gpuResult_unified, const int M,
+                              const int N, const double probSize) {
+      if ((cpuGpu_once_.M == 0) && cpuResult.gflops < gpuResult_once.gflops) {
+        cpuGpu_once_.cpuGflops = cpuResult.gflops;
+        cpuGpu_once_.gpuGflops = gpuResult_once.gflops;
+        cpuGpu_once_.probSize_kib = probSize;
+        cpuGpu_once_.M = M;
+        cpuGpu_once_.N = N;
+      }
+      if ((cpuGpu_always_.M == 0) && cpuResult.gflops < gpuResult_always.gflops) {
+        cpuGpu_always_.cpuGflops = cpuResult.gflops;
+        cpuGpu_always_.gpuGflops = gpuResult_always.gflops;
+        cpuGpu_always_.probSize_kib = probSize;
+        cpuGpu_always_.M = M;
+        cpuGpu_always_.N = N;
+      }
+      if ((cpuGpu_unified_.M == 0) &&
+          cpuResult.gflops < gpuResult_unified.gflops) {
+        cpuGpu_unified_.cpuGflops = cpuResult.gflops;
+        cpuGpu_unified_.gpuGflops = gpuResult_unified.gflops;
+        cpuGpu_unified_.probSize_kib = probSize;
+        cpuGpu_unified_.M = M;
+        cpuGpu_unified_.N = N;
+      }
+    }
+    /** Todo -- work out how this can be determined for a sparse problem with
+     * an unknown algorithm.
+     * A function for calculating FLOPs performed by a GEMV.
+     * y = alpha*Ax + beta*y */
+    constexpr uint64_t calcFlops(const int M, const int N) const {
+      // A * x = 2*M*N (FMA)
+      // alpha * Ax = M (multiplication)
+      // beta * y = M (multiplication)
+      // Ax + y = M (addition)
+      // = 2MN + M + M + M
+      // If beta==0; = 2MN + M ------- alpha*Ax Always done
+      // Else; = 2MN + 3M
+      uint64_t scalar = (BETA != 0) ? 3 : 1;
+      return (2 * (uint64_t)M * (uint64_t)N) + (scalar * (uint64_t)M);
+    }
+    /** A function for calculating the total GEMV problem size in KiB. */
+    constexpr double calcKib(const int M, const int N) const {
+      uint64_t M_ = (uint64_t)M, N_ = (uint64_t)N;
+      uint64_t probSize = (M_ * N_) + N_ + M_;
+      return ((double)(probSize * (sizeof(T))) / 1024);
+    }
+    /** Get the name of the kernel being run. */
+    std::string getKernelName() const {
+      switch (sizeof(T)) {
+        case 4:
+          return "sgemv";
+        case 8:
+          return "dgemv";
+        default:
+          return "unknown";
+      }
+    }
+    /** Print to stdout the offload thresholds. */
+    void printOffloadThreshold(std::string problemName) const {
+      std::vector<std::string> header = {
+              "Device", "M", "N", "Total Prob. Size (KiB)", "GFLOP/s", "CPU GFLOP/s"};
+      std::vector<std::vector<std::string>> rows;
+      // Initialise GPU_Once row
+      std::stringstream probSize_o;
+      std::stringstream gpuGflops_o;
+      std::stringstream cpuGflops_o;
+      probSize_o << std::fixed << std::setprecision(2)
+                 << cpuGpu_once_.probSize_kib;
+      gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops;
+      cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops;
+      if (cpuGpu_once_.M == 0) {
+        // No offload threshold found
+        rows.push_back({"GPU (Offload Once)", std::to_string(0),
+                        std::to_string(0), probSize_o.str(), "N/A", "N/A"});
+      } else {
+        rows.push_back({"GPU (Offload Once)", std::to_string(cpuGpu_once_.M),
+                        std::to_string(cpuGpu_once_.N), probSize_o.str(),
+                        gpuGflops_o.str(), cpuGflops_o.str()});
+      }
+      // Initialise GPU_always row
+      std::stringstream probSize_a;
+      std::stringstream gpuGflops_a;
+      std::stringstream cpuGflops_a;
+      probSize_a << std::fixed << std::setprecision(2)
+                 << cpuGpu_always_.probSize_kib;
+      gpuGflops_a << std::fixed << std::setprecision(2)
+                  << cpuGpu_always_.gpuGflops;
+      cpuGflops_a << std::fixed << std::setprecision(2)
+                  << cpuGpu_always_.cpuGflops;
+      if (cpuGpu_always_.M == 0) {
+        // No offload threshold found
+        rows.push_back({"GPU (Offload Always)", std::to_string(0),
+                        std::to_string(0), probSize_a.str(), "N/A", "N/A"});
+      } else {
+        rows.push_back({"GPU (Offload Always)", std::to_string(cpuGpu_always_.M),
+                        std::to_string(cpuGpu_always_.N), probSize_a.str(),
+                        gpuGflops_a.str(), cpuGflops_a.str()});
+      }
+      // Initialise GPU_unified row
+      std::stringstream probSize_u;
+      std::stringstream gpuGflops_u;
+      std::stringstream cpuGflops_u;
+      probSize_u << std::fixed << std::setprecision(2)
+                 << cpuGpu_unified_.probSize_kib;
+      gpuGflops_u << std::fixed << std::setprecision(2)
+                  << cpuGpu_unified_.gpuGflops;
+      cpuGflops_u << std::fixed << std::setprecision(2)
+                  << cpuGpu_unified_.cpuGflops;
+      if (cpuGpu_unified_.M == 0) {
+        // No offload threshold found
+        rows.push_back({"GPU (Unified Memory)", std::to_string(0),
+                        std::to_string(0), probSize_u.str(), "N/A", "N/A"});
+      } else {
+        rows.push_back({"GPU (Unified Memory)", std::to_string(cpuGpu_unified_.M),
+                        std::to_string(cpuGpu_unified_.N), probSize_u.str(),
+                        gpuGflops_u.str(), cpuGflops_u.str()});
+      }
+      // Print table
+      tablePrinter tPrinter(
+              problemName + " Problem Domian GPU Offload Thresholds:", header, rows);
+      tPrinter.print(1);
+    }
+    /** The output directory where CSV files should be saved to. */
+    const std::string CSV_DIR;
+    /** The number of iterations to perform per problem size. */
+    const int iterations_;
+    /** The value of the first problem size dimension run. */
+    const int startDimention_;
+    /** The maximum value of the largest problem size dimension. */
+    const int upperLimit_;
+    /** Whether the CPU kernels should be run. */
+    const bool doCPU_ = true;
+    /** Whether the GPU kernels should be run. */
+    const bool doGPU_ = true;
+    /** The GEMV CPU kernel. */
+  cpu::gemv_cpu<T> cpu_;
+    /** The GEMV GPU kernel. */
+  gpu::gemv_gpu<T> gpu_;
+    /** The point at which offloading to GPU (offload once) becomes worthwhile. */
+    cpuGpu_offloadThreshold cpuGpu_once_;
+    /** The point at which offloading to GPU (offload always) becomes worthwhile.
+     */
+    cpuGpu_offloadThreshold cpuGpu_always_;
+    /** The point at which offloading to GPU (unified memory) becomes worthwhile.
+     */
+    cpuGpu_offloadThreshold cpuGpu_unified_;
+    /** The previous problem size's GPU (offload once) performance results. */
+    time_checksum_gflop prev_gpuResult_once;
+    /** The previous problem size's GPU (offload always) performance results. */
+    time_checksum_gflop prev_gpuResult_always;
+    /** The previous problem size's GPU (unified memory) performance results. */
+    time_checksum_gflop prev_gpuResult_unified;
\ No newline at end of file
diff --git a/include/kernels/CPU/spgemv.hh b/include/kernels/CPU/spgemv.hh
index 28b0caf..84722c2 100644
--- a/include/kernels/CPU/spgemv.hh
+++ b/include/kernels/CPU/spgemv.hh
@@ -1,6 +1,6 @@
 #pragma once
-#include "../gemv.hh"
+#include "../spgemv.hh"
 #include <random>
 #include <memory>
@@ -9,44 +9,42 @@ namespace cpu {
 /** An abstract class for GEMV BLAS kernels. */
     template <typename T>
-    class sp_gemv : public ::gemv<T> {
+    class spgemv : public ::spgemv<T> {
-        using ::gemv<T>::gemv;
-        using ::gemv<T>::initInputMatrixVectorSparse;
-        using ::gemv<T>::m_;
-        using ::gemv<T>::n_;
-        using ::gemv<T>::A_;
-        using ::gemv<T>::x_;
-        using ::gemv<T>::y_;
-        using ::gemv<T>::sparsity_;
+        using ::spgemv<T>::spgemv;
+        using ::spgemv<T>::initInputMatrixVector;
+        using ::spgemv<T>::m_;
+        using ::spgemv<T>::n_;
+        using ::spgemv<T>::A_;
+        using ::spgemv<T>::x_;
+        using ::spgemv<T>::y_;
+        using ::spgemv<T>::sparsity_;
+        using ::spgemv<T>::nnz_;
         /** Initialise the required data structures. */
-        void initialise(int n, double sparsity) {
-          m_ = n;
+        void initialise(int m, int n, double sparsity) {
+          m_ = m;
           n_ = n;
           sparsity_ = sparsity;
           // Note that the below should be the same as the edges calculation
           // used in the initInputMatricesSparse function.  If changed here,
           // change there
-          nnz_ = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity_));
+          nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_));
           A_ = (T*)malloc(sizeof(T) * m_ * n_);
           x_ = (T*)malloc(sizeof(T) * n_);
           y_ = (T*)malloc(sizeof(T) * m_);
           // Initialise the matrix and vectors
-          initInputMatrixVectorSparse();
+          initInputMatrixVector();
-    protected:
-        uint64_t nnz_;
         /** Do any necessary cleanup (free pointers, close library handles, etc.)
          * after Kernel has been called. */
-        void postCallKernelCleanup() override {
+        void postCallKernelCleanup() {
diff --git a/include/kernels/CPU/spmm.hh b/include/kernels/CPU/spmm.hh
index 7d19f5d..d90f48b 100644
--- a/include/kernels/CPU/spmm.hh
+++ b/include/kernels/CPU/spmm.hh
@@ -18,6 +18,7 @@ public:
   using ::spmm<T>::iterations_;
   using ::spmm<T>::nnzA_;
   using ::spmm<T>::nnzB_;
+  using ::spmm<T>::sparsity_;
   using ::spmm<T>::m_;
   using ::spmm<T>::n_;
   using ::spmm<T>::k_;
@@ -43,7 +44,7 @@ public:
     B_ = (T*)malloc(sizeof(T) * k_ * n_);
     C_ = (T*)calloc(sizeof(T) * m_ * n_);
-    initInputMatrices(sparsity_);
+    initInputMatrices();
@@ -54,7 +55,5 @@ private:
-  double sparsity_;
 }  // namespace cpu
diff --git a/include/kernels/GPU/spgemv.hh b/include/kernels/GPU/spgemv.hh
index 75fd126..0a93c77 100644
--- a/include/kernels/GPU/spgemv.hh
+++ b/include/kernels/GPU/spgemv.hh
@@ -1,14 +1,14 @@
 #pragma once
-#include "../gemv.hh"
+#include "../spgemv.hh"
 namespace gpu {
 /** An abstract class for GEMV BLAS kernels. */
     template <typename T>
-    class sp_gemv : public ::gemv<T> {
+    class spgemv : public ::spgemv<T> {
-        using ::gemv<T>::gemv;
+        using ::spgemv<T>::spgemv;
         /** Initialise the required data structures.
          * `offload` refers to the data offload type:
@@ -17,8 +17,8 @@ namespace gpu {
          *  - Always:  Move data from host to device and device to host each iteration
          *  - Unified: Initialise data as unified memory; no data movement semantics
          *             required */
-        virtual void initialise(gpuOffloadType offload, int n, float sparsity)
-        = 0;
+        virtual void initialise(gpuOffloadType offload, int m, int n,
+                                float sparsity) = 0;
         /** Whether data should be offloaded to/from the GPU each iteration, or just
diff --git a/include/kernels/spgemv.hh b/include/kernels/spgemv.hh
index 9e7d953..297b406 100644
--- a/include/kernels/spgemv.hh
+++ b/include/kernels/spgemv.hh
@@ -1,8 +1,131 @@
-// Created by Alexander Cockrean on 07/01/2025.
+#pragma once
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <limits>
+#include <random>
+#include "../utilities.hh"
+/** A generic abstract class defining the operation of timing an SPGEMV BLAS
+ * kernel for n iterations. */
+template <typename T>
+class spgemv {
+    /** Construct the kernel wrapper; `iters` is stored in iterations_, the
+     * number of kernel calls made by compute(). */
+    spgemv(const int iters) : iterations_(iters) {}
+    /** Time `iterations_` calls of the SPGEMV kernel.
+     * The timed region covers preLoopRequirements(), every callSpgemv() and
+     * postLoopRequirements().  The checksum is taken and
+     * postCallKernelCleanup() is run after the timer has stopped.
+     * Returns {elapsed seconds, checksum, 0.0}. */
+    time_checksum_gflop compute() {
+      using clock = std::chrono::high_resolution_clock;
+      // Timed region begins
+      const std::chrono::time_point<clock> begin = clock::now();
+      preLoopRequirements();
+      for (int iter = 0; iter < iterations_; ++iter) {
+        callSpgemv();
+      }
+      postLoopRequirements();
+      // Timed region ends
+      const std::chrono::time_point<clock> end = clock::now();
+      // Elapsed wall time expressed in seconds
+      const std::chrono::duration<double> elapsed = end - begin;
+      const double checksum = calcChecksum();
+      postCallKernelCleanup();
+      return {elapsed.count(), checksum, 0.0};
+    }
+    int64_t nnz_ = 0;
+    /** Perform any required steps before calling the SPGEMV kernel that should
+     * be timed. */
+    virtual void preLoopRequirements() = 0;
+    /** Perform the SPGEMV kernel. */
+    virtual void callSpgemv() = 0;
+    /** Perform any required steps after calling the SPGEMV kernel that should
+     * be timed. */
+    virtual void postLoopRequirements() = 0;
+    /** Do any necessary cleanup (free pointers, close library handles, etc.)
+     * after Kernel has been called. */
+    virtual void postCallKernelCleanup() = 0;
+    /** Calculate a checksum from the result vector y.
+     * Returns the sum of the first and last elements of y_, or 0.0 when the
+     * vector is empty (m_ <= 0). */
+    // Todo -- work out how to sensibly do this for sparse
+    constexpr double calcChecksum() {
+      // An empty result vector has no first or last element to read
+      if (m_ <= 0) {
+        return 0.0;
+      }
+      // Checksum is the first element plus the last element of the output
+      // vector (not the max and min)
+      return ((double)y_[0] + (double)y_[m_ - 1]);
+    }
+    /** Fill the dense matrix A_ with an R-MAT generated sparsity pattern,
+     * fill the input vector x_ with random values, zero the output vector y_
+     * and finally call toSparseFormat().
+     * A_, x_ and y_ are expected to be allocated by the caller with m_ * n_,
+     * n_ and m_ elements respectively. */
+    void initInputMatrixVector() {
+      // Widen before multiplying so large dimensions cannot overflow `int`
+      const size_t numElements =
+              static_cast<size_t>(m_) * static_cast<size_t>(n_);
+      // Initialise the matrix to all zeros
+      for (size_t i = 0; i < numElements; i++) {
+        A_[i] = 0.0;
+      }
+      // Random number generator objects for use in descent
+      std::default_random_engine gen;
+      gen.seed(std::chrono::system_clock::now()
+                       .time_since_epoch().count());
+      std::uniform_real_distribution<double> dist(0.0, 1.0);
+      // Never ask for more non-zeros than there are cells, otherwise the
+      // retry loop below could never find a free cell and would not terminate
+      const size_t requested = (nnz_ > 0) ? static_cast<size_t>(nnz_) : 0;
+      const size_t numEdges = std::min(requested, numElements);
+      // Using a=0.45 and b=c=0.22 as default probabilities.
+      // The second argument of rMat is the row stride, i.e. the number of
+      // columns (n_): x runs over [0, n_ - 1] and y over [0, m_ - 1].
+      for (size_t i = 0; i < numEdges; i++) {
+        while (!rMat(A_, n_, 0, n_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
+                     false)) {}
+      }
+      // Initialise the input and output vectors
+      for (int col = 0; col < n_; col++) {
+        x_[col] = (T)((double)(rand() % 100) / 3.0);
+      }
+      for (int row = 0; row < m_; row++) {
+        y_[row] = (T)0.0;
+      }
+      toSparseFormat();
+    }
+    /** Move the starting matrix into the sparse representation used by the
+     * given library */
+    virtual void toSparseFormat() = 0;
+    /** Pass A_, x_ and y_ to the extern consume() function. */
+    void callConsume() {
+      consume(static_cast<void*>(A_), static_cast<void*>(x_),
+              static_cast<void*>(y_));
+    }
+    /** The number of iterations to perform per problem size. */
+    const int iterations_;
+    /** Matrix dimension M. */
+    int m_ = 0;
+    /** Matrix / vector dimension N. */
+    int n_ = 0;
+    /** Input matrix A. */
+    T* A_;
+    /** Input vector x. */
+    T* x_;
+    /** Output vector y. */
+    T* y_;
+    /** The distance between two vector elements. */
+    const int vecIncrement_ = 1;
+    double sparsity_ = 0.0;
diff --git a/include/kernels/spmm.hh b/include/kernels/spmm.hh
index 37de9cf..9d45f56 100644
--- a/include/kernels/spmm.hh
+++ b/include/kernels/spmm.hh
@@ -4,7 +4,6 @@
 #include <chrono>
 #include <cmath>
 #include <limits>
-#include <random>
 #include <iostream>
 #include "../utilities.hh"
@@ -94,7 +93,7 @@ protected:
                      false)) {}
-      toSparseFormat()
+      toSparseFormat();
     /** Move matrices into the sparse representation of for the given library */
@@ -103,47 +102,6 @@ protected:
     /** Call the external consume() function on the matrices */
     void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }/** Recursive function to populate sparse matrices */
-    // On first iteration, n should be x2 + 1
-    bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b,
-              float c, std::default_random_engine* gen,
-              std::uniform_real_distribution<double> dist, bool bin) {
-      // If a 1x1 submatrix, then add an edge and return out
-      if (x1 >= x2 && y1 >= y2) {
-        // Needed to avoid overflow segfaults with large problem sizes
-        uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1);
-        if (abs(M[index]) > 0.1) {
-          return false;
-        } else {
-          // Add 1.0 if this is a binary graph, and a random real number otherwise
-          M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0);
-          return true;
-        }
-      } else {
-        // Divide up the matrix
-        int xMidPoint = x1 + floor((x2 - x1) / 2);
-        int yMidPoint = y1 + floor((y2 - y1) / 2);
-        // Work out which quarter to recurse into
-        // There are some ugly ternary operators here to avoid going out of bounds in the edge case
-        // that we are already at 1 width or 1 height
-        float randomNum = dist(*gen);
-        if (randomNum < a) {
-          return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
-                      a, b, c, gen, dist, bin);
-        } else if (randomNum < (a + b)) {
-          return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
-                      a, b, c, gen, dist, bin);
-        } else if (randomNum < (a + b + c)) {
-          return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
-                      a, b, c, gen, dist, bin);
-        } else {
-          return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
-                      ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, a,
-                      b, c, gen, dist, bin);
-        }
-      }
-    }
     /** The number of iterations to perform per problem size. */
     const int iterations_;
@@ -165,4 +123,6 @@ protected:
     /** Dense representation of output matrix C. */
     T* C_;
+    double sparsity_;
\ No newline at end of file
diff --git a/include/utilities.hh b/include/utilities.hh
index ac0aeb0..675ac2c 100644
--- a/include/utilities.hh
+++ b/include/utilities.hh
@@ -1,5 +1,7 @@
 #pragma once
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <random>
 // Define CPU related macros
 #if defined CPU_ARMPL
 #define CPU_LIB_NAME "Arm Performance Libraries"
@@ -76,4 +78,110 @@ struct cpuGpu_offloadThreshold {
 // performed.
 extern "C" {
 int consume(void* a, void* b, void* c);
\ No newline at end of file
+/**
+ * RMAT is a recursive function used to generate sparse matrices.  It is
+ * needed for both single and double precision so I've simply overloaded this
+ * function to have M as both float and double types.  Ugly, but works for
+ * now.
+ * Todo -- Consider different approach if other data types are supported in the
+ * future.
+ */
+/**
+ * @param M input matrix
+ * @param n number of columns in the full matrix (i.e. full range of the x axis)
+ * @param x1 beginning x coordinate of the submatrix
+ * @param x2 ending x coordinate of the submatrix
+ * @param y1 starting y coordinate of the submatrix
+ * @param y2 ending y coordinate of the submatrix
+ * @param a probability of tile a being chosen
+ * @param b probability of tile b being chosen
+ * @param c probability of tile c being chosen
+ * @param gen random number generator
+ * @param dist random number distribution
+ * @param bin bool to decide whether values added are binary or float/double
+ * @return true if a new value was written into M, false if the selected cell
+ *         already held a value
+ */
+namespace rmat_detail {
+/**
+ * Shared R-MAT implementation for every element type.  Repeatedly narrows the
+ * submatrix [x1, x2] x [y1, y2] to one of its four quadrants (chosen by
+ * comparing a random draw against a, a + b and a + b + c) until a single cell
+ * is left, then tries to place a value in that cell.
+ *
+ * @param M    dense row-major matrix to write into
+ * @param n    number of columns in the full matrix (row stride of M)
+ * @param x1   first column of the submatrix
+ * @param x2   last column of the submatrix
+ * @param y1   first row of the submatrix
+ * @param y2   last row of the submatrix
+ * @param a    probability of the low-x / low-y quadrant
+ * @param b    probability of the high-x / low-y quadrant
+ * @param c    probability of the low-x / high-y quadrant
+ * @param gen  random number generator
+ * @param dist random number distribution the quadrant draw is taken from
+ * @param bin  write 1.0 when true, a random real number otherwise
+ * @return true if a value was written, false if the chosen cell already held
+ *         a value with magnitude greater than 0.1
+ */
+template <typename T>
+inline bool rMatImpl(T* M, int n, int x1, int x2, int y1, int y2, float a,
+                     float b, float c, std::default_random_engine* gen,
+                     std::uniform_real_distribution<double>& dist, bool bin) {
+  // Keep halving until a 1x1 submatrix is left
+  while (x1 < x2 || y1 < y2) {
+    // Divide up the matrix
+    const int xMidPoint = x1 + (x2 - x1) / 2;
+    const int yMidPoint = y1 + (y2 - y1) / 2;
+    // If a dimension is already one cell wide, stay on the midpoint rather
+    // than stepping past the end of the submatrix
+    const int xUpper = (xMidPoint < x2) ? xMidPoint + 1 : xMidPoint;
+    const int yUpper = (yMidPoint < y2) ? yMidPoint + 1 : yMidPoint;
+    // Work out which quarter to descend into
+    const float randomNum = dist(*gen);
+    if (randomNum < a) {
+      x2 = xMidPoint;
+      y2 = yMidPoint;
+    } else if (randomNum < (a + b)) {
+      x1 = xUpper;
+      y2 = yMidPoint;
+    } else if (randomNum < (a + b + c)) {
+      x2 = xMidPoint;
+      y1 = yUpper;
+    } else {
+      x1 = xUpper;
+      y1 = yUpper;
+    }
+  }
+  // 64-bit arithmetic is needed to avoid overflow segfaults with large
+  // problem sizes
+  const uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1);
+  // std::fabs keeps the test in floating point; an unqualified abs() may bind
+  // to the integer overload and truncate values in (-1, 1) to zero
+  if (std::fabs(M[index]) > 0.1) {
+    return false;
+  }
+  // Add 1.0 if this is a binary graph, and a random real number otherwise
+  M[index] = static_cast<T>((bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0));
+  return true;
+}
+}  // namespace rmat_detail
+
+/** Single precision R-MAT edge insertion; see rmat_detail::rMatImpl.
+ * Declared inline because it is defined in a header. */
+inline bool rMat(float* M, int n, int x1, int x2, int y1, int y2, float a,
+                 float b, float c, std::default_random_engine* gen,
+                 std::uniform_real_distribution<double> dist, bool bin) {
+  return rmat_detail::rMatImpl(M, n, x1, x2, y1, y2, a, b, c, gen, dist, bin);
+}
+
+/** Double precision R-MAT edge insertion; see rmat_detail::rMatImpl.
+ * Declared inline because it is defined in a header. */
+inline bool rMat(double* M, int n, int x1, int x2, int y1, int y2, float a,
+                 float b, float c, std::default_random_engine* gen,
+                 std::uniform_real_distribution<double> dist, bool bin) {
+  return rmat_detail::rMatImpl(M, n, x1, x2, y1, y2, a, b, c, gen, dist, bin);
+}

From d7ad2b7639095e5bfa2e7f4985be5aa22b7112e7 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Thu, 9 Jan 2025 15:44:59 +0000
Subject: [PATCH 37/38] Finishing off armpl and cusparse kernels

 .idea/workspace.xml           |  33 +-
 ArmPL/spgemm.hh               | 417 +++++++++++++++++++++
 ArmPL/spmm.hh                 |   7 +-
 cuBLAS/spgemm.hh              | 323 +++++++++++++++++
 cuBLAS/spmm.hh                |   8 +-
 include/doSpgemm.hh           | 661 +++++++++++++++++++++++++++++++++-
 include/doSpmm.hh             |  10 +-
 include/kernels/CPU/spgemm.hh |  56 +++
 include/kernels/CPU/spgmm.hh  |   8 -
 include/kernels/CPU/spmm.hh   |   3 +-
 include/kernels/GPU/spgemm.hh |  32 +-
 include/kernels/spgemm.hh     | 134 ++++++-
 include/kernels/spmm.hh       |   3 +
 13 files changed, 1641 insertions(+), 54 deletions(-)
 create mode 100644 ArmPL/spgemm.hh
 create mode 100644 cuBLAS/spgemm.hh
 create mode 100644 include/kernels/CPU/spgemm.hh
 delete mode 100644 include/kernels/CPU/spgmm.hh

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 3d4f373..8556bf2 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -15,18 +15,19 @@
   <component name="ChangeListManager">
-    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Refactoring to make individual files relate to a single kernel">
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Moving spgemv into new format">
+      <change afterPath="$PROJECT_DIR$/ArmPL/spgemm.hh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/cuBLAS/spgemm.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/ArmPL/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/cuBLAS/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spgemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/doSpgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpgemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/ArmPL/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spmm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/cuBLAS/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spmm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/doSpgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpgemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/doSpmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpmm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spgmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemm.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spgemv.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spgemm.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/include/kernels/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spmm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/utilities.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/utilities.hh" afterDir="false" />
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -574,7 +575,15 @@
       <option name="project" value="LOCAL" />
-    <option name="localTasksCounter" value="49" />
+    <task id="LOCAL-00049" summary="Moving spgemv into new format">
+      <option name="closed" value="true" />
+      <created>1736345071717</created>
+      <option name="number" value="00049" />
+      <option name="presentableId" value="LOCAL-00049" />
+      <option name="project" value="LOCAL" />
+      <updated>1736345071717</updated>
+    </task>
+    <option name="localTasksCounter" value="50" />
     <servers />
   <component name="TypeScriptGeneratedFilesManager">
@@ -592,7 +601,6 @@
   <component name="VcsManagerConfiguration">
-    <MESSAGE value="Now compiles" />
     <MESSAGE value="Now compiles with fewer runtime errors" />
     <MESSAGE value="Implementing other offload types - still some runtime errors" />
     <MESSAGE value="All implemented and running.  No checksum at the end" />
@@ -617,6 +625,7 @@
     <MESSAGE value="Beginning gemv ARMPL" />
     <MESSAGE value="Getting rid of old oneMKL sparse file" />
     <MESSAGE value="Refactoring to make individual files relate to a single kernel" />
-    <option name="LAST_COMMIT_MESSAGE" value="Refactoring to make individual files relate to a single kernel" />
+    <MESSAGE value="Moving spgemv into new format" />
+    <option name="LAST_COMMIT_MESSAGE" value="Moving spgemv into new format" />
\ No newline at end of file
diff --git a/ArmPL/spgemm.hh b/ArmPL/spgemm.hh
new file mode 100644
index 0000000..0f9e81d
--- /dev/null
+++ b/ArmPL/spgemm.hh
@@ -0,0 +1,417 @@
+#pragma once
+#ifdef CPU_ARMPL
+#include <stdlib.h>
+#include <armpl.h>
+#include <omp.h>
+#include <algorithm>
+#include <iostream>
+#include "../include/kernels/CPU/spgemm.hh"
+#include "../include/utilities.hh"
+namespace cpu {
+    /**
+     * A class for sparse matrix-dense matrix CPU BLAS kernels
+     */
+class spgemm_cpu : public spgemm<T> {
+    using spgemm<T>::spgemm;
+    using spgemm<T>::callConsume;
+    using spgemm<T>::m_;
+    using spgemm<T>::n_;
+    using spgemm<T>::k_;
+    using spgemm<T>::A_;
+    using spgemm<T>::B_;
+    using spgemm<T>::C_;
+    using spgemm<T>::nnz_;
+  /** Convert the dense host matrices into CSR form and create the ArmPL
+   * sparse matrix handles A_armpl_, B_armpl_ and C_armpl_.
+   * A_ is read as an m_ x k_ row-major matrix and B_ as a k_ x n_ row-major
+   * matrix; C is created as an m_ x n_ matrix with no stored entries.
+   * Prints the ArmPL status and exits if a handle cannot be created. */
+  void toSparseFormat() override {
+    m_armpl_ = m_;
+    n_armpl_ = n_;
+    k_armpl_ = k_;
+    nnzA_ = nnz_;
+    // Every element of B may need a CSR slot.  Widen before multiplying so
+    // large problem sizes cannot overflow `int`.
+    nnzB_ = static_cast<int64_t>(k_) * static_cast<int64_t>(n_);
+    // ToDo -- check whether flags_ is correct!
+    flags_ = 0;
+
+    // Build a CSR triple from a row-major dense matrix.  rowPtr[row + 1] is
+    // written only after row `row` has been scanned, so it holds the running
+    // total of non-zeros up to and including that row.
+    const auto denseToCsr = [](const T* dense, int rows, int cols,
+                               armpl_int_t* rowPtr, armpl_int_t* colIndex,
+                               T* vals) {
+      armpl_int_t found = 0;
+      rowPtr[0] = 0;
+      for (int row = 0; row < rows; row++) {
+        const size_t rowStart =
+            static_cast<size_t>(row) * static_cast<size_t>(cols);
+        for (int col = 0; col < cols; col++) {
+          const T value = dense[rowStart + col];
+          if (value != 0.0) {
+            colIndex[found] = col;
+            vals[found] = value;
+            found++;
+          }
+        }
+        rowPtr[row + 1] = found;
+      }
+    };
+
+    // Create one ArmPL CSR handle, aborting on failure.  No ArmPL call is
+    // made for element types other than float and double.
+    const auto createCsr = [this](armpl_spmat_t* matrix, armpl_int_t rows,
+                                  armpl_int_t cols, armpl_int_t* rowPtr,
+                                  armpl_int_t* colIndex, T* vals) {
+      if constexpr (std::is_same_v<T, float>) {
+        status_ = armpl_spmat_create_csr_s(matrix, rows, cols, rowPtr,
+                                           colIndex, vals, flags_);
+      } else if constexpr (std::is_same_v<T, double>) {
+        status_ = armpl_spmat_create_csr_d(matrix, rows, cols, rowPtr,
+                                           colIndex, vals, flags_);
+      } else {
+        return;
+      }
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    };
+
+    // Move A to CSR
+    A_armpl_row_ptr_ = new armpl_int_t[m_ + 1];
+    A_armpl_col_index_ = new armpl_int_t[nnzA_];
+    A_vals_ = new T[nnzA_];
+    denseToCsr(A_, m_, k_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_);
+
+    // Move B to CSR.  The arrays are sized by nnzB_ (k_ * n_) rather than
+    // nnz_ so that a fully populated B cannot overrun them.
+    B_armpl_row_ptr_ = new armpl_int_t[k_ + 1];
+    B_armpl_col_index_ = new armpl_int_t[nnzB_];
+    B_vals_ = new T[nnzB_];
+    denseToCsr(B_, k_, n_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_);
+
+    // C starts with no stored entries: it has m_ rows, so it needs m_ + 1
+    // row pointers, all zero.
+    // ToDo -- confirm that C does not need to be pre-populated
+    C_armpl_row_ptr_ = new armpl_int_t[m_ + 1]();
+    C_armpl_col_index_ = new armpl_int_t[0];
+    C_vals_ = new T[0];
+
+    createCsr(&A_armpl_, m_armpl_, k_armpl_, A_armpl_row_ptr_,
+              A_armpl_col_index_, A_vals_);
+    createCsr(&B_armpl_, k_armpl_, n_armpl_, B_armpl_row_ptr_,
+              B_armpl_col_index_, B_vals_);
+    createCsr(&C_armpl_, m_armpl_, n_armpl_, C_armpl_row_ptr_,
+              C_armpl_col_index_, C_vals_);
+  }
+  /** Execute one ArmPL sparse matrix-matrix multiplication on the A, B and C
+   * handles using alpha, beta, transA_ and transB_, then call callConsume().
+   * Prints an error and exits for unsupported element types or when ArmPL
+   * reports a failure. */
+  void callGemm() override {
+    /**
+     * Flow of ARMPL Sparse LA:
+     *
+     * 1. Create sparse matrix objects: armpl_spmat_create_csr[sdcz]()
+     *
+     * 2. Supply hints on usage: armpl_spmat_hint()
+     *
+     * 3. Optimise the operation (currently disabled in
+     *    preLoopRequirements())
+     *
+     * 4. Execute the product: armpl_spmm_exec_[sd]()
+     *
+     * 5. Destroy sparse matrix object: armpl_spmat_destroy()
+     *
+     * In addition, users can choose to update a set of non-zero values using
+     * armpl_spmat_update_[sdcz]()
+     */
+    // Todo -- See if using armpl_spmat_hint can improve performance here.
+    //  If so, follow with optimisation functions
+    constexpr bool isSingle = std::is_same_v<T, float>;
+    constexpr bool isDouble = std::is_same_v<T, double>;
+    if constexpr (!isSingle && !isDouble) {
+      // Un-specialised class will not do any work - print error and exit.
+      std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported."
+                << std::endl;
+      exit(1);
+    } else if constexpr (isSingle) {
+      status_ = armpl_spmm_exec_s(transA_, transB_, alpha, A_armpl_, B_armpl_,
+                                  beta, C_armpl_);
+    } else {
+      status_ = armpl_spmm_exec_d(transA_, transB_, alpha, A_armpl_, B_armpl_,
+                                  beta, C_armpl_);
+    }
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    // Ensure compiler doesn't optimise away the work being done
+    callConsume();
+  }
+  /** Steps required before calling the GEMM kernel that should be timed:
+   * calls toCSR_armpl() and then registers the ArmPL usage hints on A_armpl_
+   * and B_armpl_.  Any ArmPL failure prints the status and exits. */
+  void preLoopRequirements() override {
+    // Need to put A_ and B_ into A_armpl_ and B_armpl_
+    // NOTE(review): no definition of toCSR_armpl() is visible in this class
+    // -- confirm it exists.
+    toCSR_armpl();
+
+    // Register a single hint on one matrix handle, aborting on failure
+    const auto setHint = [this](armpl_spmat_t matrix, auto hint, auto value) {
+      status_ = armpl_spmat_hint(matrix, hint, value);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    };
+
+    /** providing hints to ARMPL and optimizing the matrix datastructures */
+    // TODO -- is noallocs best here?
+    setHint(A_armpl_, ARMPL_SPARSE_HINT_MEMORY, ARMPL_SPARSE_MEMORY_NOALLOCS);
+    setHint(B_armpl_, ARMPL_SPARSE_HINT_MEMORY, ARMPL_SPARSE_MEMORY_NOALLOCS);
+
+    setHint(A_armpl_, ARMPL_SPARSE_HINT_STRUCTURE,
+            ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+    setHint(B_armpl_, ARMPL_SPARSE_HINT_STRUCTURE,
+            ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+
+    // TODO -- will this be FEW?
+    setHint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
+            ARMPL_SPARSE_INVOCATIONS_MANY);
+    setHint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
+            ARMPL_SPARSE_INVOCATIONS_MANY);
+
+    setHint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION,
+            ARMPL_SPARSE_OPERATION_NOTRANS);
+    setHint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION,
+            ARMPL_SPARSE_OPERATION_NOTRANS);
+
+    // TODO -- investigate which is better here
+    setHint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+            ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT);
+    setHint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+            ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT);
+
+//  TODO -- this is throwing an error -- couldn't immediately fix so come
+//   back to
+//    /** provide hints for the optimisation of the spmm execution */
+//    status_ = armpl_spmm_optimize(ARMPL_SPARSE_OPERATION_NOTRANS,
+//                                  ARMPL_SPARSE_OPERATION_NOTRANS,
+//                                  ARMPL_SPARSE_SCALAR_ONE,
+//                                  A_armpl_, B_armpl_,
+//                                  ARMPL_SPARSE_SCALAR_ZERO,
+//                                  C_armpl_);
+//    if (status_ != ARMPL_STATUS_SUCCESS) {
+//      std::cout << "ERROR " << status_ << std::endl;
+//      exit(1);
+//    }
+  }
+  /** Destroy the ArmPL handles for A, B and C and release the CSR arrays
+   * that backed them.  Prints the ArmPL status and exits if a handle cannot
+   * be destroyed. */
+  void postLoopRequirements() override {
+    // Destroy one handle, aborting on failure
+    const auto destroyHandle = [this](armpl_spmat_t matrix) {
+      status_ = armpl_spmat_destroy(matrix);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    };
+    destroyHandle(A_armpl_);
+    destroyHandle(B_armpl_);
+    destroyHandle(C_armpl_);
+
+    // CSR storage for A
+    delete [] A_armpl_row_ptr_;
+    delete [] A_armpl_col_index_;
+    delete [] A_vals_;
+    // CSR storage for B
+    delete [] B_armpl_row_ptr_;
+    delete [] B_armpl_col_index_;
+    delete [] B_vals_;
+    // CSR storage for C
+    delete [] C_armpl_row_ptr_;
+    delete [] C_armpl_col_index_;
+    delete [] C_vals_;
+  }
+  /** The constant value Alpha. */
+  const T alpha = ALPHA;
+  /** The constant value Beta. */
+  const T beta = BETA;
+  /** Debug helper: print a CSR matrix description to stdout.
+   * @param n  number of rows; n + 1 row pointers are printed
+   * @param rp row pointer array
+   * @param ci column index array (nz entries)
+   * @param v  value array (nz entries)
+   * @param nz number of stored entries
+   * @param f  flags value, printed as-is */
+  void printCSR(armpl_int_t n, armpl_int_t* rp, armpl_int_t* ci, T* v,
+                armpl_int_t nz, armpl_int_t f) {
+    // Print `count` elements as a comma separated list.  Nothing is read when
+    // count is zero, so empty arrays (e.g. C before the multiply) are safe.
+    const auto printList = [](const auto* data, armpl_int_t count) {
+      for (armpl_int_t i = 0; i < count; i++) {
+        if (i > 0) {
+          std::cout << ", ";
+        }
+        std::cout << data[i];
+      }
+    };
+    std::cout << "\tn = " << n << std::endl;
+    std::cout << "\trow ptr (size = " << sizeof(armpl_int_t) << ") = [";
+    printList(rp, n + 1);
+    std::cout << "]" << std::endl << "\tcol ind (size = " <<
+    sizeof(armpl_int_t) << ") = [";
+    printList(ci, nz);
+    std::cout << "]" << std::endl << "\tvals (size = " << sizeof(T) <<
+    ") = [";
+    printList(v, nz);
+    std::cout << "]" << std::endl << "\tflags = " << f << std::endl;
+  }
+  int64_t nnzA_;
+  int64_t nnzB_;
+  armpl_status_t status_;
+  armpl_int_t flags_;
+  armpl_int_t m_armpl_;
+  armpl_int_t n_armpl_;
+  armpl_int_t k_armpl_;
+  armpl_int_t* A_armpl_row_ptr_;
+  armpl_int_t* A_armpl_col_index_;
+  armpl_int_t* B_armpl_row_ptr_;
+  armpl_int_t* B_armpl_col_index_;
+  armpl_int_t* C_armpl_row_ptr_;
+  armpl_int_t* C_armpl_col_index_;
+  armpl_spmat_t A_armpl_;
+  armpl_spmat_t B_armpl_;
+  armpl_spmat_t C_armpl_;
+  armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS;
+  armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS;
diff --git a/ArmPL/spmm.hh b/ArmPL/spmm.hh
index 93ed4b5..9680f09 100644
--- a/ArmPL/spmm.hh
+++ b/ArmPL/spmm.hh
@@ -1,18 +1,18 @@
 #pragma once
 #ifdef CPU_ARMPL
-#include <stdio.h>
 #include <stdlib.h>
 #include <armpl.h>
 #include <omp.h>
 #include <algorithm>
+#include <iostream>
 #include "../include/kernels/CPU/spmm.hh"
 #include "../include/utilities.hh"
 namespace cpu {
-/** A class for GEMM CPU BLAS kernels. */
+/** A class for sparse matrix-sparse matrix CPU BLAS kernels. */
 template <typename T>
 class spmm_cpu : public spmm<T> {
@@ -363,9 +363,6 @@ class spmm_cpu : public spmm<T> {
   /** The constant value Beta. */
   const T beta = BETA;
-  void toCSR_armpl() {
-  }
   void printCSR(armpl_int_t n, armpl_int_t* rp, armpl_int_t* ci, T* v,
                 armpl_int_t nz, armpl_int_t f) {
     std::cout << "\tn = " << n << std::endl;
diff --git a/cuBLAS/spgemm.hh b/cuBLAS/spgemm.hh
new file mode 100644
index 0000000..d4233fd
--- /dev/null
+++ b/cuBLAS/spgemm.hh
@@ -0,0 +1,323 @@
+#pragma once
+#ifdef GPU_CUBLAS
+#include <cusparse_v2.h>
+#include <cuda_runtime.h>
+#include <type_traits>
+#include <random>
+#include <iostream>
+#include "../include/kernels/GPU/spgemm.hh"
+#include "../include/utilities.hh"
+#include "common.hh"
+namespace gpu {
+    /**
+     * A class for sparse matrix-dense matrix BLAS
+     */
+template <typename T>
+class spgemm_gpu : public spgemm<T> {
+  // Bring the members inherited from the spgemm<T> base into scope. Names
+  // from a dependent base class are not found by unqualified lookup, so each
+  // one used below needs a using-declaration naming the actual base class.
+  using spgemm<T>::spgemm;
+  using spgemm<T>::initInputMatrices;
+  using spgemm<T>::m_;
+  using spgemm<T>::n_;
+  using spgemm<T>::k_;
+  using spgemm<T>::A_;
+  using spgemm<T>::B_;
+  using spgemm<T>::C_;
+  using spgemm<T>::offload_;
+  using spgemm<T>::nnz_;
+  /** Allocate host/device storage and create the CUDA and cuSPARSE resources
+   * for one run, then populate the input matrices.
+   *
+   * @param offload   Host/device transfer strategy used for this run.
+   * @param n         Problem dimension. Only one dimension is supplied, so
+   *                  m_, n_ and k_ are all set to it (square problem).
+   * @param sparsity  Used to size the non-zero storage for A as
+   *                  1 + m_ * k_ * (1 - sparsity).
+   *
+   * NOTE(review): doSpgemm::callKernels calls initialise(offload, M, N, K);
+   * confirm the intended parameter list against the spgemm<T> base class. */
+  void initialise(gpuOffloadType offload, int n, double sparsity) override {
+    offload_ = offload;
+    if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F;
+    else if (std::is_same_v<T, double>) cudaDataType_ = CUDA_R_64F;
+    else {
+      std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
+      exit(1);
+    }
+    m_ = n;
+    n_ = n;
+    k_ = n;
+    // Dense host copy of A; toSparseFormat() reads it to build the CSR arrays.
+    // B_ and C_ are allocated once, below, according to the offload type.
+    A_ = (T*)malloc(sizeof(T) * m_ * k_);
+    /** Determine the number of nnz elements in A */
+    nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity));
+    // Dense operand shapes handed to cusparseCreateDnMat. B is k x n and C is
+    // m x n; both are flagged row-major in toSparseFormat(), so the leading
+    // dimension is the column count.
+    B_num_rows_ = k_;
+    B_num_cols_ = n_;
+    B_leading_dim_ = n_;
+    C_num_rows_ = m_;
+    C_num_cols_ = n_;
+    C_leading_dim_ = n_;
+    // Get device identifier
+    cudaCheckError(cudaGetDevice(&gpuDevice_));
+    // Initialise 3 streams to asynchronously move data between host and device
+    cudaCheckError(cudaStreamCreate(&s1_));
+    cudaCheckError(cudaStreamCreate(&s2_));
+    cudaCheckError(cudaStreamCreate(&s3_));
+    if (offload_ == gpuOffloadType::unified) {
+      cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * nnz_));
+      cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * nnz_));
+      cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (m_ + 1)));
+      cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_));
+      cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_));
+      // Start from a zeroed C, matching the calloc used for host memory.
+      cudaCheckError(cudaMemset(C_, 0, sizeof(T) * m_ * n_));
+    } else {
+      A_val_ = (T*)malloc(sizeof(T) * nnz_);
+      A_col_ = (int*)malloc(sizeof(int) * nnz_);
+      A_row_ = (int*)malloc(sizeof(int) * (m_ + 1));
+      B_ = (T*)malloc(sizeof(T) * k_ * n_);
+      // calloc takes (count, element size) and returns zeroed memory.
+      C_ = (T*)calloc((size_t)m_ * (size_t)n_, sizeof(T));
+      cudaCheckError(cudaMalloc((void**)&A_val_dev_, sizeof(T) * nnz_));
+      // Column indices and row offsets are int, not T.
+      cudaCheckError(cudaMalloc((void**)&A_col_dev_, sizeof(int) * nnz_));
+      cudaCheckError(cudaMalloc((void**)&A_row_dev_, sizeof(int) * (m_ + 1)));
+      cudaCheckError(cudaMalloc((void**)&B_dev_, sizeof(T) * k_ * n_));
+      cudaCheckError(cudaMalloc((void**)&C_dev_, sizeof(T) * m_ * n_));
+    }
+    cusparseCheckError(cusparseCreate(&handle_));
+    initInputMatrices();
+  }
+  /** Convert the dense host matrix A_ (m_ rows by k_ columns, indexed as
+   * row * k_ + col) into CSR arrays A_val_ / A_col_ / A_row_, and flag the
+   * dense matrices B and C as row-major for cuSPARSE. */
+  void toSparseFormat() override {
+    int nnz_encountered = 0;
+    for (int row = 0; row < m_; row++) {
+      // Row offset = index of the first stored entry belonging to this row.
+      A_row_[row] = nnz_encountered;
+      for (int col = 0; col < k_; col++) {
+        const T value = A_[(row * k_) + col];
+        // Store only the non-zero entries of A.
+        if (value != 0.0) {
+          A_col_[nnz_encountered] = col;
+          A_val_[nnz_encountered] = value;
+          nnz_encountered++;
+        }
+      }
+    }
+    // Final offset closes the last row.
+    A_row_[m_] = nnz_encountered;
+    B_order_ = C_order_ = CUSPARSE_ORDER_ROW;
+  }
+  /** Stage the input data on the device (explicit copies, or prefetches for
+   * unified memory) and create the cuSPARSE descriptors for A, B and C.
+   *
+   * The CSR index arrays are `int` and the row offsets start at 0 (see
+   * toSparseFormat), hence 32-bit index types and a zero index base. */
+  void preLoopRequirements() override {
+    // Todo -- do I need a SPMM description here?
+    switch(offload_) {
+      // `always` shares the `once` setup: callGemm() destroys the descriptors
+      // before re-creating them, so they must already exist.
+      case gpuOffloadType::always:
+      case gpuOffloadType::once: {
+        cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, (sizeof(T) * nnz_),
+                                       cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_,
+                                       (sizeof(int) * nnz_),
+                                       cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_,
+                                       (sizeof(int) * (m_ + 1)),
+                                       cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_dev_, B_, (sizeof(T) * k_ * n_),
+                                       cudaMemcpyHostToDevice, s2_));
+        cudaCheckError(cudaMemcpyAsync(C_dev_, C_, (sizeof(T) * m_ * n_),
+                                       cudaMemcpyHostToDevice, s3_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, m_, k_, nnz_, A_row_dev_,
+                                  A_col_dev_, A_val_dev_, CUSPARSE_INDEX_32I,
+                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
+                                  cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrB_, B_num_rows_, B_num_cols_,
+                                    B_leading_dim_, B_dev_, cudaDataType_,
+                                    B_order_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrC_, C_num_rows_, C_num_cols_,
+                                    C_leading_dim_, C_dev_, cudaDataType_,
+                                    C_order_));
+        break;
+      }
+      case gpuOffloadType::unified: {
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * nnz_,
+                                            gpuDevice_, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * nnz_,
+                                            gpuDevice_, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (m_ + 1),
+                                            gpuDevice_, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(B_, sizeof(T) * n_ * k_,
+                                            gpuDevice_, s2_));
+        cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_,
+                                            gpuDevice_, s3_));
+        cudaCheckError(cudaDeviceSynchronize());
+        // Managed pointers are used directly in the descriptors.
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, m_, k_, nnz_, A_row_, A_col_,
+                                  A_val_, CUSPARSE_INDEX_32I,
+                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
+                                  cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrB_, B_num_rows_, B_num_cols_,
+                                    B_leading_dim_, B_, cudaDataType_,
+                                    B_order_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrC_, C_num_rows_, C_num_cols_,
+                                    C_leading_dim_, C_, cudaDataType_,
+                                    C_order_));
+        break;
+      }
+    }
+  }
+  /** Run one sparse-A x dense-B multiplication through cuSPARSE SpMM.
+   *
+   * Only the `always` offload type is handled here: the descriptors are torn
+   * down, all operands are copied to the device again, the descriptors are
+   * rebuilt, and the multiplication is issued.
+   * NOTE(review): `once` and `unified` have no case in this switch — confirm
+   * whether that is intentional. */
+  void callGemm() override {
+    switch(offload_) {
+      case gpuOffloadType::always: {
+        // Clean up old descriptors
+        cusparseCheckError(cusparseDestroySpMat(descrA_));
+        cusparseCheckError(cusparseDestroyDnMat(descrB_));
+        cusparseCheckError(cusparseDestroyDnMat(descrC_));
+        // Move over data
+        cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, (sizeof(T) * nnz_),
+                                       cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_,
+                                       (sizeof(int) * nnz_),
+                                       cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_,
+                                       (sizeof(int) * (m_ + 1)),
+                                       cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_dev_, B_, (sizeof(T) * k_ * n_),
+                                       cudaMemcpyHostToDevice, s2_));
+        cudaCheckError(cudaMemcpyAsync(C_dev_, C_, (sizeof(T) * m_ * n_),
+                                       cudaMemcpyHostToDevice, s3_));
+        // The copies run on three streams; wait for all of them.
+        cudaCheckError(cudaDeviceSynchronize());
+        // Set up descriptors (int index arrays, zero-based row offsets)
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, m_, k_, nnz_, A_row_dev_,
+                                  A_col_dev_, A_val_dev_, CUSPARSE_INDEX_32I,
+                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
+                                  cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrB_, B_num_rows_, B_num_cols_,
+                                    B_leading_dim_, B_dev_, cudaDataType_,
+                                    B_order_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrC_, C_num_rows_, C_num_cols_,
+                                    C_leading_dim_, C_dev_, cudaDataType_,
+                                    C_order_));
+        // Begin matrix-matrix multiplication
+        cusparseCheckError(
+                cusparseSpMM_bufferSize(handle_, opA_, opB_, &alpha, descrA_,
+                                        descrB_, &beta, descrC_,
+                                        cudaDataType_, alg_, &buffer_size1_));
+        cudaCheckError(cudaMalloc((void**)&buffer1_, buffer_size1_));
+        cusparseCheckError(
+                cusparseSpMM_preprocess(handle_, opA_, opB_, &alpha, descrA_,
+                                        descrB_, &beta, descrC_,
+                                        cudaDataType_, alg_, buffer1_));
+        cusparseCheckError(
+                cusparseSpMM(handle_, opA_, opB_, &alpha, descrA_, descrB_,
+                             &beta, descrC_, cudaDataType_, alg_, buffer1_));
+        // The workspace is allocated on every call, so release it here to
+        // avoid leaking one buffer per iteration.
+        cudaCheckError(cudaFree(buffer1_));
+        buffer1_ = nullptr;
+        break;
+      }
+    }
+  }
+  /** Handle used when calling cuSPARSE. */
+  cusparseHandle_t handle_;
+  /** CUDA Stream 1 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s1_;
+  /** CUDA Stream 2 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s2_;
+  /** CUDA Stream 3 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s3_;
+  /** The ID of the target GPU Device. */
+  int gpuDevice_;
+  bool C_mem_allocated_always_;
+  bool C_mem_allocated_once_;
+  bool C_mem_allocated_unified_;
+  /** The constant value Alpha. */
+  const T alpha = ALPHA;
+  /** The constant value Beta. */
+  const T beta = BETA;
+  /** Sizes (in bytes) and pointers for cuSPARSE workspace buffers. */
+  size_t buffer_size1_ = 0;
+  size_t buffer_size2_ = 0;
+  void* buffer1_ = nullptr;
+  void* buffer2_ = nullptr;
+  /** Neither operand is transposed. */
+  cusparseOperation_t opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cusparseOperation_t opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cusparseSpMMAlg_t alg_ = CUSPARSE_SPMM_ALG_DEFAULT;
+  // Data type depends on kernel being run
+  cudaDataType_t cudaDataType_;
+  /**
+   * ___________ Host data ______________
+   */
+  /** CSR format vectors for matrix A */
+  cusparseSpMatDescr_t descrA_;
+  T* A_val_;
+  int* A_col_;
+  int* A_row_;
+  int64_t A_num_rows_;
+  int64_t A_num_cols_;
+  /** dense format values for matrices B and C */
+  cusparseDnMatDescr_t descrB_;
+  int B_num_rows_;
+  int B_num_cols_;
+  int B_leading_dim_;
+  cusparseOrder_t B_order_;
+  cusparseDnMatDescr_t descrC_;
+  int C_num_rows_;
+  int C_num_cols_;
+  int C_leading_dim_;
+  cusparseOrder_t C_order_;
+  /**
+   * _____________ Device data ________________
+   */
+  T* A_val_dev_;
+  int* A_col_dev_;
+  int* A_row_dev_;
+  T* B_dev_;
+  T* C_dev_;
diff --git a/cuBLAS/spmm.hh b/cuBLAS/spmm.hh
index 071c8c1..249f1ea 100644
--- a/cuBLAS/spmm.hh
+++ b/cuBLAS/spmm.hh
@@ -50,14 +50,12 @@ class spmm_gpu : public spmm<T> {
     A_ = (T*)malloc(sizeof(T) * m_ * k_);
     B_ = (T*)malloc(sizeof(T) * k_ * n_);
-    C_ = (T*)calloc(sizeof(T) * m_ * n_);Ã¥
+    C_ = (T*)calloc(sizeof(T) * m_ * n_);
     /** Determine the number of nnz elements in A and B */
     nnzA_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
     nnzB_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_));
-    initInputMatrices(sparsity_);
     // Get device identifier
@@ -118,6 +116,8 @@ class spmm_gpu : public spmm<T> {
     // Create a handle for cuSPARSE
+    initInputMatrices();
@@ -194,7 +194,7 @@ class spmm_gpu : public spmm<T> {
         cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_
         + 1), cudaMemcpyHostToDevice, s3_));
-        // Craete matrix descriptors
+        // Create matrix descriptors
                 cusparseCreateCsr(&descrA_, m_, k_, nnzA_, A_row_dev_,
                                   A_col_dev_, A_val_dev_, rType_, cType_,
diff --git a/include/doSpgemm.hh b/include/doSpgemm.hh
index 2131a7d..b8d1d9b 100644
--- a/include/doSpgemm.hh
+++ b/include/doSpgemm.hh
@@ -1,8 +1,657 @@
-// Created by Alexander Cockrean on 07/01/2025.
+#pragma once
+#include <sstream>
+#include <type_traits>
+#include "helpers.hh"
+#include "tablePrinter.hh"
+#include "utilities.hh"
+#if defined CPU_ARMPL
+#include "../ArmPL/spgemm.hh"
+#elif defined CPU_ONEMKL
+// Todo #include "../oneMKL/CPU/spgemm.hh"
+#elif defined CPU_AOCL
+// Todo #include "../AOCL/spgemm.hh"
+#elif defined CPU_NVPL
+// Todo #include "../NVPL/spgemm.hh"
+#elif defined CPU_OPENBLAS
+// Todo #include "../OpenBLAS/spgemm.hh"
+#if defined GPU_CUBLAS
+#include "../cuBLAS/spgemm.hh"
+#elif defined GPU_ONEMKL
+// Todo #include "../oneMKL/GPU/spgemm.hh"
+#elif defined GPU_ROCBLAS
+// Todo #include "../rocBLAS/spgemm.hh"
+/** T represents the type of the sparse GEMM kernel that will be run. E.g.,
+ * T=float is for SSPGEMM. */
+template <typename T>
+class doSpgemm {
+    /** Construct a sparse GEMM benchmark driver.
+     *
+     * @param csvDir      Directory prefix for the CSV files opened in
+     *                    collectData().
+     * @param iters       Iteration count; forwarded to the CPU and GPU kernel
+     *                    objects and used when computing GFLOP/s.
+     * @param startDim    First dimension value used by the problem sweeps.
+     * @param upperlimit  Largest dimension value used by the problem sweeps.
+     * @param cpuEnabled  Whether the CPU kernels are run.
+     * @param gpuEnabled  Whether the GPU kernels are run. */
+    doSpgemm(const std::string csvDir, const int iters, const int startDim,
+             const int upperlimit, const bool cpuEnabled = true,
+             const bool gpuEnabled = true)
+          : CSV_DIR(csvDir),
+            iterations_(iters),
+            startDimention_(startDim),
+            upperLimit_(upperlimit),
+            doCPU_(cpuEnabled),
+            doGPU_(gpuEnabled)
+    ,
+        cpu_(iterations_)
+    ,
+        gpu_(iterations_)
+    {
+      // static_assert takes the condition and the message as two arguments.
+      static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                    "ERROR - doSpgemm can only be constructed using one of the "
+                    "following types: [float, double].");
+    }
+    /** Run every problem-shape sweep and record the results.
+     *
+     * Each sweep follows the same pattern: reset the offload-threshold
+     * structs and the previous GPU results, open a CSV file named after the
+     * kernel and the shape, call callKernels() for every (M, N, K) in the
+     * sweep, close the file, and - when both CPU and GPU are enabled - print
+     * the offload threshold found for that shape. Sweeps that fix a dimension
+     * at 32 only run when upperLimit_ is at least 32. */
+    void collectData() {
+      // ToDo -- I've hard coded false here as kernel selection was not working
+      //  .  Needs to be fixed
+      // Square Problem Sizes...
+      // Re-initialise offload threshold structures
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                                          "_square_square_M=N=K.csv");
+      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+        // M = dim, N = dim, K = dim;
+        callKernels(csvFile, dim, dim, dim);
+      }
+      // Close file
+      csvFile.close();
+      if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Square x Square (M=N=K)");
+    }
+      // Rectangular Problem Sizes:
+      // Tall and thin x Short and wide (M=N, M=16K)
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_tall-thin_short-wide_M=N_M=16K.csv");
+      int K = startDimention_;
+      int M = 16 * K;
+      int N = 16 * K;
+      // K grows by 1 per step while M and N grow by 16, keeping M = N = 16K.
+      while (M <= upperLimit_) {
+        callKernels(csvFile, M, N, K);
+        M += 16;
+        N += 16;
+        K++;
+      }
+      // Close file
+      csvFile.close();
+      if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)");
+  }
+      // Tall and thin x Short and wide (M=N varies, K fixed at 32)
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_tall-thin_short-wide_M=N_K=32.csv");
+      if (upperLimit_ >= 32) {
+        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+          // M = dim, N = dim, K = 32;
+          callKernels(csvFile, dim, dim, 32);
+        }
+      }
+      // Close file
+      csvFile.close();
+      if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)");
+  }
+      // Short and wide x Tall and thin (M=N, K=16M)
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_short-wide_tall-thin_M=N_K=16M.csv");
+      M = startDimention_;
+      N = startDimention_;
+      K = 16 * M;
+      // M and N grow by 1 per step while K grows by 16, keeping K = 16M.
+      while (K <= upperLimit_) {
+        callKernels(csvFile, M, N, K);
+        M++;
+        N++;
+        K += 16;
+      }
+      // Close file
+      csvFile.close();
+      if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)");
+  }
+      // Short and wide x Tall and thin (M=N fixed at 32, K varies)
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_short-wide_tall-thin_M=N=32_K.csv");
+      if (upperLimit_ >= 32) {
+        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+          // M = 32, N = 32, K = dim;
+          callKernels(csvFile, 32, 32, dim);
+        }
+      }
+      // Close file
+      csvFile.close();
+      if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)");
+  }
+      // Tall and Thin x Square (K=N, M=16K)
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_tall-thin_square_K=N_M=16K.csv");
+      K = startDimention_;
+      N = startDimention_;
+      M = 16 * K;
+      // N and K grow by 1 per step while M grows by 16, keeping M = 16K.
+      while (M <= upperLimit_) {
+        callKernels(csvFile, M, N, K);
+        M += 16;
+        N++;
+        K++;
+      }
+      // Close file
+      csvFile.close();
+      if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)");
+  }
+      // Tall and Thin x Square (K=N fixed at 32, M varies)
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_tall-thin_square_K=N=32_M.csv");
+      if (upperLimit_ >= 32) {
+        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+          // M = dim, N = 32, K = 32;
+          callKernels(csvFile, dim, 32, 32);
+        }
+      }
+      // Close file
+      csvFile.close();
+      if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)");
+  }
+      // Square x Short and Wide (M=K, N=16K)
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_square_short-wide_M=K_N=16K.csv");
+      M = startDimention_;
+      K = startDimention_;
+      N = 16 * K;
+      // M and K grow by 1 per step while N grows by 16, keeping N = 16K.
+      while (N <= upperLimit_) {
+        callKernels(csvFile, M, N, K);
+        M++;
+        N += 16;
+        K++;
+      }
+      // Close file
+      csvFile.close();
+      if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)");
+  }
+      // Square x Short and Wide (M=K fixed at 32, N varies)
+      // Re-initialise offload threshold structures & previous results
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                            "_square_short-wide_M=K=32_N.csv");
+      if (upperLimit_ >= 32) {
+        for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+          // M = 32, N = dim, K = 32;
+          callKernels(csvFile, 32, dim, 32);
+        }
+      }
+      if (doCPU_ && doGPU_) {
+    // Print offload results to stdout
+    printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)");
+  }
+      // Close file
+      csvFile.close();
+    }
+    /** Run the enabled CPU and GPU kernels for one problem size and record
+     * the results.
+     *
+     * @param csvFile  Open CSV stream that one line per device/offload type is
+     *                 written to.
+     * @param M, N, K  Problem dimensions passed to the kernels and used to
+     *                 compute the problem size (KiB) and FLOP count.
+     *
+     * When both CPU and GPU are enabled, the checksums are compared, the
+     * offload-threshold structs are reset/updated, and the GPU results are
+     * saved as the "previous" results for the next call.
+     *
+     * NOTE(review): the GPU kernel is initialised with (offload, M, N, K)
+     * here, while spgemm_gpu::initialise in cuBLAS/spgemm.hh is declared as
+     * (offload, n, sparsity) - confirm which signature is intended. */
+    void callKernels(std::ofstream& csvFile, const int M, const int N,
+                     const int K) {
+      const double probSize = calcKib(M, N, K);
+      const uint64_t flops = calcFlops(M, N, K);
+      std::string kernelName = getKernelName();
+      time_checksum_gflop cpuResult;
+      time_checksum_gflop gpuResult_once;
+      time_checksum_gflop gpuResult_always;
+      time_checksum_gflop gpuResult_unified;
+// Perform CPU kernel
+      if (doCPU_) {
+      cpu_.initialise(M, N, K);
+      cpuResult = cpu_.compute();
+      cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
+      // Write result to CSV file
+      writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize,
+                     0.0, iterations_, cpuResult.runtime, cpuResult.gflops);
+    }
+// Perform the GPU kernels
+      if (doGPU_) {
+      // - ONCE : Offload to/from GPU once before all iterations and once
+      // after
+      gpu_.initialise(gpuOffloadType::once, M, N, K);
+      gpuResult_once = gpu_.compute();
+      gpuResult_once.gflops =
+          calcGflops(flops, iterations_, gpuResult_once.runtime);
+      // - ALWAYS: Offload to/from GPU every iteration
+      gpu_.initialise(gpuOffloadType::always, M, N, K);
+      gpuResult_always = gpu_.compute();
+      gpuResult_always.gflops =
+          calcGflops(flops, iterations_, gpuResult_always.runtime);
+      // - UNIFIED : data passed from host to device (and device to host) as
+      //             needed
+      gpu_.initialise(gpuOffloadType::unified, M, N, K);
+      gpuResult_unified = gpu_.compute();
+      gpuResult_unified.gflops =
+          calcGflops(flops, iterations_, gpuResult_unified.runtime);
+      // Write results to CSV file
+      writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, K, probSize,
+                     0.0, iterations_, gpuResult_once.runtime,
+                     gpuResult_once.gflops);
+      writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, K,
+                     probSize, 0.0, iterations_, gpuResult_always.runtime,
+                     gpuResult_always.gflops);
+      writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, K, probSize,
+                     0.0, iterations_, gpuResult_unified.runtime,
+                     gpuResult_unified.gflops);
+    }
+      if (doCPU_ && doGPU_) {
+      // Make sure all checksums match if CPU and GPU kernels are run.
+      //  - The majority of BLAS Libraries guarantee the same result if a
+      //  function
+      //    is called multiple times. Given all input matrices are identical for
+      //    each GPU offload type, we need only to compare the CPU and GPU
+      //    checksums.
+      checkChecksums(cpuResult, gpuResult_once, gpuResult_always,
+                     gpuResult_unified, M, N, K);
+      // Check if offload structs should be reset
+      checkOffloadStructReset(cpuResult, gpuResult_once, gpuResult_always,
+                              gpuResult_unified);
+      // Check if offload threshold has been achieved for each GPU offload type.
+      updateOffloadStructs(cpuResult, gpuResult_once, gpuResult_always,
+                           gpuResult_unified, M, N, K, probSize);
+      // Update previous results
+      prev_gpuResult_once = gpuResult_once;
+      prev_gpuResult_always = gpuResult_always;
+      prev_gpuResult_unified = gpuResult_unified;
+    }
+    }
+    /** Intended to ensure all CPU and GPU checksums are within the permitted
+     * limit of each other.
+     *
+     * Currently a no-op: the comparison below is commented out, so none of
+     * the parameters are read and nothing is reported. */
+     // Todo - think of a sensible way to do this for sparse!!!
+    void checkChecksums(time_checksum_gflop cpuResult,
+                        time_checksum_gflop gpuResult_once,
+                        time_checksum_gflop gpuResult_always,
+                        time_checksum_gflop gpuResult_unified, const int M,
+                        const int N, const int K) {
+      // Ensure that each checksum difference is less than 0.1%
+//      double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum);
+//      if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) *
+//            hundredOverChecksum)) > 0.1 &&
+//          ((std::fabs(cpuResult.checksum - gpuResult_always.checksum) *
+//            hundredOverChecksum)) > 0.1 &&
+//          ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) *
+//            hundredOverChecksum)) > 0.1) {
+//        std::cerr << "ERROR - " << getKernelName()
+//                  << " kernel checksums do not match:\n\tInput "
+//                     "dimensions: M="
+//                  << M << ", N=" << N << ", K=" << K << std::endl;
+//        std::cerr << std::setprecision(10)
+//                  << "\tCPU Checksum = " << cpuResult.checksum << std::endl;
+//        std::cerr << std::setprecision(10)
+//                  << "\tGPU (Once) Checksum = " << gpuResult_once.checksum
+//                  << std::endl;
+//        std::cerr << std::setprecision(10)
+//                  << "\tGPU (Always) Checksum = " << gpuResult_always.checksum
+//                  << std::endl;
+//        std::cerr << std::setprecision(10)
+//                  << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum
+//                  << std::endl;
+//        exit(1);
+//      }
+    }
+    /** Discard any recorded offload threshold that has not held up.
+     *   - For each offload type, a threshold that has been recorded (M != 0)
+     * is cleared when the CPU GFLOP/s is at least the GPU GFLOP/s for both
+     * the current and the previous problem size, since the GPU may not yet
+     * have genuinely reached its offload threshold. */
+    void checkOffloadStructReset(time_checksum_gflop cpuResult,
+                                 time_checksum_gflop gpuResult_once,
+                                 time_checksum_gflop gpuResult_always,
+                                 time_checksum_gflop gpuResult_unified) {
+      // Shared rule applied to each (threshold, current, previous) triple.
+      const auto clearIfStale = [&cpuResult](auto& threshold,
+                                             const auto& latest,
+                                             const auto& previous) {
+        const bool recorded = (threshold.M != 0);
+        const bool cpuAhead = (cpuResult.gflops >= latest.gflops) &&
+                              (cpuResult.gflops >= previous.gflops);
+        if (!(recorded && cpuAhead)) {
+          return;
+        }
+        threshold.cpuGflops = 0.0;
+        threshold.gpuGflops = 0.0;
+        threshold.probSize_kib = 0.0;
+        threshold.M = 0;
+        threshold.N = 0;
+        threshold.K = 0;
+      };
+      clearIfStale(cpuGpu_once_, gpuResult_once, prev_gpuResult_once);
+      clearIfStale(cpuGpu_always_, gpuResult_always, prev_gpuResult_always);
+      clearIfStale(cpuGpu_unified_, gpuResult_unified, prev_gpuResult_unified);
+    }
+    /** Record an offload threshold for each offload type that has none yet
+     * (M == 0) and whose GPU GFLOP/s exceeds the CPU GFLOP/s for this
+     * problem size. */
+    void updateOffloadStructs(time_checksum_gflop cpuResult,
+                              time_checksum_gflop gpuResult_once,
+                              time_checksum_gflop gpuResult_always,
+                              time_checksum_gflop gpuResult_unified, const int M,
+                              const int N, const int K, const double probSize) {
+      // Shared rule applied to each (threshold, GPU result) pair.
+      const auto recordIfFaster = [&](auto& threshold, const auto& gpuResult) {
+        const bool unset = (threshold.M == 0);
+        if (!(unset && cpuResult.gflops < gpuResult.gflops)) {
+          return;
+        }
+        threshold.cpuGflops = cpuResult.gflops;
+        threshold.gpuGflops = gpuResult.gflops;
+        threshold.probSize_kib = probSize;
+        threshold.M = M;
+        threshold.N = N;
+        threshold.K = K;
+      };
+      recordIfFaster(cpuGpu_once_, gpuResult_once);
+      recordIfFaster(cpuGpu_always_, gpuResult_always);
+      recordIfFaster(cpuGpu_unified_, gpuResult_unified);
+    }
+    /** Count the FLOPs of one dense-style GEMM, C = alpha*AB + beta*C.
+     *
+     * The product AB costs 2*M*N*K (one FMA per term). Scaling by alpha costs
+     * a further M*N and is always counted; when BETA is non-zero, scaling C
+     * and the final addition add another 2*M*N. */
+     // ToDo -- Work out how to do this for an unknown algorithm
+    constexpr uint64_t calcFlops(const int M, const int N, const int K) const {
+      const uint64_t rows = (uint64_t)M;
+      const uint64_t cols = (uint64_t)N;
+      const uint64_t inner = (uint64_t)K;
+      const uint64_t outputElems = rows * cols;
+      // 1 op per output element for alpha*AB, 3 when beta*C is also applied.
+      const uint64_t opsPerOutputElem = (BETA != 0) ? 3 : 1;
+      return (2 * outputElems * inner) + (opsPerOutputElem * outputElems);
+    }
+    /** A function for calculating the total GEMM problem size in KiB:
+     * (M*K + K*N + M*N) elements of sizeof(T) bytes each, divided by 1024. */
+    constexpr double calcKib(const int M, const int N, const int K) const {
+      const uint64_t rows = static_cast<uint64_t>(M);
+      const uint64_t cols = static_cast<uint64_t>(N);
+      const uint64_t inner = static_cast<uint64_t>(K);
+      const uint64_t elements = (rows * inner) + (inner * cols) + (rows * cols);
+      const uint64_t bytes = elements * sizeof(T);
+      return static_cast<double>(bytes) / 1024;
+    }
+    /** Get the name of the kernel being run, selected by the byte width of T
+     * (4 -> "sgemm", 8 -> "dgemm", anything else -> "unknown"). */
+    std::string getKernelName() const {
+      if (sizeof(T) == 4) return "sgemm";
+      if (sizeof(T) == 8) return "dgemm";
+      return "unknown";
+    }
+    /** Print the offload thresholds as a table with one row per GPU offload
+     * type. A type whose threshold M is 0 (no threshold found) is printed with
+     * zero dimensions and "N/A" in both GFLOP/s columns. */
+    void printOffloadThreshold(const std::string& problemName) const {
+      std::vector<std::string> header = {
+              "Device",  "M",          "N", "K", "Total Prob. Size (KiB)",
+              "GFLOP/s", "CPU GFLOP/s"};
+      std::vector<std::vector<std::string>> rows;
+      // Format a value with two fixed decimal places
+      const auto fixed2 = [](const auto& value) {
+        std::stringstream formatted;
+        formatted << std::fixed << std::setprecision(2) << value;
+        return formatted.str();
+      };
+      // Append the table row describing one offload type
+      const auto addRow = [&](const std::string& device,
+                              const cpuGpu_offloadThreshold& threshold) {
+        const std::string probSize = fixed2(threshold.probSize_kib);
+        if (threshold.M == 0) {
+          // No offload threshold found
+          rows.push_back({device, std::to_string(0), std::to_string(0),
+                          std::to_string(0), probSize, "N/A", "N/A"});
+          return;
+        }
+        rows.push_back({device, std::to_string(threshold.M),
+                        std::to_string(threshold.N),
+                        std::to_string(threshold.K), probSize,
+                        fixed2(threshold.gpuGflops),
+                        fixed2(threshold.cpuGflops)});
+      };
+      addRow("GPU (Offload Once)", cpuGpu_once_);
+      addRow("GPU (Offload Always)", cpuGpu_always_);
+      addRow("GPU (Unified Memory)", cpuGpu_unified_);
+      // Print table
+      tablePrinter tPrinter(
+              problemName + " Problem Domian GPU Offload Thresholds:", header, rows);
+      tPrinter.print(1);
+    }
+    /** The output directory where CSV files should be saved to. */
+    const std::string CSV_DIR;
+    /** The number of iterations to perform per problem size. */
+    const int iterations_;
+    /** The value of the first problem size dimension run. */
+    const int startDimention_;
+    /** The maximum value of the largest problem size dimension. */
+    const int upperLimit_;
+    /** Whether the CPU kernels should be run. */
+    const bool doCPU_ = true;
+    /** Whether the GPU kernels should be run. */
+    const bool doGPU_ = true;
+    /** The GEMM CPU kernel. */
+  cpu::spgemm_cpu<T> cpu_;
+    /** The GEMM GPU kernel. */
+  gpu::spgemm_gpu<T> gpu_;
+    /** The point at which offloading to GPU (offload once) becomes worthwhile. */
+    cpuGpu_offloadThreshold cpuGpu_once_;
+    /** The point at which offloading to GPU (offload always) becomes worthwhile.
+     */
+    cpuGpu_offloadThreshold cpuGpu_always_;
+    /** The point at which offloading to GPU (unified memory) becomes worthwhile.
+     */
+    cpuGpu_offloadThreshold cpuGpu_unified_;
+    /** The previous problem size's GPU (offload once) performance results. */
+    time_checksum_gflop prev_gpuResult_once;
+    /** The previous problem size's GPU (offload always) performance results. */
+    time_checksum_gflop prev_gpuResult_always;
+    /** The previous problem size's GPU (unified memory) performance results. */
+    time_checksum_gflop prev_gpuResult_unified;
\ No newline at end of file
diff --git a/include/doSpmm.hh b/include/doSpmm.hh
index 2321636..51f3aba 100644
--- a/include/doSpmm.hh
+++ b/include/doSpmm.hh
@@ -12,19 +12,19 @@
 #elif defined CPU_ONEMKL
 // Todo #include "../oneMKL/CPU/spmm.hh"
 #elif defined CPU_AOCL
-// Todo #include "../AOCL/gemm.hh"
+// Todo #include "../AOCL/spmm.hh"
 #elif defined CPU_NVPL
- // Todo #include "../NVPL/gemm.hh"
+ // Todo #include "../NVPL/spmm.hh"
 #elif defined CPU_OPENBLAS
-// Todo #include "../OpenBLAS/gemm.hh"
+// Todo #include "../OpenBLAS/spmm.hh"
 #if defined GPU_CUBLAS
 #include "../cuBLAS/spmm.hh"
 #elif defined GPU_ONEMKL
-// Todo #include "../oneMKL/GPU/gemm.hh"
+// Todo #include "../oneMKL/GPU/spmm.hh"
 #elif defined GPU_ROCBLAS
-// Todo #include "../rocBLAS/gemm.hh"
+// Todo #include "../rocBLAS/spmm.hh"
 /** `T` represents the type of kernel that will be run - i.e. T=float is for
diff --git a/include/kernels/CPU/spgemm.hh b/include/kernels/CPU/spgemm.hh
new file mode 100644
index 0000000..03f897d
--- /dev/null
+++ b/include/kernels/CPU/spgemm.hh
@@ -0,0 +1,56 @@
+#pragma once
+#include "../spgemm.hh"
+namespace cpu {
+ * An abstract class for sparse matrix-dense matrix BLAS kernels
+ */
+template <typename T>
+class spgemm : public :: spgemm<T> {
+    using ::spgemm<T>::spgemm;
+    using ::spgemm<T>::initInputMatrices;
+    using ::spgemm<T>::iterations_;
+    using ::spgemm<T>::nnz_;
+    using ::spgemm<T>::sparsity_;
+    using ::spgemm<T>::m_;
+    using ::spgemm<T>::n_;
+    using ::spgemm<T>::k_;
+    using ::spgemm<T>::A_;
+    using ::spgemm<T>::B_;
+    using ::spgemm<T>::C_;
+    /**
+     * Initialise the required data structures.
+     * Stores the dimensions and sparsity, derives nnz_ from the m x k size of
+     * A, allocates the dense A_ (m*k), B_ (k*n) and C_ (m*n) buffers and then
+     * fills them via initInputMatrices().
+     * Note the parameter order is (n, m, k). `binary` is not used here.
+     */
+    void initialise(int n, int m, int k, double sparsity,
+                    bool binary = false) {
+      n_ = n;
+      m_ = m;
+      k_ = k;
+      sparsity_ = sparsity;
+      // Always at least one non-zero, plus (1 - sparsity) of A's m*k entries
+      nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
+      A_ = (T*)malloc(sizeof(T) * m_ * k_);
+      B_ = (T*)malloc(sizeof(T) * k_ * n_);
+      // calloc takes (element count, element size); the previous one-argument
+      // call could not compile once this member was instantiated
+      C_ = (T*)calloc((size_t)m_ * (size_t)n_, sizeof(T));
+      initInputMatrices();
+    }
+    /** Do any necessary cleanup (free pointers, close library handles, etc.)
+     * after Kernel has been called. In this class that means releasing the
+     * A_, B_ and C_ buffers with free(). */
+    void postCallKernelCleanup() {
+      free(A_);
+      free(B_);
+      free(C_);
+    }
\ No newline at end of file
diff --git a/include/kernels/CPU/spgmm.hh b/include/kernels/CPU/spgmm.hh
deleted file mode 100644
index 59856ed..0000000
--- a/include/kernels/CPU/spgmm.hh
+++ /dev/null
@@ -1,8 +0,0 @@
-// Created by Alexander Cockrean on 07/01/2025.
diff --git a/include/kernels/CPU/spmm.hh b/include/kernels/CPU/spmm.hh
index d90f48b..c698101 100644
--- a/include/kernels/CPU/spmm.hh
+++ b/include/kernels/CPU/spmm.hh
@@ -14,7 +14,6 @@ class spmm : public ::spmm<T> {
   using ::spmm<T>::spmm;
   using ::spmm<T>::initInputMatrices;
-  using ::spmm<T>::toCSR_int;
   using ::spmm<T>::iterations_;
   using ::spmm<T>::nnzA_;
   using ::spmm<T>::nnzB_;
@@ -29,7 +28,7 @@ public:
   /** Initialise the required data structures. */
   void initialise(int n, int m, int k, double sparsity,
-                          bool binary = false) {
+                  bool binary = false) {
     n_ = n;
     m_ = m;
     k_ = k;
diff --git a/include/kernels/GPU/spgemm.hh b/include/kernels/GPU/spgemm.hh
index 917469b..13aa4b9 100644
--- a/include/kernels/GPU/spgemm.hh
+++ b/include/kernels/GPU/spgemm.hh
@@ -1,8 +1,28 @@
-// Created by Alexander Cockrean on 07/01/2025.
+#pragma once
+#include "../spgemm.hh"
+namespace gpu {
+/** An abstract class for sparse matrix-dense matrix BLAS kernels that adds a
+ * data offload type on top of the generic ::spgemm<T> base. */
+    template <typename T>
+    class spgemm : public ::spgemm<T> {
+    public:
+        using ::spgemm<T>::spgemm;
+        /** Initialise the required data structures.
+       * `offload` refers to the data offload type:
+       *  - Once:    Move data from host to device before all iterations & move from
+       *             device to host after all iterations
+       *  - Always:  Move data from host to device and device to host each iteration
+       *  - Unified: Initialise data as unified memory; no data movement semantics
+       *             required
+       * `m`, `n`, `k` are the problem dimensions.
+       * `sparsity` is presumably the fraction of zero entries in the sparse
+       * operand - TODO confirm against the implementations.
+       * `binary` has no meaning visible in this header - TODO confirm. */
+        virtual void initialise(gpuOffloadType offload, int m, int n, int k,
+                                double sparsity, bool binary = false) = 0;
+    protected:
+        /** Whether data should be offloaded to/from the GPU each iteration, or just
+         * before & after. Defaults to gpuOffloadType::always. */
+        gpuOffloadType offload_ = gpuOffloadType::always;
+    };
+}  // namespace gpu
\ No newline at end of file
diff --git a/include/kernels/spgemm.hh b/include/kernels/spgemm.hh
index 917469b..eb0594c 100644
--- a/include/kernels/spgemm.hh
+++ b/include/kernels/spgemm.hh
@@ -1,8 +1,130 @@
-// Created by Alexander Cockrean on 07/01/2025.
+#pragma once
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <limits>
+#include <random>
+#include <iostream>
+#include "../utilities.hh"
+* A generic abstract class defining the operation of timing a sparse GEMM
+ * BLAS kernel for n iterations
+template <typename T>
+class spgemm {
+    spgemm(const int iters) : iterations_(iters) {}
+    /** Call the kernel iterations_ times and time the whole sequence.
+     * The timed region covers preLoopRequirements(), every kernel call and
+     * postLoopRequirements(); calcChecksum() and postCallKernelCleanup() run
+     * after the timer has stopped.
+     * Returns {elapsed time in seconds, checksum, 0.0}. */
+    time_checksum_gflop compute() {
+      // Start the timer
+      std::chrono::time_point<std::chrono::high_resolution_clock> startTime =
+              std::chrono::high_resolution_clock::now();
+      // perform tje SPMM calls
+      preLoopRequirements();
+      for (int i = 0; i < iterations_; i++) {
+        callSpmm();
+      }
+      postLoopRequirements();
+      // Stop the timer
+      std::chrono::time_point<std::chrono::high_resolution_clock> endTime =
+              std::chrono::high_resolution_clock::now();
+      std::chrono::duration<double> time_s = endTime - startTime;
+      double checksum = calcChecksum();
+      postCallKernelCleanup();
+      return {time_s.count(), checksum, 0.0};
+    }
+    int64_t nnz_ = 0;
+    /** Performs the steps required before calling the SPMM kernel that
+     * should be timed */
+    virtual void preLoopRequirements() = 0;
+    /** Perform the SPMM kernel. */
+    virtual void callSpmm() = 0;
+    /** Perform any steps required after calling the SPMM kernel that should
+     * be timed */
+    virtual void postLoopRequirements() = 0;
+    /** Do the necessary cleanup after the kernel has been finished that
+     * should not be timed */
+    virtual void postCallKernelCleanup() = 0;
+    /** Calculate a checksum from the result matrix C.
+     * Placeholder for now: C is not read and 0.0 is always returned. */
+    constexpr double calcChecksum() {
+      // Todo -- think about how this can sensibly be done for SPMM
+      return 0.0;
+    }
+    /** Set up the starting matrices */
+    void initInputMatrices() {
+      for (size_t i = 0; i < (m_ * k_); i++) {
+        A_[i] = 0.0;
+      }
+      srand(SEED);
+      for (size_t i = 0; i < (k_ * n_); i++) {
+        B_[i] = (T)((double)(rand() % 100) / 7.0);
+      }
+      for (size_t i = 0; i < (m_ * n_); i++) {
+        C_[i] = (T)0.0;
+      }
+      // Random number generator objects for use in descent
+      std::default_random_engine gen;
+      gen.seed(std::chrono::system_clock::now()
+                       .time_since_epoch().count());
+      std::uniform_real_distribution<double> dist(0.0, 1.0);
+      // Using a=0.45 and b=c=0.22 as default probabilities
+      for (size_t i = 0; i < nnz_; i++) {
+        while (!rMat(A_, k_, 0, k_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
+                     false)) {}
+      }
+      toSparseFormat();
+    }
+    /** Move the matrices into the sparse representation used by the given library */
+    virtual void toSparseFormat() = 0;
+    /** Call the external consume() function on the matrices */
+    void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }/** Recursive function to populate sparse matrices */
+    /** The number of iterations to perform per problem size. */
+    const int iterations_;
+    /** Matrix dimension M. */
+    int m_ = 0;
+    /** Matrix dimension N. */
+    int n_ = 0;
+    /** Matrix dimension K. */
+    int k_ = 0;
+    /** Dense representation of input matrix A. */
+    T* A_;
+    /** Dense representation of input matrix B. */
+    T* B_;
+    /** Dense representation of output matrix C. */
+    T* C_;
+    double sparsity_;
\ No newline at end of file
diff --git a/include/kernels/spmm.hh b/include/kernels/spmm.hh
index 9d45f56..28993c8 100644
--- a/include/kernels/spmm.hh
+++ b/include/kernels/spmm.hh
@@ -76,6 +76,9 @@ protected:
       for (size_t i = 0; i < (k_ * n_); i++) {
         B_[i] = 0.0;
+      for (size_t i = 0; i < (m_ * n_); i++) {
+        C_[i] = 0.0;
+      }
       // Random number generator objects for use in descent
       std::default_random_engine gen;

From 8bc912593093c0f8acf10c0e5059f552ee49e758 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <>
Date: Tue, 14 Jan 2025 11:58:37 +0000
Subject: [PATCH 38/38] Finishing off OneMKL CPU support

 .idea/workspace.xml       |  37 ++++---
 ArmPL/spgemm.hh           |   2 +-
 ArmPL/spgemv.hh           |   4 +-
 ArmPL/spmm.hh             |   2 +-
 cuBLAS/spgemm.hh          |   2 +-
 cuBLAS/spmm.hh            |   2 +-
 include/doSpgemm.hh       |  14 +--
 include/doSpgemv.hh       |   2 +-
 include/doSpmm.hh         |  10 +-
 include/kernels/spgemm.hh |  14 +--
 include/kernels/spgemv.hh |   4 +-
 include/kernels/spmm.hh   |  12 +-
 oneMKL/CPU/spgemm.hh      | 177 +++++++++++++++++++++++++++++
 oneMKL/CPU/spgemv.hh      | 155 ++++++++++++++++++++++++++
 oneMKL/CPU/spmm.hh        | 228 ++++++++++++++++++++++++++++++++++++++
 15 files changed, 613 insertions(+), 52 deletions(-)
 create mode 100644 oneMKL/CPU/spgemm.hh
 create mode 100644 oneMKL/CPU/spgemv.hh
 create mode 100644 oneMKL/CPU/spmm.hh

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 8556bf2..9fb6a86 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -15,18 +15,21 @@
   <component name="ChangeListManager">
-    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Moving spgemv into new format">
-      <change afterPath="$PROJECT_DIR$/ArmPL/spgemm.hh" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/cuBLAS/spgemm.hh" afterDir="false" />
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Finishing off armpl and cusparse kernels">
+      <change afterPath="$PROJECT_DIR$/oneMKL/CPU/spgemm.hh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/oneMKL/CPU/spgemv.hh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/oneMKL/CPU/spmm.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/ArmPL/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/ArmPL/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemv.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/ArmPL/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spmm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/cuBLAS/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spgemm.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/cuBLAS/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spmm.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/include/doSpgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpgemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/doSpgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpgemv.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/include/doSpmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpmm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spgmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/include/kernels/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spgemm.hh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/include/kernels/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spgemv.hh" afterDir="false" />
       <change beforePath="$PROJECT_DIR$/include/kernels/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spmm.hh" afterDir="false" />
     <option name="SHOW_DIALOG" value="false" />
@@ -191,14 +194,6 @@
       <workItem from="1729503392250" duration="1773000" />
       <workItem from="1730878516596" duration="9915000" />
-    <task id="LOCAL-00001" summary="trivial changes">
-      <option name="closed" value="true" />
-      <created>1706261672580</created>
-      <option name="number" value="00001" />
-      <option name="presentableId" value="LOCAL-00001" />
-      <option name="project" value="LOCAL" />
-      <updated>1706261672580</updated>
-    </task>
     <task id="LOCAL-00002" summary="Adding sparse algorithm">
       <option name="closed" value="true" />
@@ -583,7 +578,15 @@
       <option name="project" value="LOCAL" />
-    <option name="localTasksCounter" value="50" />
+    <task id="LOCAL-00050" summary="Finishing off armpl and cusparse kernels">
+      <option name="closed" value="true" />
+      <created>1736437501127</created>
+      <option name="number" value="00050" />
+      <option name="presentableId" value="LOCAL-00050" />
+      <option name="project" value="LOCAL" />
+      <updated>1736437501127</updated>
+    </task>
+    <option name="localTasksCounter" value="51" />
     <servers />
   <component name="TypeScriptGeneratedFilesManager">
@@ -601,7 +604,6 @@
   <component name="VcsManagerConfiguration">
-    <MESSAGE value="Now compiles with fewer runtime errors" />
     <MESSAGE value="Implementing other offload types - still some runtime errors" />
     <MESSAGE value="All implemented and running.  No checksum at the end" />
     <MESSAGE value="All three offload types working for large problem sizes" />
@@ -626,6 +628,7 @@
     <MESSAGE value="Getting rid of old oneMKL sparse file" />
     <MESSAGE value="Refactoring to make individual files relate to a single kernel" />
     <MESSAGE value="Moving spgemv into new format" />
-    <option name="LAST_COMMIT_MESSAGE" value="Moving spgemv into new format" />
+    <MESSAGE value="Finishing off armpl and cusparse kernels" />
+    <option name="LAST_COMMIT_MESSAGE" value="Finishing off armpl and cusparse kernels" />
\ No newline at end of file
diff --git a/ArmPL/spgemm.hh b/ArmPL/spgemm.hh
index 0f9e81d..85eb117 100644
--- a/ArmPL/spgemm.hh
+++ b/ArmPL/spgemm.hh
@@ -185,7 +185,7 @@ protected:
   /** Make call to the GEMM kernel. */
-  void callGemm() override {
+  void callSpgemm() override {
      * Flow of ARMPL Sparse LA:
diff --git a/ArmPL/spgemv.hh b/ArmPL/spgemv.hh
index 5045062..e64a665 100644
--- a/ArmPL/spgemv.hh
+++ b/ArmPL/spgemv.hh
@@ -78,8 +78,6 @@ class spgemv_cpu : public spgemv<T> {
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
   void preLoopRequirements() override {
-    // Need to put A_ and B_ into A_armpl_ and B_armpl_
-    toCSR_armpl();
     /** providing hints to ARMPL and optimizing the matrix datastructures */
     // TODO -- is noallocs best here?
@@ -162,7 +160,7 @@ class spgemv_cpu : public spgemv<T> {
     flags_ = 0;
     // Move A to CSR
-    A_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
+    A_armpl_row_ptr_ = new armpl_int_t[m_ + 1];
     A_armpl_col_index_ = new armpl_int_t[nnz_];
     A_vals_ = new T[nnz_];
     A_armpl_row_ptr_[0] = 0;
diff --git a/ArmPL/spmm.hh b/ArmPL/spmm.hh
index 9680f09..889cb23 100644
--- a/ArmPL/spmm.hh
+++ b/ArmPL/spmm.hh
@@ -182,7 +182,7 @@ class spmm_cpu : public spmm<T> {
   /** Make call to the GEMM kernel. */
-  void callGemm() override {
+  void callSpmm() override {
      * Flow of ARMPL Sparse LA:
diff --git a/cuBLAS/spgemm.hh b/cuBLAS/spgemm.hh
index d4233fd..73e1dfb 100644
--- a/cuBLAS/spgemm.hh
+++ b/cuBLAS/spgemm.hh
@@ -180,7 +180,7 @@ private:
-  void callGemm() override {
+  void callSpgemm() override {
     switch(offload_) {
       case gpuOffloadType::always: {
         // Clean up old descriptors
diff --git a/cuBLAS/spmm.hh b/cuBLAS/spmm.hh
index 249f1ea..8db845a 100644
--- a/cuBLAS/spmm.hh
+++ b/cuBLAS/spmm.hh
@@ -242,7 +242,7 @@ class spmm_gpu : public spmm<T> {
   /** Make a call to the BLAS Library Kernel. */
-  void callGemm() override {
+  void callSpmm() override {
     switch(offload_) {
       case gpuOffloadType::always: {
         if (C_mem_allocated_always_) {
diff --git a/include/doSpgemm.hh b/include/doSpgemm.hh
index b8d1d9b..be3a77b 100644
--- a/include/doSpgemm.hh
+++ b/include/doSpgemm.hh
@@ -9,7 +9,7 @@
 #if defined CPU_ARMPL
 #include "../ArmPL/spgemm.hh"
 #elif defined CPU_ONEMKL
-// Todo #include "../oneMKL/CPU/spgemm.hh"
+#include "../oneMKL/CPU/spgemm.hh"
 #elif defined CPU_AOCL
 // Todo #include "../AOCL/spgemm.hh"
 #elif defined CPU_NVPL
@@ -38,10 +38,10 @@ public:
              const int upperlimit, const bool cpuEnabled = true,
              const bool gpuEnabled = true)
           : CSV_DIR(csvDir),
-            iterations_(iterations),
+            iterations_(iters),
-            upperLimit_(upperLimit),
-            doCPU_(cpuEnables),
+            upperLimit_(upperlimit),
+            doCPU_(cpuEnabled),
@@ -52,7 +52,7 @@ public:
-      static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>) &&
+      static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) &&
       "ERROR - doGemm can only be constructed using one of the "
       "following types: [float, double].");
@@ -313,12 +313,12 @@ private:
 // Perform CPU kernel
       if (doCPU_) {
-      cpu_.initialise(M, N, K);
+      cpu_.initialise(M, N, K, 0.99);
       cpuResult = cpu_.compute();
       cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
       // Write result to CSV file
       writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize,
-                     0.0, iterations_, cpuResult.runtime, cpuResult.gflops);
+                     0.99, iterations_, cpuResult.runtime, cpuResult.gflops);
diff --git a/include/doSpgemv.hh b/include/doSpgemv.hh
index c2c6a3d..3162736 100644
--- a/include/doSpgemv.hh
+++ b/include/doSpgemv.hh
@@ -9,7 +9,7 @@
 #if defined CPU_ARMPL
 #include "../ArmPL/spgemv.hh"
 #elif defined CPU_ONEMKL
-// Todo #include "../oneMKL/CPU/spgemv.hh"
+#include "../oneMKL/CPU/spgemv.hh"
 #elif defined CPU_AOCL
 // Todo #include "../AOCL/spgemv.hh"
 #elif defined CPU_NVPL
diff --git a/include/doSpmm.hh b/include/doSpmm.hh
index 51f3aba..3ac1e66 100644
--- a/include/doSpmm.hh
+++ b/include/doSpmm.hh
@@ -10,7 +10,7 @@
 #if defined CPU_ARMPL
 #include "../ArmPL/spmm.hh"
 #elif defined CPU_ONEMKL
-// Todo #include "../oneMKL/CPU/spmm.hh"
+#include "../oneMKL/CPU/spmm.hh"
 #elif defined CPU_AOCL
 // Todo #include "../AOCL/spmm.hh"
 #elif defined CPU_NVPL
@@ -236,7 +236,7 @@ private:
     if (doCPU_) {
-      cpu_.initialise(N, sparsity);
+      cpu_.initialise(N, N, N, sparsity);
       time_checksum_gflop cpuResult = cpu_.compute();
       cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
 		  writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize,
@@ -249,19 +249,19 @@ private:
     // - UNIFIED : data passed from host to device (and device to host) as
     //             needed
     if (doGPU_) {
-      gpu_.initialise(gpuOffloadType::unified, N, sparsity);
+      gpu_.initialise(gpuOffloadType::unified, N, N, N, sparsity);
       time_checksum_gflop gpuResult_unified = gpu_.compute();
       gpuResult_unified.gflops =
       calcGflops(flops, iterations_, gpuResult_unified.runtime);
     // - ALWAYS: Offload to/from GPU every iteration
-      gpu_.initialise(gpuOffloadType::always, N, sparsity);
+      gpu_.initialise(gpuOffloadType::always, N, N, N, sparsity);
       time_checksum_gflop gpuResult_always = gpu_.compute();
       gpuResult_always.gflops =
             calcGflops(flops, iterations_, gpuResult_always.runtime);
 		// - ONCE : Offload to/from GPU once before all iterations and once
 		// after
-      gpu_.initialise(gpuOffloadType::once, N, sparsity);
+      gpu_.initialise(gpuOffloadType::once, N, N, N, sparsity);
 		  time_checksum_gflop gpuResult_once = gpu_.compute();
 		  gpuResult_once.gflops =
 						calcGflops(flops, iterations_, gpuResult_once.runtime);
diff --git a/include/kernels/spgemm.hh b/include/kernels/spgemm.hh
index eb0594c..3aacf77 100644
--- a/include/kernels/spgemm.hh
+++ b/include/kernels/spgemm.hh
@@ -28,7 +28,7 @@ public:
       // perform tje SPMM calls
       for (int i = 0; i < iterations_; i++) {
-        callSpmm();
+        callSpgemm();
@@ -51,8 +51,8 @@ private:
      * should be timed */
     virtual void preLoopRequirements() = 0;
-    /** Perform the SPMM kernel. */
-    virtual void callSpmm() = 0;
+    /** Perform the sparse GEMM kernel. */
+    virtual void callSpgemm() = 0;
     /** Perform any steps required after calling the SPMM kernel that should
      * be timed */
@@ -71,16 +71,16 @@ private:
     /** Set up the starting matrices */
     void initInputMatrices() {
-      for (size_t i = 0; i < (m_ * k_); i++) {
+      for (int i = 0; i < (m_ * k_); i++) {
         A_[i] = 0.0;
-      for (size_t i = 0; i < (k_ * n_); i++) {
+      for (int i = 0; i < (k_ * n_); i++) {
         B_[i] = (T)((double)(rand() % 100) / 7.0);
-      for (size_t i = 0; i < (m_ * n_); i++) {
+      for (int i = 0; i < (m_ * n_); i++) {
         C_[i] = (T)0.0;
@@ -91,7 +91,7 @@ protected:
       std::uniform_real_distribution<double> dist(0.0, 1.0);
       // Using a=0.45 and b=c=0.22 as default probabilities
-      for (size_t i = 0; i < nnz_; i++) {
+      for (int i = 0; i < nnz_; i++) {
         while (!rMat(A_, k_, 0, k_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
                      false)) {}
diff --git a/include/kernels/spgemv.hh b/include/kernels/spgemv.hh
index 297b406..b07be26 100644
--- a/include/kernels/spgemv.hh
+++ b/include/kernels/spgemv.hh
@@ -72,7 +72,7 @@ private:
     void initInputMatrixVector() {
       // Initialise matric to
-      for (size_t i = 0; i < (n_ * m_); i++) {
+      for (int i = 0; i < (n_ * m_); i++) {
         A_[i] = 0.0;
@@ -83,7 +83,7 @@ protected:
       std::uniform_real_distribution<double> dist(0.0, 1.0);
       // Using a=0.45 and b=c=0.22 as default probabilities
-      for (size_t i = 0; i < nnz_; i++) {
+      for (int i = 0; i < nnz_; i++) {
         while (!rMat(A_, m_, 0, n_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
                      false)) {}
diff --git a/include/kernels/spmm.hh b/include/kernels/spmm.hh
index 28993c8..8dbb501 100644
--- a/include/kernels/spmm.hh
+++ b/include/kernels/spmm.hh
@@ -1,4 +1,4 @@
-#pragma one
+#pragma once
 #include <algorithm>
 #include <chrono>
@@ -70,13 +70,13 @@ private:
     /** Set up the starting matrices */
     void initInputMatrices() {
-      for (size_t i = 0; i < (m_ * k_); i++) {
+      for (int i = 0; i < (m_ * k_); i++) {
         A_[i] = 0.0;
-      for (size_t i = 0; i < (k_ * n_); i++) {
+      for (int i = 0; i < (k_ * n_); i++) {
         B_[i] = 0.0;
-      for (size_t i = 0; i < (m_ * n_); i++) {
+      for (int i = 0; i < (m_ * n_); i++) {
         C_[i] = 0.0;
@@ -87,11 +87,11 @@ protected:
       std::uniform_real_distribution<double> dist(0.0, 1.0);
       // Using a=0.45 and b=c=0.22 as default probabilities
-      for (size_t i = 0; i < nnzA_; i++) {
+      for (int i = 0; i < nnzA_; i++) {
         while (!rMat(A_, k_, 0, k_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
                      false)) {}
-      for (size_t i = 0; i < nnzB_; i++) {
+      for (int i = 0; i < nnzB_; i++) {
         while (!rMat(B_, n_, 0, n_ - 1, 0, k_ - 1, 0.45, 0.22, 0.22, &gen, dist,
                      false)) {}
diff --git a/oneMKL/CPU/spgemm.hh b/oneMKL/CPU/spgemm.hh
new file mode 100644
index 0000000..318bdb2
--- /dev/null
+++ b/oneMKL/CPU/spgemm.hh
@@ -0,0 +1,177 @@
+#pragma once
+#ifdef CPU_ONEMKL
+#include <mkl.h>
+#include <algorithm>
+#include "../../include/kernels/CPU/spgemm.hh"
+#include "../../include/utilities.hh"
+namespace cpu {
+/** A class for sparse matrix-dense matrix BLAS kernels. */
+template <typename T>
+class spgemm_cpu : public spgemm<T> {
+    // Inherit the base-class constructors, then re-export the members of the
+    // dependent base spgemm<T> that this class refers to, so they can be
+    // named below without a `this->` qualifier.
+    using spgemm<T>::spgemm;
+    using spgemm<T>::callConsume;
+    using spgemm<T>::initInputMatrices;
+    // Problem dimensions (m_, n_, k_) and the dense operand buffers.
+    using spgemm<T>::m_;
+    using spgemm<T>::n_;
+    using spgemm<T>::k_;
+    using spgemm<T>::A_;
+    using spgemm<T>::B_;
+    using spgemm<T>::C_;
+    // Requested sparsity and the non-zero count derived from it.
+    using spgemm<T>::sparsity_;
+    using spgemm<T>::nnz_;
+    /**
+     * Store the problem dimensions, allocate the dense operands and populate
+     * them through initInputMatrices().
+     *
+     * A is allocated as m x k, B as k x n and C as m x n elements. The
+     * non-zero count for A is 1 + floor(m * k * (1 - sparsity)), capped at
+     * m * k.
+     *
+     * @param m        Rows of A and C.
+     * @param n        Columns of B and C.
+     * @param k        Columns of A and rows of B.
+     * @param sparsity Fraction of A's entries intended to be zero.
+     * @param binary   Not read by this implementation.
+     */
+    void initialise(int m, int n, int k, double sparsity,
+                    bool binary = false) {
+      // The inherited m_/n_/k_ size every allocation and loop in this class,
+      // so they have to be set alongside the MKL_INT copies handed to oneMKL.
+      m_ = m;
+      n_ = n;
+      k_ = k;
+      m_mkl_ = m;
+      n_mkl_ = n;
+      k_mkl_ = k;
+      sparsity_ = sparsity;
+      /** Determine the number of non-zero elements in A. */
+      const uint64_t capacity =
+          static_cast<uint64_t>(m_) * static_cast<uint64_t>(k_);
+      const uint64_t requested =
+          1 + static_cast<uint64_t>(static_cast<double>(m_) *
+                                    static_cast<double>(k_) *
+                                    (1.0 - sparsity_));
+      // A matrix cannot hold more non-zeros than it has entries; without the
+      // cap a sparsity of 0 would request m * k + 1 of them.
+      nnz_ = std::min(requested, capacity);
+      // 64-byte aligned buffers, released with mkl_free() in
+      // postCallKernelCleanup().
+      A_ = static_cast<T*>(mkl_malloc(sizeof(T) * m_ * k_, 64));
+      B_ = static_cast<T*>(mkl_malloc(sizeof(T) * k_ * n_, 64));
+      C_ = static_cast<T*>(mkl_malloc(sizeof(T) * m_ * n_, 64));
+      initInputMatrices();
+    }
+    /**
+     * Convert the dense, row-major A_ (m_ x k_) into the four-array CSR form
+     * consumed by mkl_sparse_?_create_csr.
+     *
+     * For every row i, A_rowsb_[i] is the index of its first stored entry and
+     * A_rowse_[i] is one past its last stored entry. A_vals_ and A_cols_ are
+     * sized by nnz_, so A_ must not contain more than nnz_ non-zeros.
+     */
+    void toSparseFormat() override {
+      A_vals_ = new T[nnz_];
+      A_cols_ = new MKL_INT[nnz_];
+      A_rowsb_ = new MKL_INT[m_ + 1];
+      A_rowse_ = new MKL_INT[m_ + 1];
+      MKL_INT nnz_encountered = 0;
+      for (int row = 0; row < m_; row++) {
+        // Row pointers are indexed by the row itself; storing them at
+        // row + 1 would make oneMKL read the previous row's extent.
+        A_rowsb_[row] = nnz_encountered;
+        // size_t arithmetic keeps the flat index from overflowing int.
+        const size_t rowOffset = static_cast<size_t>(row) * k_;
+        for (int col = 0; col < k_; col++) {
+          if (A_[rowOffset + col] != 0.0) {
+            A_cols_[nnz_encountered] = col;
+            A_vals_[nnz_encountered] = static_cast<T>(A_[rowOffset + col]);
+            nnz_encountered++;
+          }
+        }
+        A_rowse_[row] = nnz_encountered;
+      }
+      // Give the spare trailing slot of each array a defined value.
+      A_rowsb_[m_] = nnz_encountered;
+      A_rowse_[m_] = nnz_encountered;
+    }
+    /**
+     * Run one sparse-A times dense-B product through oneMKL, abort if the
+     * library reports a failure, then pass control to callConsume().
+     */
+    void callSpgemm() override {
+      /**
+       * Using:
+       * sparse_status_t mkl_sparse_s_mm (
+       *    const sparse_operation_t operation,
+       *    const float alpha,
+       *    const sparse_matrix_t A,
+       *    const struct matrix_descr descr,
+       *    const sparse_layout_t layout,
+       *    const float *B,
+       *    const MKL_INT columns,
+       *    const MKL_INT ldb,
+       *    const float beta,
+       *    float *C,
+       *    const MKL_INT ldc);
+       */
+      if constexpr (std::is_same_v<T, float>) {
+        status_ = mkl_sparse_s_mm(operation_, alpha, A_csr_, description_,
+                                  layout_, B_, n_mkl_, k_mkl_, beta, C_,
+                                  m_mkl_);
+      } else if constexpr (std::is_same_v<T, double>) {
+        status_ = mkl_sparse_d_mm(operation_, alpha, A_csr_, description_,
+                                  layout_, B_, n_mkl_, k_mkl_, beta, C_,
+                                  m_mkl_);
+      } else {
+        // Un-specialised class will not do any work - print error and exit.
+        std::cout << "ERROR - Datatype for OneMKL CPU SpGEMM kernel not "
+                     "supported." << std::endl;
+        exit(1);
+      }
+      // Only reached for float/double; a failed multiply must not be timed
+      // or consumed as if it had produced a result.
+      if (status_ != SPARSE_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+      callConsume();
+    }
+    /**
+     * Build the oneMKL CSR handle for A from the arrays filled in by
+     * toSparseFormat(). Nothing is done for element types other than float
+     * and double.
+     */
+    void preLoopRequirements() override {
+      // Shared failure path: report the status code and terminate.
+      const auto abortOnFailure = [this]() {
+        if (status_ != SPARSE_STATUS_SUCCESS) {
+          std::cout << "ERROR " << status_ << std::endl;
+          exit(1);
+        }
+      };
+      if constexpr (std::is_same_v<T, float>) {
+        status_ = mkl_sparse_s_create_csr(&A_csr_, indexing_, m_, k_, A_rowsb_,
+                                          A_rowse_, A_cols_, A_vals_);
+        abortOnFailure();
+      } else if constexpr (std::is_same_v<T, double>) {
+        status_ = mkl_sparse_d_create_csr(&A_csr_, indexing_, m_, k_, A_rowsb_,
+                                          A_rowse_, A_cols_, A_vals_);
+        abortOnFailure();
+      }
+    }
+    /** Release the oneMKL handle for A; abort if the library reports an error. */
+    void postLoopRequirements() override {
+      status_ = mkl_sparse_destroy(A_csr_);
+      if (status_ == SPARSE_STATUS_SUCCESS) {
+        return;
+      }
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    /**
+     * Return the dense operand buffers obtained from mkl_malloc().
+     * NOTE(review): the CSR arrays created with new[] in toSparseFormat() are
+     * not released here - confirm where they are meant to be freed.
+     */
+    void postCallKernelCleanup() override {
+      T* const denseBuffers[] = {A_, B_, C_};
+      for (T* buffer : denseBuffers) {
+        mkl_free(buffer);
+      }
+    }
+    /** Status returned by the most recent oneMKL sparse call. */
+    sparse_status_t status_ = SPARSE_STATUS_SUCCESS;
+    /** CSR index arrays are zero-based. */
+    sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO;
+    /** A is used as stored, without transposition. */
+    sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE;
+    // Todo -- investigate if other options for description_ improve performance
+    matrix_descr description_ = {SPARSE_MATRIX_TYPE_GENERAL,
+                                 SPARSE_FILL_MODE_LOWER,
+                                 SPARSE_DIAG_NON_UNIT};
+    /** Layout passed to mkl_sparse_?_mm for the dense operands B and C. */
+    sparse_layout_t layout_ = SPARSE_LAYOUT_COLUMN_MAJOR;
+    /** Problem dimensions in the integer type oneMKL expects. */
+    MKL_INT m_mkl_ = 0;
+    MKL_INT n_mkl_ = 0;
+    MKL_INT k_mkl_ = 0;
+    /**
+     * CSR representation of A, allocated in toSparseFormat(). Initialised to
+     * nullptr so an unset array is never mistaken for a valid allocation.
+     */
+    T* A_vals_ = nullptr;
+    MKL_INT* A_cols_ = nullptr;
+    MKL_INT* A_rowsb_ = nullptr;
+    MKL_INT* A_rowse_ = nullptr;
+    /** oneMKL handle wrapping the CSR arrays above. */
+    sparse_matrix_t A_csr_ = nullptr;
+    /** Scalars passed to mkl_sparse_?_mm. */
+    const T alpha = ALPHA;
+    const T beta = BETA;
\ No newline at end of file
diff --git a/oneMKL/CPU/spgemv.hh b/oneMKL/CPU/spgemv.hh
new file mode 100644
index 0000000..bac5e32
--- /dev/null
+++ b/oneMKL/CPU/spgemv.hh
@@ -0,0 +1,155 @@
+#pragma once
+#ifdef CPU_ONEMKL
+#include <mkl.h>
+#include <algorithm>
+#include "../../include/kernels/CPU/spgemv.hh"
+#include "../../include/utilities.hh"
+namespace cpu {
+template <typename T>
+class spgemv_cpu : public spgemv<T> {
+    // Inherit the base-class constructors, then re-export the members of the
+    // dependent base spgemv<T> that this class refers to, so they can be
+    // named below without a `this->` qualifier.
+    using spgemv<T>::spgemv;
+    using spgemv<T>::callConsume;
+    // NOTE(review): include/kernels/spgemv.hh in this patch names its
+    // initialiser initInputMatrixVector - confirm the base included here
+    // really exposes initInputMatrices.
+    using spgemv<T>::initInputMatrices;
+    // Problem dimensions and the dense matrix / vector buffers.
+    using spgemv<T>::m_;
+    using spgemv<T>::n_;
+    using spgemv<T>::A_;
+    using spgemv<T>::x_;
+    using spgemv<T>::y_;
+    // Requested sparsity and the non-zero count derived from it.
+    using spgemv<T>::sparsity_;
+    using spgemv<T>::nnz_;
+    /**
+     * Store the problem dimensions, allocate the dense matrix and vectors and
+     * populate them through initInputMatrices().
+     *
+     * A is allocated as m x n elements, x as n and y as m. The non-zero count
+     * is 1 + floor(m * n * (1 - sparsity)), capped at m * n.
+     *
+     * @param m        Rows of A and length of y.
+     * @param n        Columns of A and length of x.
+     * @param sparsity Fraction of A's entries intended to be zero.
+     */
+    void initialise(int m, int n, double sparsity) {
+      m_ = m;
+      n_ = n;
+      // Keep the MKL_INT copies in step so they are never read uninitialised.
+      m_mkl_ = m;
+      n_mkl_ = n;
+      sparsity_ = sparsity;
+      const uint64_t capacity =
+          static_cast<uint64_t>(m_) * static_cast<uint64_t>(n_);
+      const uint64_t requested =
+          1 + static_cast<uint64_t>(static_cast<double>(m_) *
+                                    static_cast<double>(n_) *
+                                    (1.0 - sparsity_));
+      // A matrix cannot hold more non-zeros than it has entries; without the
+      // cap a sparsity of 0 would request m * n + 1 of them.
+      nnz_ = std::min(requested, capacity);
+      // 64-byte aligned buffers, released with mkl_free() in
+      // postKernelCleanup().
+      A_ = static_cast<T*>(mkl_malloc(sizeof(T) * m_ * n_, 64));
+      x_ = static_cast<T*>(mkl_malloc(sizeof(T) * n_, 64));
+      y_ = static_cast<T*>(mkl_malloc(sizeof(T) * m_, 64));
+      initInputMatrices();
+    }
+    /**
+     * Convert the dense, row-major A_ (m_ x n_) into the four-array CSR form
+     * consumed by mkl_sparse_?_create_csr.
+     *
+     * For every row i, A_rowsb_[i] is the index of its first stored entry and
+     * A_rowse_[i] is one past its last stored entry. A_vals_ and A_cols_ are
+     * sized by nnz_, so A_ must not contain more than nnz_ non-zeros.
+     */
+    void toSparseFormat() override {
+      A_vals_ = new T[nnz_];
+      A_cols_ = new MKL_INT[nnz_];
+      A_rowsb_ = new MKL_INT[m_ + 1];
+      A_rowse_ = new MKL_INT[m_ + 1];
+      MKL_INT nnz_encountered = 0;
+      for (int row = 0; row < m_; row++) {
+        // Row pointers are indexed by the row itself; storing them at
+        // row + 1 would make oneMKL read the previous row's extent.
+        A_rowsb_[row] = nnz_encountered;
+        // size_t arithmetic keeps the flat index from overflowing int.
+        const size_t rowOffset = static_cast<size_t>(row) * n_;
+        for (int col = 0; col < n_; col++) {
+          if (A_[rowOffset + col] != 0.0) {
+            A_cols_[nnz_encountered] = col;
+            A_vals_[nnz_encountered] = static_cast<T>(A_[rowOffset + col]);
+            nnz_encountered++;
+          }
+        }
+        A_rowse_[row] = nnz_encountered;
+      }
+      // Give the spare trailing slot of each array a defined value.
+      A_rowsb_[m_] = nnz_encountered;
+      A_rowse_[m_] = nnz_encountered;
+    }
+    /**
+     * Run one sparse matrix-vector product through oneMKL, abort if the
+     * library reports a failure, then pass control to callConsume().
+     */
+    void callGemv() override {
+      /**
+       * sparse_status_t mkl_sparse_s_mv (
+       *    const sparse_operation_t operation,
+       *    const float alpha,
+       *    const sparse_matrix_t A,
+       *    const struct matrix_descr descr,
+       *    const float *x,
+       *    const float beta,
+       *    float *y);
+       * mkl_sparse_d_mv is the double-precision counterpart.
+       */
+      if constexpr (std::is_same_v<T, float>) {
+        status_ = mkl_sparse_s_mv(operation_, alpha, A_csr_, description_, x_,
+                                  beta, y_);
+      } else if constexpr (std::is_same_v<T, double>) {
+        // The double path must use the _d_ routine; testing for float a
+        // second time left this branch unreachable.
+        status_ = mkl_sparse_d_mv(operation_, alpha, A_csr_, description_, x_,
+                                  beta, y_);
+      } else {
+        // Un-specialised class will not do any work - print error and exit.
+        std::cout << "ERROR - Datatype for OneMKL CPU SpGEMV kernel not "
+                     "supported." << std::endl;
+        exit(1);
+      }
+      if (status_ != SPARSE_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+      callConsume();
+    }
+    /**
+     * Build the oneMKL CSR handle for A from the arrays filled in by
+     * toSparseFormat(), then abort if status_ does not report success.
+     */
+    void preLoopRequirements() override {
+      if constexpr (std::is_same_v<T, double>) {
+        status_ = mkl_sparse_d_create_csr(&A_csr_, indexing_, m_, n_, A_rowsb_,
+                                          A_rowse_, A_cols_, A_vals_);
+      } else if constexpr (std::is_same_v<T, float>) {
+        status_ = mkl_sparse_s_create_csr(&A_csr_, indexing_, m_, n_, A_rowsb_,
+                                          A_rowse_, A_cols_, A_vals_);
+      }
+      const bool creationFailed = (status_ != SPARSE_STATUS_SUCCESS);
+      if (creationFailed) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    }
+    /** Release the oneMKL handle for A; abort if the library reports an error. */
+    void postLoopRequirements() override {
+      status_ = mkl_sparse_destroy(A_csr_);
+      if (status_ == SPARSE_STATUS_SUCCESS) {
+        return;
+      }
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    /**
+     * Return the dense matrix and vector buffers obtained from mkl_malloc().
+     * NOTE(review): the CSR arrays created with new[] in toSparseFormat() are
+     * not released here - confirm where they are meant to be freed.
+     */
+    void postKernelCleanup() override {
+      T* const denseBuffers[] = {A_, x_, y_};
+      for (T* buffer : denseBuffers) {
+        mkl_free(buffer);
+      }
+    }
+    /** Status returned by the most recent oneMKL sparse call. */
+    sparse_status_t status_ = SPARSE_STATUS_SUCCESS;
+    /** CSR index arrays are zero-based. */
+    sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO;
+    /** A is used as stored, without transposition. */
+    sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE;
+    /**
+     * mkl_sparse_?_mv takes a full `struct matrix_descr`, not just the
+     * sparse_matrix_type_t enumerator, so the descriptor is spelled out.
+     */
+    matrix_descr description_ = {SPARSE_MATRIX_TYPE_GENERAL,
+                                 SPARSE_FILL_MODE_LOWER,
+                                 SPARSE_DIAG_NON_UNIT};
+    /** Problem dimensions in the integer type oneMKL expects. */
+    MKL_INT m_mkl_ = 0;
+    MKL_INT n_mkl_ = 0;
+    /**
+     * CSR representation of A, allocated in toSparseFormat(). Initialised to
+     * nullptr so an unset array is never mistaken for a valid allocation.
+     */
+    T* A_vals_ = nullptr;
+    MKL_INT* A_cols_ = nullptr;
+    MKL_INT* A_rowsb_ = nullptr;
+    MKL_INT* A_rowse_ = nullptr;
+    /** oneMKL handle wrapping the CSR arrays above. */
+    sparse_matrix_t A_csr_ = nullptr;
+    /** Scalars passed to mkl_sparse_?_mv. */
+    const T alpha = ALPHA;
+    const T beta = BETA;
diff --git a/oneMKL/CPU/spmm.hh b/oneMKL/CPU/spmm.hh
new file mode 100644
index 0000000..936aeb5
--- /dev/null
+++ b/oneMKL/CPU/spmm.hh
@@ -0,0 +1,228 @@
+#pragma once
+#ifdef CPU_ONEMKL
+#include <mkl.h>
+#include <mkl_spblas.h>
+#include <algorithm>
+#include "../../include/kernels/CPU/spmm.hh"
+#include "../../include/utilities.hh"
+namespace cpu {
+/** A class for sparse matrix-sparse matrix CPU BLAS kernels. */
+template <typename T>
+class spmm_cpu : public spmm<T> {
+    // Inherit the base-class constructors, then re-export the members of the
+    // dependent base spmm<T> that this class refers to, so they can be named
+    // below without a `this->` qualifier.
+    using spmm<T>::spmm;
+    using spmm<T>::initInputMatrices;
+    using spmm<T>::callConsume;
+    // Problem dimensions (m_, n_, k_) and the dense operand buffers.
+    using spmm<T>::m_;
+    using spmm<T>::n_;
+    using spmm<T>::k_;
+    using spmm<T>::A_;
+    using spmm<T>::B_;
+    using spmm<T>::C_;
+    // Requested sparsity and the per-matrix non-zero counts derived from it.
+    using spmm<T>::sparsity_;
+    using spmm<T>::nnzA_;
+    using spmm<T>::nnzB_;
+    /**
+     * Store the problem dimensions, allocate the dense operands and populate
+     * them through initInputMatrices().
+     *
+     * A is allocated as m x k, B as k x n and C as m x n elements. Each
+     * non-zero count is 1 + floor(rows * cols * (1 - sparsity)), capped at
+     * rows * cols.
+     *
+     * @param m        Rows of A and C.
+     * @param n        Columns of B and C.
+     * @param k        Columns of A and rows of B.
+     * @param sparsity Fraction of A's and B's entries intended to be zero.
+     * @param binary   Not read by this implementation.
+     */
+    void initialise(int m, int n, int k, double sparsity,
+                    bool binary = false) {
+      // The inherited m_/n_/k_ size every allocation and loop in this class,
+      // so they have to be set alongside the MKL_INT copies.
+      m_ = m;
+      n_ = n;
+      k_ = k;
+      m_mkl_ = m;
+      n_mkl_ = n;
+      k_mkl_ = k;
+      sparsity_ = sparsity;
+      /** Determine the number of nnz elements in A and B */
+      // A matrix cannot hold more non-zeros than it has entries; without the
+      // cap a sparsity of 0 would request rows * cols + 1 of them.
+      const auto nonZeroCount = [this](int rows, int cols) {
+        const uint64_t capacity =
+            static_cast<uint64_t>(rows) * static_cast<uint64_t>(cols);
+        const uint64_t requested =
+            1 + static_cast<uint64_t>(static_cast<double>(rows) *
+                                      static_cast<double>(cols) *
+                                      (1.0 - sparsity_));
+        return std::min(requested, capacity);
+      };
+      nnzA_ = nonZeroCount(m_, k_);
+      nnzB_ = nonZeroCount(k_, n_);
+      // 64-byte aligned buffers, released with mkl_free() in
+      // postCallKernelCleanup().
+      A_ = static_cast<T*>(mkl_malloc(sizeof(T) * m_ * k_, 64));
+      B_ = static_cast<T*>(mkl_malloc(sizeof(T) * k_ * n_, 64));
+      C_ = static_cast<T*>(mkl_malloc(sizeof(T) * m_ * n_, 64));
+      initInputMatrices();
+    }
+    /**
+     * Convert the dense, row-major A_ (m_ x k_) and B_ (k_ x n_) into the
+     * four-array CSR form consumed by mkl_sparse_?_create_csr.
+     *
+     * For every row i of a matrix, rowsb[i] is the index of its first stored
+     * entry and rowse[i] is one past its last stored entry. The value and
+     * column arrays are sized by nnzA_ / nnzB_, so neither dense matrix may
+     * contain more non-zeros than its count.
+     */
+    void toSparseFormat() override {
+      // One conversion routine shared by both operands.
+      const auto denseToCsr = [](const T* dense, const auto rows,
+                                 const auto cols, const auto nnz, T*& vals,
+                                 MKL_INT*& colIdx, MKL_INT*& rowsBegin,
+                                 MKL_INT*& rowsEnd) {
+        vals = new T[nnz];
+        colIdx = new MKL_INT[nnz];
+        rowsBegin = new MKL_INT[rows + 1];
+        rowsEnd = new MKL_INT[rows + 1];
+        MKL_INT nnz_encountered = 0;
+        for (int row = 0; row < rows; row++) {
+          // Row pointers are indexed by the row itself; storing them at
+          // row + 1 would make oneMKL read the previous row's extent.
+          rowsBegin[row] = nnz_encountered;
+          // size_t arithmetic keeps the flat index from overflowing int.
+          const size_t rowOffset = static_cast<size_t>(row) * cols;
+          for (int col = 0; col < cols; col++) {
+            if (dense[rowOffset + col] != 0.0) {
+              colIdx[nnz_encountered] = col;
+              vals[nnz_encountered] = static_cast<T>(dense[rowOffset + col]);
+              nnz_encountered++;
+            }
+          }
+          rowsEnd[row] = nnz_encountered;
+        }
+        // Give the spare trailing slot of each array a defined value.
+        rowsBegin[rows] = nnz_encountered;
+        rowsEnd[rows] = nnz_encountered;
+      };
+      denseToCsr(A_, m_, k_, nnzA_, A_vals_, A_cols_, A_rowsb_, A_rowse_);
+      denseToCsr(B_, k_, n_, nnzB_, B_vals_, B_cols_, B_rowsb_, B_rowse_);
+    }
+    /**
+     * Run one sparse-A times sparse-B product through oneMKL, abort if the
+     * library reports a failure, then pass control to callConsume().
+     *
+     * NOTE(review): every call stores a result handle in C_csr_; if this runs
+     * more than once between preLoopRequirements() and
+     * postLoopRequirements(), confirm earlier results are released.
+     */
+    void callSpmm() override {
+      /**
+       * sparse_status_t mkl_sparse_spmm (
+       *  const sparse_operation_t operation,
+       *  const sparse_matrix_t A,
+       *  const sparse_matrix_t B,
+       *  sparse_matrix_t *C);
+       */
+      status_ = mkl_sparse_spmm(operation_, A_csr_, B_csr_, &C_csr_);
+      const bool multiplyFailed = (status_ != SPARSE_STATUS_SUCCESS);
+      if (multiplyFailed) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+      callConsume();
+    }
+    /**
+     * Build the oneMKL CSR handles for A (m_ x k_) and B (k_ x n_) from the
+     * arrays filled in by toSparseFormat(). Each creation is checked before
+     * the next one starts. Nothing is done for element types other than
+     * float and double.
+     */
+    void preLoopRequirements() override {
+      // Shared failure path: report the status code and terminate.
+      const auto abortOnFailure = [this]() {
+        if (status_ != SPARSE_STATUS_SUCCESS) {
+          std::cout << "ERROR " << status_ << std::endl;
+          exit(1);
+        }
+      };
+      if constexpr (std::is_same_v<T, float>) {
+        status_ = mkl_sparse_s_create_csr(&A_csr_, indexing_, m_, k_, A_rowsb_,
+                                          A_rowse_, A_cols_, A_vals_);
+        abortOnFailure();
+        status_ = mkl_sparse_s_create_csr(&B_csr_, indexing_, k_, n_, B_rowsb_,
+                                          B_rowse_, B_cols_, B_vals_);
+        abortOnFailure();
+      } else if constexpr (std::is_same_v<T, double>) {
+        status_ = mkl_sparse_d_create_csr(&A_csr_, indexing_, m_, k_, A_rowsb_,
+                                          A_rowse_, A_cols_, A_vals_);
+        abortOnFailure();
+        status_ = mkl_sparse_d_create_csr(&B_csr_, indexing_, k_, n_, B_rowsb_,
+                                          B_rowse_, B_cols_, B_vals_);
+        abortOnFailure();
+      }
+    }
+    /**
+     * Release the oneMKL handles for A, B and C in that order, aborting at
+     * the first one the library fails to destroy.
+     */
+    void postLoopRequirements() override {
+      const sparse_matrix_t handles[] = {A_csr_, B_csr_, C_csr_};
+      for (const sparse_matrix_t handle : handles) {
+        status_ = mkl_sparse_destroy(handle);
+        if (status_ != SPARSE_STATUS_SUCCESS) {
+          std::cout << "ERROR " << status_ << std::endl;
+          exit(1);
+        }
+      }
+    }
+    /**
+     * Return the dense operand buffers obtained from mkl_malloc().
+     * NOTE(review): the CSR arrays created with new[] in toSparseFormat() are
+     * not released here - confirm where they are meant to be freed.
+     */
+    void postCallKernelCleanup() override {
+      T* const denseBuffers[] = {A_, B_, C_};
+      for (T* buffer : denseBuffers) {
+        mkl_free(buffer);
+      }
+    }
+    /** Status returned by the most recent oneMKL sparse call. */
+    sparse_status_t status_ = SPARSE_STATUS_SUCCESS;
+    /** CSR index arrays are zero-based. */
+    sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO;
+    /** Operands are used as stored, without transposition. */
+    sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE;
+    /** Problem dimensions in the integer type oneMKL expects. */
+    MKL_INT m_mkl_ = 0;
+    MKL_INT n_mkl_ = 0;
+    MKL_INT k_mkl_ = 0;
+    /**
+     * CSR representations of A and B, allocated in toSparseFormat().
+     * Initialised to nullptr so an unset array is never mistaken for a valid
+     * allocation.
+     */
+    T* A_vals_ = nullptr;
+    MKL_INT* A_cols_ = nullptr;
+    MKL_INT* A_rowsb_ = nullptr;
+    MKL_INT* A_rowse_ = nullptr;
+    T* B_vals_ = nullptr;
+    MKL_INT* B_cols_ = nullptr;
+    MKL_INT* B_rowsb_ = nullptr;
+    MKL_INT* B_rowse_ = nullptr;
+    // NOTE(review): the C_* arrays are not referenced by the methods in this
+    // class as shown - confirm whether they are still needed.
+    T* C_vals_ = nullptr;
+    MKL_INT* C_cols_ = nullptr;
+    MKL_INT* C_rowsb_ = nullptr;
+    MKL_INT* C_rowse_ = nullptr;
+    /** oneMKL handles: A and B wrap the arrays above, C is the product. */
+    sparse_matrix_t A_csr_ = nullptr;
+    sparse_matrix_t B_csr_ = nullptr;
+    sparse_matrix_t C_csr_ = nullptr;
+    // NOTE(review): alpha and beta are not passed to mkl_sparse_spmm in the
+    // methods shown - confirm whether they are still needed.
+    const T alpha = ALPHA;
+    const T beta = BETA;
\ No newline at end of file