From c1a3cd7acba859d9df200e557bd3454dc93c1abf Mon Sep 17 00:00:00 2001
From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com>
Date: Fri, 11 Oct 2024 15:15:21 +0100
Subject: [PATCH 01/38] rebasing

---
 src/main.cc | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/main.cc b/src/main.cc
index 2d046e3..c61df37 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -1,7 +1,6 @@
 #include "../include/main.hh"
 
 int iters = 10;
-int startDim = 1;
 int upperLimit = 128;
 
 bool doCpu = CPU_ENABLED;
@@ -141,6 +140,32 @@ void getParameters(int argc, char* argv[]) {
       doCpu = false;
     } else if (!strcmp(argv[i], "--no_gpu")) {
       doGpu = false;
+    } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) {
+      sgemm = dgemm = sp_sgemm = sp_dgemm = false;
+      std::string kernelList = argv[++i];
+      if (kernelList.find("sp-sgemm") != std::string::npos) {
+        sp_sgemm = true;
+        if (kernelList.find("sgemm") != std::string::npos &&
+            kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) {
+          sgemm = true;
+        }
+      } else if (kernelList.find("sgemm") != std::string::npos) {
+        sgemm = true;
+      }
+      if (kernelList.find("sp-dgemm") != std::string::npos) {
+        sp_dgemm = true;
+        if (kernelList.find("dgemm") != std::string::npos &&
+            kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) {
+          dgemm = true;
+        }
+      } else if (kernelList.find("dgemm") != std::string::npos) {
+        dgemm = true;
+      }
+
+      if (!sgemm && !dgemm && !sp_sgemm && !sp_dgemm) {
+        std::cout << "ERROR - no implemented kernels in list" << std::endl;
+        exit(1);
+      }
     } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) {
       if (++i >= argc) {
         std::cout << "ERROR - Invalid output directory" << std::endl;

From 21366b4359101379b640faf814173620f0635e4d Mon Sep 17 00:00:00 2001
From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com>
Date: Fri, 11 Oct 2024 15:22:26 +0100
Subject: [PATCH 02/38] rebasing

---
 DefaultCPU/sp_gemm.hh          |  55 ++++++
 DefaultGPU/sp_gemm.hh          |  54 ++++++
 cuBLAS/sp_gemm.hh              | 295 +++++++++++++++++++++++++++++++++
 include/doGemm.hh              |  94 +++++++++--
 include/kernels/CPU/sp_gemm.hh | 110 ++++++++++++
 include/kernels/GPU/sp_gemm.hh |  27 +++
 src/main.cc                    |   4 +
 7 files changed, 626 insertions(+), 13 deletions(-)
 create mode 100644 DefaultCPU/sp_gemm.hh
 create mode 100644 DefaultGPU/sp_gemm.hh
 create mode 100644 cuBLAS/sp_gemm.hh
 create mode 100644 include/kernels/CPU/sp_gemm.hh
 create mode 100644 include/kernels/GPU/sp_gemm.hh

diff --git a/DefaultCPU/sp_gemm.hh b/DefaultCPU/sp_gemm.hh
new file mode 100644
index 0000000..d7ecb37
--- /dev/null
+++ b/DefaultCPU/sp_gemm.hh
@@ -0,0 +1,55 @@
+#pragma once
+
+#if defined CPU_DEFAULT
+
+#include "../include/kernels/CPU/sp_gemm.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+/** A class for GEMM CPU BLAS kernels. */
+template <typename T>
+class sp_gemm_cpu : public sp_gemm<T> {
+ public:
+  using sp_gemm<T>::sp_gemm;
+  using sp_gemm<T>::callConsume;
+  using sp_gemm<T>::m_;
+  using sp_gemm<T>::n_;
+  using sp_gemm<T>::k_;
+  using sp_gemm<T>::A_;
+  using sp_gemm<T>::B_;
+  using sp_gemm<T>::C_;
+
+ private:
+  /** Perform the GEMM kernel. */
+  void callGemm() override {
+    /** A naive implementation of a column-major GEMM. Alpha and Beta are always
+     * 1 and 0 respectively.
+     * Operation takes the form of C[M,N] = A[M,K] * B[K,N].
+     * callConsume() is required to ensure that the compiler does not optimise
+     * away this function.
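+     * For example, with m_ = n_ = k_ = 2 the first element computed is
+     * C_[0] = A_[0] * B_[0] + A_[2] * B_[1]: row 0 of A (read with stride
+     * m_) dotted with column 0 of B (contiguous in column-major storage).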
*/ + int x, y, z; + T acc; + for (x = 0; x < m_; x++) { + for (y = 0; y < n_; y++) { + acc = 0.0; + for (z = 0; z < k_; z++) { + acc += A_[z * m_ + x] * B_[y * k_ + z]; + } + C_[y * m_ + x] = acc; + } + } + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override {} + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override {} +}; + +} // namespace cpu +#endif diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh new file mode 100644 index 0000000..92d157c --- /dev/null +++ b/DefaultGPU/sp_gemm.hh @@ -0,0 +1,54 @@ +#pragma once + +#if defined GPU_DEFAULT + +#include <cmath> + +#include "../include/kernels/GPU/sp_gemm.hh" +#include "../include/utilities.hh" + +namespace gpu { +/** A class for GEMM GPU BLAS kernels. */ +template <typename T> +class sp_gemm_gpu : public sp_gemm<T> { + public: + using sp_gemm<T>::sp_gemm; + + /** Call the BLAS kernel n times, with 1 warmup run. + * Returns the time elapsed for n BLAS calls in seconds. */ + time_checksum_gflop compute() { + // Override function in base `kernel` class as DefaultGPU should do nothing. + return {INFINITY, INFINITY, 0.0}; + } + + /** Initialise the required data structures. */ + void initialise(gpuOffloadType offload, int m, int n, int k) override { + // Default GPU implementation - do nothing. + } + + private: + /** Make a call to the BLAS Library Kernel. */ + void callGemm() override { + // Default GPU implementation - do nothing. + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override { + // Default GPU implementation - do nothing. + } + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override { + // Default GPU implementation - do nothing. + } + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() override { + // Default GPU implementation - do nothing. + } +}; +} // namespace gpu +#endif \ No newline at end of file diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh new file mode 100644 index 0000000..3a9cff0 --- /dev/null +++ b/cuBLAS/sp_gemm.hh @@ -0,0 +1,295 @@ +#pragma once + +#ifdef GPU_CUBLAS +#include <cublas_v2.h> +#include <cuda_runtime.h> + +#include "../include/kernels/GPU/gemm.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +/** A class for GEMM GPU BLAS kernels. */ +template <typename T> +class sp_gemm_gpu : public gemm<T> { + public: + using gemm<T>::gemm; + using gemm<T>::m_; + using gemm<T>::n_; + using gemm<T>::k_; + using gemm<T>::A_; + using gemm<T>::B_; + using gemm<T>::C_; + using gemm<T>::offload_; + + /** Initialise the required data structures. 
+ * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + void initialise(gpuOffloadType offload, int m, int n, int k) override { + offload_ = offload; + + m_ = m; + n_ = n; + k_ = k; + + // Create a handle for CUBLAS + cublasCreate(&handle_); + + // Get device identifier + cudaCheckError(cudaGetDevice(&gpuDevice_)); + + // Initialise 3 streams to asynchronously move data between host and device + cudaCheckError(cudaStreamCreate(&s1_)); + cudaCheckError(cudaStreamCreate(&s2_)); + cudaCheckError(cudaStreamCreate(&s3_)); + + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * k_)); + cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_)); + cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_)); + } else { + // Allocate matrices on host + A_ = (T*)malloc(sizeof(T) * m_ * k_); + B_ = (T*)malloc(sizeof(T) * k_ * n_); + C_ = (T*)malloc(sizeof(T) * m_ * n_); + // Allocate matrices on device + cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * k_)); + cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * k_ * n_)); + cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * m_ * n_)); + } + + // Initialise the host matricies + srand(SEED); + for (int y = 0; y < m_; y++) { + for (int x = 0; x < k_; x++) { + A_[y * k_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); + } + } + for (int y = 0; y < k_; y++) { + for (int x = 0; x < n_; x++) { + B_[y * n_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); + } + } + } + + private: + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + // Offload data each iteration - no requirements + break; + } + case gpuOffloadType::once: { + // Offload data from host to the device. + cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_, + cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_, + cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, + cudaMemcpyHostToDevice, s3_)); + break; + } + case gpuOffloadType::unified: { + // Prefetch memory to device + cudaCheckError( + cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, gpuDevice_, s1_)); + cudaCheckError( + cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, gpuDevice_, s2_)); + cudaCheckError( + cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, gpuDevice_, s3_)); + break; + } + } + } + + /** Make a call to the BLAS Library Kernel. */ + void callGemm() override { + switch (offload_) { + case gpuOffloadType::always: { + // Offload data from host to the device. 
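+        // The three copies below go out on separate streams (s1_-s3_) so the
+        // transfers can overlap. The cuBLAS call that follows is issued on
+        // the handle's default (legacy) stream which, for streams created
+        // with cudaStreamCreate, implicitly waits for this queued work.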
+ cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_, + cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_, + cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, + cudaMemcpyHostToDevice, s3_)); + // Call cuBLAS GEMM kernel + if constexpr (std::is_same_v<T, float>) { + cublasStatus_t stat = + cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v<T, double>) { + cublasStatus_t stat = + cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } + // Offload data from device to host + cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_, + cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_, + cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_, + cudaMemcpyDeviceToHost, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + // Call cuBLAS GEMM kernel + if constexpr (std::is_same_v<T, float>) { + cublasStatus_t stat = + cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v<T, double>) { + cublasStatus_t stat = + cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, + A_device_, std::max(1, m_), B_device_, + std::max(1, k_), &beta, C_device_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } + break; + } + case gpuOffloadType::unified: { + // Call cuBLAS GEMM kernel + if constexpr (std::is_same_v<T, float>) { + cublasStatus_t stat = cublasSgemm( + handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, + std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v<T, double>) { + cublasStatus_t stat = cublasDgemm( + handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, + std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); + if (stat != CUBLAS_STATUS_SUCCESS) { + std::cout << "cuBLAS error:" << stat << std::endl; + exit(1); + } + } + break; + } + } + } + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. 
*/
+  void postLoopRequirements() override {
+    switch (offload_) {
+      case gpuOffloadType::always: {
+        // Offload data each iteration - no requirements
+        break;
+      }
+      case gpuOffloadType::once: {
+        // Offload data from device to host
+        cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_,
+                                       cudaMemcpyDeviceToHost, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_,
+                                       cudaMemcpyDeviceToHost, s2_));
+        cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_,
+                                       cudaMemcpyDeviceToHost, s3_));
+        // Ensure device has finished all work.
+        cudaCheckError(cudaDeviceSynchronize());
+        break;
+      }
+      case gpuOffloadType::unified: {
+        // Ensure all data resides on host once work has completed
+        cudaCheckError(cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_,
+                                            cudaCpuDeviceId, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_,
+                                            cudaCpuDeviceId, s2_));
+        cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_,
+                                            cudaCpuDeviceId, s3_));
+        // Ensure device has finished all work.
+        cudaCheckError(cudaDeviceSynchronize());
+        break;
+      }
+    }
+  }
+
+  /** Do any necessary cleanup (free pointers, close library handles, etc.)
+   * after Kernel has been called. */
+  void postCallKernelCleanup() override {
+    // Destroy the handle
+    cublasDestroy(handle_);
+
+    // Destroy streams after use
+    cudaCheckError(cudaStreamDestroy(s1_));
+    cudaCheckError(cudaStreamDestroy(s2_));
+    cudaCheckError(cudaStreamDestroy(s3_));
+
+    if (offload_ == gpuOffloadType::unified) {
+      cudaFree(A_);
+      cudaFree(B_);
+      cudaFree(C_);
+    } else {
+      // Free the memory held on host and device
+      free(A_);
+      free(B_);
+      free(C_);
+      cudaFree(A_device_);
+      cudaFree(B_device_);
+      cudaFree(C_device_);
+    }
+  }
+
+  /** Handle used when calling cuBLAS. */
+  cublasHandle_t handle_;
+
+  /** CUDA Stream 1 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s1_;
+
+  /** CUDA Stream 2 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s2_;
+
+  /** CUDA Stream 3 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s3_;
+
+  /** The ID of the target GPU Device. */
+  int gpuDevice_;
+
+  /** Input matrix A, held on the device. */
+  T* A_device_;
+
+  /** Input matrix B, held on the device. */
+  T* B_device_;
+
+  /** Input matrix C, held on the device. */
+  T* C_device_;
+
+  /** The constant value Alpha. */
+  const T alpha = ALPHA;
+
+  /** The constant value Beta.
*/ + const T beta = BETA; +}; +} // namespace gpu +#endif \ No newline at end of file diff --git a/include/doGemm.hh b/include/doGemm.hh index c1aa742..4a7c564 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -20,6 +20,7 @@ #if defined GPU_CUBLAS #include "../cuBLAS/gemm.hh" +#include "../cuBLAS/sp_gemm.hh" #elif defined GPU_ONEMKL #include "../oneMKL/GPU/gemm.hh" #elif defined GPU_ROCBLAS @@ -42,11 +43,13 @@ class doGemm { doGPU_(gpuEnabled) #if CPU_ENABLED , - gemmCpu_(iterations_) + gemmCpu_(iterations_), + spGemmCpu_(iterations_) #endif #if GPU_ENABLED , - gemmGpu_(iterations_) + gemmGpu_(iterations_), + spGemmGpu_(iterations_) #endif { static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) && @@ -68,7 +71,7 @@ class doGemm { "_square_square_M=N=K.csv"); for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = dim, K = dim; - callKernels(csvFile, dim, dim, dim); + callDenseKernels(csvFile, dim, dim, dim); } // Close file csvFile.close(); @@ -94,7 +97,7 @@ class doGemm { int M = 16 * K; int N = 16 * K; while (M <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M += 16; N += 16; K++; @@ -121,7 +124,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = dim, K = 32; - callKernels(csvFile, dim, dim, 32); + callDenseKernels(csvFile, dim, dim, 32); } } // Close file @@ -147,7 +150,7 @@ class doGemm { N = startDimention_; K = 16 * M; while (K <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M++; N++; K += 16; @@ -174,7 +177,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = 32, N = 32, K = dim; - callKernels(csvFile, 32, 32, dim); + callDenseKernels(csvFile, 32, 32, dim); } } // Close file @@ -200,7 +203,7 @@ class doGemm { N = startDimention_; M = 16 * K; while (M <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M += 16; N++; K++; @@ -227,7 +230,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = 32, K = 32; - callKernels(csvFile, dim, 32, 32); + callDenseKernels(csvFile, dim, 32, 32); } } // Close file @@ -253,7 +256,7 @@ class doGemm { K = startDimention_; N = 16 * K; while (N <= upperLimit_) { - callKernels(csvFile, M, N, K); + callDenseKernels(csvFile, M, N, K); M++; N += 16; K++; @@ -280,7 +283,7 @@ class doGemm { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = 32, N = dim, K = 32; - callKernels(csvFile, 32, dim, 32); + callDenseKernels(csvFile, 32, dim, 32); } } // Close file @@ -291,12 +294,27 @@ class doGemm { printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); } #endif + + // Square sparse matrix - sparse matrix multiplication + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + + "_sparse_square.csv"); + if (upperLimit_ >= 32) { + for (int dim = 1; dim <= upperLimit_; dim++) { + const int N = dim; + callSparseKernels(csvFile, N, 0.99); + } + } + // Close file + csvFile.close(); } private: /** Call the appropriate CPU and GPU GEMM kernels. 
*/
-  void callKernels(std::ofstream& csvFile, const int M, const int N,
-                   const int K) {
+  void callDenseKernels(std::ofstream& csvFile, const int M, const int N,
+                        const int K) {
     const double probSize = calcKib(M, N, K);
     const uint64_t flops = calcFlops(M, N, K);
     std::string kernelName = getKernelName();
@@ -488,6 +506,52 @@ class doGemm {
     }
   }
 
+  void callSparseKernels(std::ofstream& csvFile, const int N, const float
+  sparsity) {
+    const double probSize = calcKib(N, N, N);
+    const uint64_t flops = calcFlops(N, N, N);
+    std::string kernelName = getKernelName();
+
+    spGemmCpu_.initialise(N, sparsity);
+    time_checksum_gflop cpuResult = spGemmCpu_.compute();
+    cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
+
+    // Perform the GPU kernels
+    // - ONCE : Offload to/from GPU once before all iterations and once
+    // after
+    spGemmGpu_.initialise(gpuOffloadType::once, N, N, N);
+    time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
+    gpuResult_once.gflops =
+        calcGflops(flops, iterations_, gpuResult_once.runtime);
+
+    // - ALWAYS: Offload to/from GPU every iteration
+    spGemmGpu_.initialise(gpuOffloadType::always, N, N, N);
+    time_checksum_gflop gpuResult_always = spGemmGpu_.compute();
+    gpuResult_always.gflops =
+        calcGflops(flops, iterations_, gpuResult_always.runtime);
+
+    // - UNIFIED : data passed from host to device (and device to host) as
+    // needed
+    spGemmGpu_.initialise(gpuOffloadType::unified, N, N, N);
+    time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
+    gpuResult_unified.gflops =
+        calcGflops(flops, iterations_, gpuResult_unified.runtime);
+
+    // ToDo -- non-default GPU operations
+
+    // Write lines to CSV file
+    writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
+                   cpuResult.runtime, cpuResult.gflops);
+    writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
+                   iterations_, gpuResult_once.runtime, gpuResult_once.gflops);
+    writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
+                   iterations_, gpuResult_always.runtime,
+                   gpuResult_always.gflops);
+    writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
+                   iterations_, gpuResult_unified.runtime,
+                   gpuResult_unified.gflops);
+  }
+
   /** A function for calculating FLOPs performed by a GEMM.
    * C = alpha*AB + beta*C */
   constexpr uint64_t calcFlops(const int M, const int N, const int K) const {
@@ -623,11 +687,15 @@ class doGemm {
   cpu::gemm_cpu<T> gemmCpu_;
 #endif
 
+  /** The sparse GEMM CPU kernel. */
+  cpu::sp_gemm_cpu<T> spGemmCpu_;
+
 #if GPU_ENABLED
   /** The GEMM GPU kernel. */
   gpu::gemm_gpu<T> gemmGpu_;
 #endif
 
+  /** The sparse GEMM GPU kernel. */
+  gpu::sp_gemm_gpu<T> spGemmGpu_;
+
   /** The point at which offloading to GPU (offload once) becomes worthwhile.
    */
   cpuGpu_offloadThreshold cpuGpu_once_;

diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
new file mode 100644
index 0000000..3de5ea5
--- /dev/null
+++ b/include/kernels/CPU/sp_gemm.hh
@@ -0,0 +1,110 @@
+#pragma once
+
+#include "../gemm.hh"
+
+#include <random>
+
+namespace cpu {
+
+/** An abstract class for GEMM BLAS kernels. */
+  template <typename T>
+  class sp_gemm : public ::gemm<T> {
+   public:
+    using ::gemm<T>::gemm;
+    using ::gemm<T>::m_;
+    using ::gemm<T>::n_;
+    using ::gemm<T>::k_;
+    using ::gemm<T>::A_;
+    using ::gemm<T>::B_;
+    using ::gemm<T>::C_;
+
+   public:
+    /** Initialise the required data structures.
*/ + virtual void initialise(int n, double sparsity, bool binary = false) { + n_ = n; + + A_ = (T*)malloc(sizeof(T) * n_ * n_); + B_ = (T*)malloc(sizeof(T) * n_ * n_); + C_ = (T*)malloc(sizeof(T) * n_ * n_); + + // Set initial values to 0 + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution<double> dist(0.0, 1.0); + + // Work out number of edges needed to achieve target sparsity + int edges = 1 + (int) (n * n * (1 - sparsity)); + + // Initialise the matrices + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + while (!rMat(B_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + } + + private: + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution<double> dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() { + free(A_); + free(B_); + free(C_); + } + }; +} // namespace cpu \ No newline at end of file diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh new file mode 100644 index 0000000..684c166 --- /dev/null +++ b/include/kernels/GPU/sp_gemm.hh @@ -0,0 +1,27 @@ +#pragma once + +#include "../gemm.hh" + +namespace gpu { + +/** An abstract class for GEMM BLAS kernels. */ + template <typename T> + class sp_gemm : public ::gemm<T> { + public: + using ::gemm<T>::gemm; + + /** Initialise the required data structures. 
+ * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int m, int n, int k) = 0; + + protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. */ + gpuOffloadType offload_ = gpuOffloadType::always; + }; +} // namespace gpu \ No newline at end of file diff --git a/src/main.cc b/src/main.cc index c61df37..38e2b5a 100644 --- a/src/main.cc +++ b/src/main.cc @@ -2,6 +2,10 @@ int iters = 10; int upperLimit = 128; +bool sgemm = true; +bool dgemm = true; +bool sp_sgemm = true; +bool sp_dgemm = true; bool doCpu = CPU_ENABLED; bool doGpu = GPU_ENABLED; From f2ed11f5325e2e063d0f92e07d09b13db6b356d7 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 13 Mar 2024 13:43:05 +0000 Subject: [PATCH 03/38] Implementing cuSPARSE kernel --- cuBLAS/sp_gemm.hh | 208 +++++++++++++++++++++++++--------------------- 1 file changed, 111 insertions(+), 97 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 3a9cff0..67d030c 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -1,7 +1,7 @@ #pragma once #ifdef GPU_CUBLAS -#include <cublas_v2.h> +#include "cusparse.h" #include <cuda_runtime.h> #include "../include/kernels/GPU/gemm.hh" @@ -14,9 +14,7 @@ template <typename T> class sp_gemm_gpu : public gemm<T> { public: using gemm<T>::gemm; - using gemm<T>::m_; using gemm<T>::n_; - using gemm<T>::k_; using gemm<T>::A_; using gemm<T>::B_; using gemm<T>::C_; @@ -29,15 +27,28 @@ class sp_gemm_gpu : public gemm<T> { * - Always: Move data from host to device and device to host each iteration * - Unified: Initialise data as unified memory; no data movement semantics * required */ - void initialise(gpuOffloadType offload, int m, int n, int k) override { + void initialise(gpuOffloadType offload, int n, float sparsity) override { offload_ = offload; - m_ = m; + // Create a handle for cuSPARSE + cusparseCreate(&handle_); + n_ = n; - k_ = k; - // Create a handle for CUBLAS - cublasCreate(&handle_); + // Create descriptors for matrices A->C + cusparseMatDescr_t descrA, descrB, descrC; + + cusparseCreateMatDescr(&descrA); + cusparseCreateMatDescr(&descrB); + cusparseCreateMatDescr(&descrC); + + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL); + + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO); // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); @@ -47,38 +58,96 @@ class sp_gemm_gpu : public gemm<T> { cudaCheckError(cudaStreamCreate(&s2_)); cudaCheckError(cudaStreamCreate(&s3_)); + + // Work out number of edges needed to achieve target sparsity + int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + if (offload_ == gpuOffloadType::unified) { - cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * k_)); - cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_)); - cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_)); + cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_)); + 
cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); } else { // Allocate matrices on host - A_ = (T*)malloc(sizeof(T) * m_ * k_); - B_ = (T*)malloc(sizeof(T) * k_ * n_); - C_ = (T*)malloc(sizeof(T) * m_ * n_); + A_ = (T*)malloc(sizeof(T) * n_ * n_); + B_ = (T*)malloc(sizeof(T) * n_ * n_); + C_ = (T*)malloc(sizeof(T) * n_ * n_); + // Allocate matrices on device - cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * k_)); - cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * k_ * n_)); - cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * m_ * n_)); + cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_)); + cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_)); + // Alloce non-zero vector for A + cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); } - // Initialise the host matricies - srand(SEED); - for (int y = 0; y < m_; y++) { - for (int x = 0; x < k_; x++) { - A_[y * k_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); - } - } - for (int y = 0; y < k_; y++) { - for (int x = 0; x < n_; x++) { - B_[y * n_ + x] = (((T)(rand() % 10000) / 100.0) - 30.0); - } - } + // Initialise the host matricies + // cusparseSpGEMM() works on CSR format only. This helpfully makes our + // sparse matrix format decision for us! + // ToDo -- do the RMAT instantiation of A_ and B_. Need to think about + // how this can be done in the context of CSR. + + // Initialise the matrices + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + while (!rMat(B_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } } private: + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution<double> dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? 
yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Perform any required steps before calling the GEMM kernel that should * be timed. */ + // ToDo -- update this to apply to CSR format void preLoopRequirements() override { switch (offload_) { case gpuOffloadType::always: { @@ -119,79 +188,20 @@ class sp_gemm_gpu : public gemm<T> { cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, cudaMemcpyHostToDevice, s3_)); - // Call cuBLAS GEMM kernel - if constexpr (std::is_same_v<T, float>) { - cublasStatus_t stat = - cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } else if constexpr (std::is_same_v<T, double>) { - cublasStatus_t stat = - cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } - // Offload data from device to host - cudaCheckError(cudaMemcpyAsync(A_, A_device_, sizeof(T) * m_ * k_, - cudaMemcpyDeviceToHost, s1_)); - cudaCheckError(cudaMemcpyAsync(B_, B_device_, sizeof(T) * k_ * n_, - cudaMemcpyDeviceToHost, s2_)); - cudaCheckError(cudaMemcpyAsync(C_, C_device_, sizeof(T) * m_ * n_, - cudaMemcpyDeviceToHost, s3_)); - // Ensure device has finished all work. - cudaCheckError(cudaDeviceSynchronize()); + // Call cuSPARSE SpGEMM kernel + // ToDo -- implement break; } case gpuOffloadType::once: { - // Call cuBLAS GEMM kernel - if constexpr (std::is_same_v<T, float>) { - cublasStatus_t stat = - cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } else if constexpr (std::is_same_v<T, double>) { - cublasStatus_t stat = - cublasDgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, - A_device_, std::max(1, m_), B_device_, - std::max(1, k_), &beta, C_device_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } + // Call cuSPRASE SpGEMM kernel + // ToDo -- implement + break; } case gpuOffloadType::unified: { - // Call cuBLAS GEMM kernel - if constexpr (std::is_same_v<T, float>) { - cublasStatus_t stat = cublasSgemm( - handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, - std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } else if constexpr (std::is_same_v<T, double>) { - cublasStatus_t stat = cublasDgemm( - handle_, CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, k_, &alpha, A_, - std::max(1, m_), B_, std::max(1, k_), &beta, C_, std::max(1, m_)); - if (stat != CUBLAS_STATUS_SUCCESS) { - std::cout << "cuBLAS error:" << stat << std::endl; - exit(1); - } - } + // Call cuSPARSE SpGEMM kernel + // ToDo -- implement + break; } } @@ -199,6 +209,7 @@ class sp_gemm_gpu : public gemm<T> { /** Perform any required steps after calling the GEMM kernel that should * be timed. 
*/ + // ToDo -- check that this all still works void postLoopRequirements() override { switch (offload_) { case gpuOffloadType::always: { @@ -236,7 +247,7 @@ class sp_gemm_gpu : public gemm<T> { * after Kernel has been called. */ void postCallKernelCleanup() override { // Destroy the handle - cublasDestroy(handle_); + cusparseDestroy(handle_); // Destroy streams after use cudaCheckError(cudaStreamDestroy(s1_)); @@ -285,6 +296,9 @@ class sp_gemm_gpu : public gemm<T> { /** Input matrix C, held on the device. */ T* C_device_; + /** Vector for number non-zeros, held on the device */ + int* dANnzPerRow; + /** The constant value Alpha. */ const T alpha = ALPHA; From c208246927e738615a94c0308e845cf42c198f98 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 13 Mar 2024 14:05:20 +0000 Subject: [PATCH 04/38] Trying to work out CSR malloc bug --- cuBLAS/sp_gemm.hh | 126 ++++++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 50 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 67d030c..3232293 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -66,7 +66,19 @@ class sp_gemm_gpu : public gemm<T> { cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_)); cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_)); cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); + + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * edges)); + + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * edges)); + + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * edges)); +// cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); } else { // Allocate matrices on host A_ = (T*)malloc(sizeof(T) * n_ * n_); @@ -78,7 +90,7 @@ class sp_gemm_gpu : public gemm<T> { cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_)); cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_)); // Alloce non-zero vector for A - cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); +// cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); } // Initialise the host matricies @@ -88,6 +100,11 @@ class sp_gemm_gpu : public gemm<T> { // how this can be done in the context of CSR. 
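+  // rMat() places each non-zero by recursively descending into one of the
+  // four quadrants of the current submatrix with probabilities a, b, c and
+  // (1 - a - b - c) (0.45/0.22/0.22/0.11 here), following the R-MAT
+  // generator, so the non-zeros cluster rather than spread uniformly.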
// Initialise the matrices + // Set initial values to 0 + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { while (!rMat(A_, n, 0, n - 1, 0, n - 1, @@ -97,57 +114,17 @@ class sp_gemm_gpu : public gemm<T> { 0.45, 0.22, 0.22, &gen, dist, false)) {} } + +// for (int i = 0; i < (n_ * n_); i++) { +// C_[i] = 0.0; +// } } private: - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, - std::uniform_real_distribution<double> dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } - } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); - } - } - return true; - } + /** Perform any required steps before calling the GEMM kernel that should * be timed. */ - // ToDo -- update this to apply to CSR format void preLoopRequirements() override { switch (offload_) { case gpuOffloadType::always: { @@ -188,8 +165,8 @@ class sp_gemm_gpu : public gemm<T> { cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, cudaMemcpyHostToDevice, s3_)); - // Call cuSPARSE SpGEMM kernel - // ToDo -- implement + + break; } case gpuOffloadType::once: { @@ -269,6 +246,51 @@ class sp_gemm_gpu : public gemm<T> { } } + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution<double> dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 
1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Handle used when calling cuBLAS. */ cublasHandle_t handle_; @@ -297,7 +319,11 @@ class sp_gemm_gpu : public gemm<T> { T* C_device_; /** Vector for number non-zeros, held on the device */ - int* dANnzPerRow; +// int* dANnzPerRow; + + /** CSR format vectors for matrices A, B and C on the device */ + T* A_val_, B_val_, C_val_; + int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_; /** The constant value Alpha. */ const T alpha = ALPHA; From de14a5682aae00ab582f87a396eaf3da5b66b99f Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 13 Mar 2024 14:07:46 +0000 Subject: [PATCH 05/38] Trying to work out CSR malloc bug --- cuBLAS/sp_gemm.hh | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 3232293..0765adb 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -96,8 +96,6 @@ class sp_gemm_gpu : public gemm<T> { // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our // sparse matrix format decision for us! - // ToDo -- do the RMAT instantiation of A_ and B_. Need to think about - // how this can be done in the context of CSR. // Initialise the matrices // Set initial values to 0 From 49cddf02f8a50571d2eaa5b653bdf8fb49198d91 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 19 Mar 2024 13:05:58 +0000 Subject: [PATCH 06/38] cuSPARSE unified memory implementation --- cuBLAS/sp_gemm.hh | 433 ++++++++++++++++++++++++++-------------------- 1 file changed, 250 insertions(+), 183 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 0765adb..68e3b84 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -3,6 +3,7 @@ #ifdef GPU_CUBLAS #include "cusparse.h" #include <cuda_runtime.h> +#include <type_traits> #include "../include/kernels/GPU/gemm.hh" #include "../include/utilities.hh" @@ -20,6 +21,8 @@ class sp_gemm_gpu : public gemm<T> { using gemm<T>::C_; using gemm<T>::offload_; + // ToDo -- just unified implemented so far. Fill in Always and Once later + /** Initialise the required data structures. 
* `offload` refers to the data offload type: * - Once: Move data from host to device before all iterations & move from @@ -33,10 +36,10 @@ class sp_gemm_gpu : public gemm<T> { // Create a handle for cuSPARSE cusparseCreate(&handle_); - n_ = n; + cudaDataType_ = (std::is_same_v<T, float>) ? CUDA_R_32F : + CUDA_R_64F; - // Create descriptors for matrices A->C - cusparseMatDescr_t descrA, descrB, descrC; + n_ = n; cusparseCreateMatDescr(&descrA); cusparseCreateMatDescr(&descrB); @@ -61,37 +64,30 @@ class sp_gemm_gpu : public gemm<T> { // Work out number of edges needed to achieve target sparsity int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + A_nnz_ = B_nnz_ = edges + + // ToDo -- for all of this mallocing, bear in mind that row will probably + // have fewer than 'edges' values (thats the whole point). May need to + // reorganise + + cudaCheckError(cudaMallocManaged(A_num_rows_, sizeof(int))); + cudaCheckError(cudaMallocManaged(A_num_cols_, sizeof(int))); + cudaCheckError(cudaMallocManaged(A_nnz_, sizeof(int))); + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); + + cudaCheckError(cudaMallocManaged(B_num_rows_, sizeof(int))); + cudaCheckError(cudaMallocManaged(B_num_cols_, sizeof(int))); + cudaCheckError(cudaMallocManaged(B_nnz_, sizeof(int))); + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); + cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); + + C_val_ = NULL; + C_col_ = NULL; + C_row_ = NULL; - if (offload_ == gpuOffloadType::unified) { - cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * n_ * n_)); - - cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * edges)); - - cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * edges)); - - cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * edges)); -// cudaCheckError(cudaMallocManaged(&DANnzPerRow, sizeof(int) * n_)); - } else { - // Allocate matrices on host - A_ = (T*)malloc(sizeof(T) * n_ * n_); - B_ = (T*)malloc(sizeof(T) * n_ * n_); - C_ = (T*)malloc(sizeof(T) * n_ * n_); - - // Allocate matrices on device - cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMalloc((void**)&B_device_, sizeof(T) * n_ * n_)); - cudaCheckError(cudaMalloc((void**)&C_device_, sizeof(T) * n_ * n_)); - // Alloce non-zero vector for A -// cudaCheckError(cudaMalloc((void**)&dANnzPerRow, sizeof(int) * n_)); - } // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. 
This helpfully makes our @@ -113,109 +109,160 @@ class sp_gemm_gpu : public gemm<T> { &gen, dist, false)) {} } -// for (int i = 0; i < (n_ * n_); i++) { -// C_[i] = 0.0; -// } + toCSR(A_, n, n, edges, A_val_, A_col_, A_row_); + toCSR(B_, n, n, edges, B_val_, B_col_, B_row_); + } + + private: /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - switch (offload_) { - case gpuOffloadType::always: { - // Offload data each iteration - no requirements - break; - } - case gpuOffloadType::once: { - // Offload data from host to the device. - cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * k_, - cudaMemcpyHostToDevice, s1_)); - cudaCheckError(cudaMemcpyAsync(B_device_, B_, sizeof(T) * k_ * n_, - cudaMemcpyHostToDevice, s2_)); - cudaCheckError(cudaMemcpyAsync(C_device_, C_, sizeof(T) * m_ * n_, - cudaMemcpyHostToDevice, s3_)); - break; - } - case gpuOffloadType::unified: { - // Prefetch memory to device - cudaCheckError( - cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, gpuDevice_, s1_)); - cudaCheckError( - cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, gpuDevice_, s2_)); - cudaCheckError( - cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, gpuDevice_, s3_)); - break; - } - } + // Prefetch memory to device + cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, gpuDevice_, + s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, + gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, gpuDevice_, + s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s2_)); +// +// cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_, +// s3_)); +// cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges, +// gpuDevice_, s3_)); +// cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges, +// gpuDevice_, s3_)); + + // Create the CSR matrices on the device + cusparseCreateCsr(descrA_, n_, n_, A_nnz_, A_row_, A_col_, A_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); + cusparseCreateCsr(descrB_, n_, n_, B_nnz_, B_row_, B_col_, B_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); + cusparseCreateCsr(descrC_, n_, n_, 0, NULL, NULL, NULL, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + + cusparseSpGEMM_createDescr(&spgemmDesc_); } /** Make a call to the BLAS Library Kernel. 
*/
  void callGemm() override {
+    cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                  CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+                                  descrA_, descrB_, &beta, descrC_,
+                                  cudaDataType_, CUSPARSE_SPGEMM_DEFAULT,
+                                  spgemmDesc_, &buffer_size1_, NULL);
+    cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_));
+    cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                  CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+                                  descrA_, descrB_, &beta, descrC_,
+                                  cudaDataType_, CUSPARSE_SPGEMM_DEFAULT,
+                                  spgemmDesc_, &buffer_size1_, buffer1_);
+    cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                           CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
+                           descrB_, &beta, descrC_, cudaDataType_,
+                           CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_,
+                           &buffer_size2_, NULL);
+    cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_));
+
+    if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+                               descrA_, descrB_, &beta, descrC_,
+                               cudaDataType_, CUSPARSE_SPGEMM_DEFAULT,
+                               spgemmDesc_, &buffer_size2_, buffer2_)
+        == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) {
+      std::cout << "Insufficient resources" << std::endl;
+      exit(1);
+    }
+
+    int64_t rows, cols, nnz;
+
+    cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz);
+    C_nnz_ = nnz;
+    cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz));
+    cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz));
+    cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1)));
+
+    cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_);
+    cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                        CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_,
+                        descrB_, &beta, descrC_, cudaDataType_,
+                        CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_);
  }

  /** Perform any required steps after calling the GEMM kernel that should
   * be timed. */
  void postLoopRequirements() override {
+    // Ensure all data resides on host once work has completed
+    cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int),
+                                        cudaCpuDeviceId, s1_));
+    cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int),
+                                        cudaCpuDeviceId, s1_));
+    cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int),
+                                        cudaCpuDeviceId, s1_));
+    cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_,
+                                        cudaCpuDeviceId, s1_));
+    cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_,
+                                        cudaCpuDeviceId, s1_));
+    cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1),
+                                        cudaCpuDeviceId, s1_));
+
+    cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int),
+                                        cudaCpuDeviceId, s2_));
+    cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int),
+                                        cudaCpuDeviceId, s2_));
+    cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int),
+                                        cudaCpuDeviceId, s2_));
+    cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_,
+                                        cudaCpuDeviceId, s2_));
+    cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_,
+                                        cudaCpuDeviceId, s2_));
+    cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1),
+                                        cudaCpuDeviceId, s2_));
+
+    cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int),
+                                        cudaCpuDeviceId, s3_));
+    cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int),
+                                        cudaCpuDeviceId, s3_));
+    cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int),
+                                        cudaCpuDeviceId, s3_));
+    cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_,
+                                        cudaCpuDeviceId, s3_));
+    cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_,
+                                        cudaCpuDeviceId, s3_));
+    cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1),
+                                        cudaCpuDeviceId, s3_));
+    // Ensure device has finished all work.
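+    // The cudaDeviceSynchronize() below also covers the prefetches queued on
+    // s1_-s3_, so the host can safely read the CSR arrays afterwards.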
- cudaCheckError(cudaDeviceSynchronize()); - break; - } - case gpuOffloadType::unified: { - // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(A_, sizeof(T) * m_ * k_, - cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_, sizeof(T) * k_ * n_, - cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, - cudaCpuDeviceId, s3_)); - // Ensure device has finished all work. - cudaCheckError(cudaDeviceSynchronize()); - break; - } - } + // Ensure all data resides on host once work has completed + cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, + cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId_, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, + cudaCpuDeviceId_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId_, s2_)); + + cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * C_nnz_, + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * C_nnz_, + cudaCpuDeviceId_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId_, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); } /** Do any necessary cleanup (free pointers, close library handles, etc.) @@ -229,65 +276,76 @@ class sp_gemm_gpu : public gemm<T> { cudaCheckError(cudaStreamDestroy(s2_)); cudaCheckError(cudaStreamDestroy(s3_)); - if (offload_ == gpuOffloadType::unified) { - cudaFree(A_); - cudaFree(B_); - cudaFree(C_); - } else { - // Free the memory held on host and device - free(A_); - free(B_); - free(C_); - cudaFree(A_device_); - cudaFree(B_device_); - cudaFree(C_device_); - } + cudaFree(A_); + cudaFree(B_); + cudaFree(C_); } bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, float c, std::default_random_engine* gen, std::uniform_real_distribution<double> dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 
1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + + void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, + int* row_ptr) { + int nnz_encountered = 0; + int prev_row_ptr = 0; + for (int row = 0; row < n_row; row++) { + if (nnz_encountered >= nnz) break; + row_ptr[row] = prev_row_ptr; + int nnz_row = 0; + for (int col = 0; col < n_col; col++) { + if (nnz_encountered >= nnz) break; + if (dense[(row * n_col) + col] != 0.0) { + nnz_row++; + col_index[nnz_encountered] = col; + vals[nnz_encountered] = dense[(row * n_col) + col]; + nnz_encountered++; } } - return true; + prev_row_ptr += nnz_row; } + } /** Handle used when calling cuBLAS. */ cublasHandle_t handle_; @@ -307,27 +365,36 @@ class sp_gemm_gpu : public gemm<T> { /** The ID of the target GPU Device. */ int gpuDevice_; - /** Input matrix A, held on the device. */ - T* A_device_; - - /** Input matrix B, held on the device. */ - T* B_device_; - - /** Input matrix C, held on the device. 
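
The toCSR() helper above walks the dense array row by row while keeping a running nonzero count. A standalone sketch of the same conversion on a 3x3 example (values illustrative), with the final row_ptr entry written so that row_ptr[n_rows] == nnz -- the invariant CSR consumers such as cuSPARSE expect:

    #include <cstdio>
    #include <vector>

    int main() {
      const int n = 3;
      double dense[n * n] = {1, 0, 2,
                             0, 0, 3,
                             4, 5, 0};
      std::vector<double> val;
      std::vector<int> col, row_ptr(n + 1, 0);
      for (int r = 0; r < n; ++r) {
        row_ptr[r] = (int)val.size();  // index of this row's first nonzero
        for (int c = 0; c < n; ++c) {
          if (dense[r * n + c] != 0.0) {
            val.push_back(dense[r * n + c]);
            col.push_back(c);
          }
        }
      }
      row_ptr[n] = (int)val.size();    // closing entry equals total nnz
      for (int p : row_ptr) printf("%d ", p);  // prints: 0 2 3 5
      printf("\n");
      return 0;
    }
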
*/ - T* C_device_; - - /** Vector for number non-zeros, held on the device */ -// int* dANnzPerRow; - - /** CSR format vectors for matrices A, B and C on the device */ + /** CSR format vectors for matrices A, B and C on the host */ + int A_nnz_, B_nnz_, C_nnz_; T* A_val_, B_val_, C_val_; int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_; + /** CSR format vectors for matrices A, B and C on the device. */ + int A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, + B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_; + T* A_val_dev_, B_val_dev_, C_val_dev_; + int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_; + /** The constant value Alpha. */ const T alpha = ALPHA; /** The constant value Beta. */ const T beta = BETA; + + + // Create descriptors for matrices A->C + cusparseMatDescr_t descrA_, descrB_, descrC_; + + // index type depends on kernel being run + cusparseIndexType_t cudaDataType_; + + cusparceSpGEMMDescr_t spgemmDesc_; + + size_t buffer_size1_ = 0; + size_t buffer_size2_ = 0; + void* buffer1_ = NULL; + void* buffer2_ = NULL; }; } // namespace gpu #endif \ No newline at end of file From 37ce8b4c32b7b04caae5a4dbc697b21086447c9f Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Thu, 21 Mar 2024 13:08:49 +0000 Subject: [PATCH 07/38] Now compiles --- DefaultGPU/sp_gemm.hh | 2 +- Makefile | 2 +- cuBLAS/sp_gemm.hh | 228 +++++++++++++++------------------ include/doGemm.hh | 7 +- include/kernels/GPU/sp_gemm.hh | 2 +- 5 files changed, 112 insertions(+), 129 deletions(-) diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh index 92d157c..2a9f478 100644 --- a/DefaultGPU/sp_gemm.hh +++ b/DefaultGPU/sp_gemm.hh @@ -22,7 +22,7 @@ class sp_gemm_gpu : public sp_gemm<T> { } /** Initialise the required data structures. */ - void initialise(gpuOffloadType offload, int m, int n, int k) override { + void initialise(gpuOffloadType offload, int n, float sparsity) override { // Default GPU implementation - do nothing. } diff --git a/Makefile b/Makefile index 5dd2fc5..bff0add 100644 --- a/Makefile +++ b/Makefile @@ -177,7 +177,7 @@ $(info $(TAB)$(TAB)Add `CXXFLAGS=-L<NVHPC_DIR>/.../math_libs/lib64 -L<NVHPC_DIR> $(info $(TAB)$(TAB)Add `CXXFLAGS=-I<NVHPC_DIR>/.../math_libs/include -I<NVHPC_DIR>/.../cuda/include` to make command) $(info $(TAB)$(TAB)Add `CXXFLAGS=-Wl,-rpath,<NVHPC_DIR>/.../math_libs/lib64 -Wl,-rpath,<NVHPC_DIR>/.../cuda/lib64` to make command) $(info ) -override CXXFLAGS += -lcublas -lcudart +override CXXFLAGS += -lcublas -lcudart -lcusparse endif HEADER_FILES += $(wildcard cuBLAS/*.hh) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 68e3b84..c0bfb8e 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -2,24 +2,27 @@ #ifdef GPU_CUBLAS #include "cusparse.h" +#include <cublas_v2.h> #include <cuda_runtime.h> #include <type_traits> +#include <random> +#include <iostream> -#include "../include/kernels/GPU/gemm.hh" +#include "../include/kernels/GPU/sp_gemm.hh" #include "../include/utilities.hh" #include "common.hh" namespace gpu { /** A class for GEMM GPU BLAS kernels. 
*/ template <typename T> -class sp_gemm_gpu : public gemm<T> { +class sp_gemm_gpu : public sp_gemm<T> { public: - using gemm<T>::gemm; - using gemm<T>::n_; - using gemm<T>::A_; - using gemm<T>::B_; - using gemm<T>::C_; - using gemm<T>::offload_; + using sp_gemm<T>::sp_gemm; + using sp_gemm<T>::n_; + using sp_gemm<T>::A_; + using sp_gemm<T>::B_; + using sp_gemm<T>::C_; + using sp_gemm<T>::offload_; // ToDo -- just unified implemented so far. Fill in Always and Once later @@ -31,63 +34,50 @@ class sp_gemm_gpu : public gemm<T> { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { + std::cout << "Initialising" << std::endl; offload_ = offload; // Create a handle for cuSPARSE cusparseCreate(&handle_); + std::cout << "Handle created" << std::endl; - cudaDataType_ = (std::is_same_v<T, float>) ? CUDA_R_32F : - CUDA_R_64F; + if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F; + else if (std::is_same_v<T, double>) cudaDataType_ = CUDA_R_64F; + else { + std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; + exit(1); + } n_ = n; - cusparseCreateMatDescr(&descrA); - cusparseCreateMatDescr(&descrB); - cusparseCreateMatDescr(&descrC); - - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL); - - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO); - // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); + std::cout << "GPU device got" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); cudaCheckError(cudaStreamCreate(&s3_)); + std::cout << "Streams created" << std::endl; // Work out number of edges needed to achieve target sparsity int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - A_nnz_ = B_nnz_ = edges + (*A_nnz_) = (*B_nnz_) = edges; // ToDo -- for all of this mallocing, bear in mind that row will probably // have fewer than 'edges' values (thats the whole point). May need to // reorganise - cudaCheckError(cudaMallocManaged(A_num_rows_, sizeof(int))); - cudaCheckError(cudaMallocManaged(A_num_cols_, sizeof(int))); - cudaCheckError(cudaMallocManaged(A_nnz_, sizeof(int))); cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); + std::cout << "A CSR vectors malloced" << std::endl; - cudaCheckError(cudaMallocManaged(B_num_rows_, sizeof(int))); - cudaCheckError(cudaMallocManaged(B_num_cols_, sizeof(int))); - cudaCheckError(cudaMallocManaged(B_nnz_, sizeof(int))); cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); - - C_val_ = NULL; - C_col_ = NULL; - C_row_ = NULL; - + std::cout << "B CSR vectors malloced" << std::endl; // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. 
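
The run-time float/double branch above selects CUDA_R_32F or CUDA_R_64F and exits on anything else. A compile-time alternative is sketched below (a hypothetical helper, not part of the benchmark), which turns an unsupported T into a build error instead:

    #include <cstdio>
    #include <library_types.h>   // cudaDataType_t, CUDA_R_32F, CUDA_R_64F
    #include <type_traits>

    // Sketch: resolve the cuSPARSE value type at compile time; an
    // unsupported T becomes a compile error rather than a run-time exit(1).
    template <typename T>
    constexpr cudaDataType_t cudaValueType() {
      static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                    "sp_gemm supports float and double only");
      if constexpr (std::is_same_v<T, float>) return CUDA_R_32F;
      else return CUDA_R_64F;
    }

    int main() {
      printf("%d %d\n", (int)cudaValueType<float>(), (int)cudaValueType<double>());
      return 0;
    }
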
This helpfully makes our @@ -99,6 +89,13 @@ class sp_gemm_gpu : public gemm<T> { A_[i] = 0.0; B_[i] = 0.0; } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution<double> dist(0.0, 1.0); + // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { while (!rMat(A_, n, 0, n - 1, 0, n - 1, @@ -117,34 +114,20 @@ class sp_gemm_gpu : public gemm<T> { private: - - /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { // Prefetch memory to device - cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, gpuDevice_, - s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), + gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), gpuDevice_, s1_)); cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, gpuDevice_, - s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), gpuDevice_, s2_)); cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), gpuDevice_, s2_)); @@ -163,13 +146,13 @@ class sp_gemm_gpu : public gemm<T> { // gpuDevice_, s3_)); // Create the CSR matrices on the device - cusparseCreateCsr(descrA_, n_, n_, A_nnz_, A_row_, A_col_, A_val_, + cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); - cusparseCreateCsr(descrB_, n_, n_, B_nnz_, B_row_, B_col_, B_val_, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDateType_); - cusparseCreateCsr(descrC_, n_, n_, 0, NULL, NULL, NULL, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); @@ -181,38 +164,40 @@ class sp_gemm_gpu : public gemm<T> { cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, descrB_, &beta, descrC_, - CUSPARSE_SPGEMM_DEFAULT, cudaDataType_, - spgemmDesc_, buffer_size1_, NULL); + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, NULL); cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, descrB_, &beta, descrC_, - CUSPARSE_SPGEMM_DEFAULT, cudaDataType_, - spgemmDesc_, buffer_size1_, 
buffer1_); - cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, buffer1_); + cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT, - cudaDataType_, spgemmDesc_, buffer_size2_, NULL); - cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2)); + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, NULL); + cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); - if (cusparseSpGEMM_cmopute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUSPARSE_SPGEMM_DEFAULT, - cudaDataType_, spgemmDesc_, buffer_size2_, buffer2_) - == CUSPARSE_SATUS_INSUFFICIENT_RESOURCES) { + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, buffer2_) + == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { std::cout << "Insufficient resources" << std::endl; exit(1); } - int rows, cols, nnz; + int64_t rows, cols, nnz; - cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz_); - C_nnz_ = nnz; - cudaCheckError(cudaMallocManaged(C_val_), sizeof(T) * nnz); - cudaCheckError(cudaMallocManaged(C_col_), sizeof(int) * nnz); - cudaCheckError(cudaMallocManaged(C_row_), sizeof(int) * (n_ + 1)); + cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); + (*C_nnz_) = nnz; + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); - cusparseCstSetPointers(descrC_, *C_row, *C_colind, *C_val); + cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_); cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, descrB_, &beta, descrC_, CUDA_R_32F, @@ -223,44 +208,26 @@ class sp_gemm_gpu : public gemm<T> { * be timed. 
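
The corrected calls above follow the documented cuSPARSE SpGEMM sequence: workEstimation and compute are each issued twice (first to size a scratch buffer, then to run), C's nnz is only known after compute, and copy comes last, once C's arrays are allocated and attached. A condensed sketch of that order for float data; the helper function and all names are illustrative, and error checking is elided for brevity:

    #include <cuda_runtime.h>
    #include <cusparse.h>
    #include <cstdint>

    // Assumes A and B were created with cusparseCreateCsr, and C was created
    // with nnz = 0 and null column/value arrays, as in the patch.
    void spgemmPhases(cusparseHandle_t h, cusparseSpMatDescr_t A,
                      cusparseSpMatDescr_t B, cusparseSpMatDescr_t C, int n) {
      const float alpha = 1.0f, beta = 0.0f;
      const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
      cusparseSpGEMMDescr_t d;
      cusparseSpGEMM_createDescr(&d);

      size_t bs1 = 0, bs2 = 0;
      void *buf1 = nullptr, *buf2 = nullptr;
      // Phase 1: query the estimation buffer size, then run the estimation.
      cusparseSpGEMM_workEstimation(h, op, op, &alpha, A, B, &beta, C, CUDA_R_32F,
                                    CUSPARSE_SPGEMM_DEFAULT, d, &bs1, nullptr);
      cudaMalloc(&buf1, bs1);
      cusparseSpGEMM_workEstimation(h, op, op, &alpha, A, B, &beta, C, CUDA_R_32F,
                                    CUSPARSE_SPGEMM_DEFAULT, d, &bs1, buf1);
      // Phase 2: same query/execute pattern for the multiplication itself.
      cusparseSpGEMM_compute(h, op, op, &alpha, A, B, &beta, C, CUDA_R_32F,
                             CUSPARSE_SPGEMM_DEFAULT, d, &bs2, nullptr);
      cudaMalloc(&buf2, bs2);
      cusparseSpGEMM_compute(h, op, op, &alpha, A, B, &beta, C, CUDA_R_32F,
                             CUSPARSE_SPGEMM_DEFAULT, d, &bs2, buf2);
      // Phase 3: C's nnz is known only now; allocate, attach, then copy.
      int64_t rows, cols, nnz;
      cusparseSpMatGetSize(C, &rows, &cols, &nnz);
      float* val; int *col, *row;
      cudaMalloc(&val, nnz * sizeof(float));
      cudaMalloc(&col, nnz * sizeof(int));
      cudaMalloc(&row, (n + 1) * sizeof(int));
      cusparseCsrSetPointers(C, row, col, val);
      cusparseSpGEMM_copy(h, op, op, &alpha, A, B, &beta, C, CUDA_R_32F,
                          CUSPARSE_SPGEMM_DEFAULT, d);

      cudaFree(buf1);
      cudaFree(buf2);
      cusparseSpGEMM_destroyDescr(d);
    }
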
*/ void postLoopRequirements() override { // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(A_num_rows_, sizeof(int), - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_num_cols_, sizeof(int), - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_nnz_, sizeof(int), - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * edges, - cudaCpuDeviceId_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * edges, - cudaCpuDeviceId_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), + cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), + cudaCpuDeviceId, s1_)); cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId_, s1_)); - - cudaCheckError(cudaMemPrefetchAsync(B_num_rows_, sizeof(int), - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_num_cols_, sizeof(int), - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_nnz_, sizeof(int), - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * edges, - cudaCpuDeviceId_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * edges, - cudaCpuDeviceId_, s2_)); + cudaCpuDeviceId, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), + cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), + cudaCpuDeviceId, s2_)); cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId_, s2_)); - - cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * C_nnz_, - cudaCpuDeviceId_, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * C_nnz_, - cudaCpuDeviceId_, s3_)); + cudaCpuDeviceId, s2_)); + + cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * (*C_nnz_), + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * (*C_nnz_), + cudaCpuDeviceId, s3_)); cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId_, s3_)); + cudaCpuDeviceId, s3_)); // Ensure device has finished all work. cudaCheckError(cudaDeviceSynchronize()); } @@ -348,7 +315,7 @@ class sp_gemm_gpu : public gemm<T> { } /** Handle used when calling cuBLAS. */ - cublasHandle_t handle_; + cusparseHandle_t handle_; /** CUDA Stream 1 - used to asynchronously move data between host and device. */ @@ -366,12 +333,29 @@ class sp_gemm_gpu : public gemm<T> { int gpuDevice_; /** CSR format vectors for matrices A, B and C on the host */ - int A_nnz_, B_nnz_, C_nnz_; - T* A_val_, B_val_, C_val_; - int* A_col_, A_row_, B_col_, B_row_, C_col_, C_row_; + T* A_val_; + int* A_col_; + int* A_row_; + int* A_num_rows_; + int* A_num_cols_; + int* A_nnz_; + + T* B_val_; + int* B_col_; + int* B_row_; + int* B_num_rows_; + int* B_num_cols_; + int* B_nnz_; + + T* C_val_; + int* C_col_; + int* C_row_; + int* C_num_rows_; + int* C_num_cols_; + int*C_nnz_; /** CSR format vectors for matrices A, B and C on the device. 
*/ - int A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, + int* A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_; T* A_val_dev_, B_val_dev_, C_val_dev_; int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_; @@ -384,12 +368,12 @@ class sp_gemm_gpu : public gemm<T> { // Create descriptors for matrices A->C - cusparseMatDescr_t descrA_, descrB_, descrC_; + cusparseSpMatDescr_t descrA_, descrB_, descrC_; - // index type depends on kernel being run - cusparseIndexType_t cudaDataType_; + // Data type depends on kernel being run + cudaDataType_t cudaDataType_; - cusparceSpGEMMDescr_t spgemmDesc_; + cusparseSpGEMMDescr_t spgemmDesc_; size_t buffer_size1_ = 0; size_t buffer_size2_ = 0; diff --git a/include/doGemm.hh b/include/doGemm.hh index 4a7c564..5565fb2 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -519,20 +519,19 @@ class doGemm { // Perform the GPU kernels // - ONCE : Offload to/from GPU once before all iterations and once // after - spGemmGpu_.initialise(gpuOffloadType::once, N, N, N); + spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); time_checksum_gflop gpuResult_once = gemmGpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // - ALWAYS: Offload to/from GPU every iteration - spGemmGpu_.initialise(gpuOffloadType::always, N, N, N); + spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); time_checksum_gflop gpuResult_always = gemmGpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); - // - UNIFIED : data passed from host to device (and device to host) as // needed - spGemmGpu_.initialise(gpuOffloadType::unified, N, N, N); + spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); time_checksum_gflop gpuResult_unified = gemmGpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh index 684c166..dbfba87 100644 --- a/include/kernels/GPU/sp_gemm.hh +++ b/include/kernels/GPU/sp_gemm.hh @@ -17,7 +17,7 @@ namespace gpu { * - Always: Move data from host to device and device to host each iteration * - Unified: Initialise data as unified memory; no data movement semantics * required */ - virtual void initialise(gpuOffloadType offload, int m, int n, int k) = 0; + virtual void initialise(gpuOffloadType offload, int n, float sparsity) = 0; protected: /** Whether data should be offloaded to/from the GPU each iteration, or just From 143c1c041d7da2afda07b27c5c3dbb8b273fab1c Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Mon, 25 Mar 2024 10:11:51 +0000 Subject: [PATCH 08/38] Now compiles with fewer runtime errors --- cuBLAS/sp_gemm.hh | 352 +++++++++++++++++++++++++++------------------- include/doGemm.hh | 42 +++--- 2 files changed, 227 insertions(+), 167 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index c0bfb8e..fa0e39d 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -37,12 +37,12 @@ class sp_gemm_gpu : public sp_gemm<T> { std::cout << "Initialising" << std::endl; offload_ = offload; - // Create a handle for cuSPARSE + // Create a handle for cuSPARSE cusparseCreate(&handle_); std::cout << "Handle created" << std::endl; - if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F; + if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F; else if (std::is_same_v<T, 
double>) cudaDataType_ = CUDA_R_64F; else { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; @@ -60,24 +60,38 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaStreamCreate(&s3_)); std::cout << "Streams created" << std::endl; + if (offload_ == gpuOffloadType::unified) { + std::cout << "Into unified if statement" << std::endl; + A_num_rows_ = (int*)malloc(sizeof(int)); + A_num_cols_ = (int*)malloc(sizeof(int)); + A_nnz_ = (int*)malloc(sizeof(int)); + B_num_rows_ = (int*)malloc(sizeof(int)); + B_num_cols_ = (int*)malloc(sizeof(int)); + B_nnz_ = (int*)malloc(sizeof(int)); + C_num_rows_ = (int*)malloc(sizeof(int)); + C_num_cols_ = (int*)malloc(sizeof(int)); + C_nnz_ = (int*)malloc(sizeof(int)); + } - // Work out number of edges needed to achieve target sparsity - int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - (*A_nnz_) = (*B_nnz_) = edges; - // ToDo -- for all of this mallocing, bear in mind that row will probably - // have fewer than 'edges' values (thats the whole point). May need to - // reorganise + // Work out number of edges needed to achieve target sparsity + int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + (*A_nnz_) = (*B_nnz_) = edges; - cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); - std::cout << "A CSR vectors malloced" << std::endl; + if (offload_ == gpuOffloadType::unified) { + std::cout << "beginning mallocs" << std::endl; + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * (*A_nnz_))); + std::cout << "A vals vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * (*A_nnz_))); + std::cout << "A cols vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); + std::cout << "A CSR vectors malloced" << std::endl; - cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * edges)); - cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * edges)); - cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); - std::cout << "B CSR vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * (*B_nnz_))); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * (*B_nnz_))); + cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); + std::cout << "B CSR vectors malloced" << std::endl; + } // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. 
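
The edge count used here is 1 + n^2 * (1 - sparsity), i.e. the number of nonzeros targeted per matrix. For a feel of the scale, a quick check (a 0.99 sparsity target assumed as a representative value):

    #include <cstdio>

    int main() {
      const float sparsity = 0.99f;
      for (int n : {64, 128, 256}) {
        int edges = 1 + (int)(n * n * (1 - sparsity));  // same formula as above
        printf("n = %3d -> %4d nonzeros (%.2f%% of n*n)\n", n, edges,
               100.0 * edges / (n * n));
      }
      return 0;
    }

At that target the three sizes come out to 41, 164, and 656 nonzeros, so the (n + 1)-entry row-pointer array can actually be longer than the value array at small n -- the point the ToDo above is getting at.
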
This helpfully makes our @@ -85,10 +99,12 @@ class sp_gemm_gpu : public sp_gemm<T> { // Initialise the matrices // Set initial values to 0 - for (int i = 0; i < (n_ * n_); i++) { - A_[i] = 0.0; - B_[i] = 0.0; - } + A_ = (T*)malloc(sizeof(T) * n_ * n_); + B_ = (T*)malloc(sizeof(T) * n_ * n_); + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } // Random number generator objects for use in descent std::default_random_engine gen; @@ -96,19 +112,20 @@ class sp_gemm_gpu : public sp_gemm<T> { .time_since_epoch().count()); std::uniform_real_distribution<double> dist(0.0, 1.0); - // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < edges; i++) { - while (!rMat(A_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - while (!rMat(B_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } - - toCSR(A_, n, n, edges, A_val_, A_col_, A_row_); - toCSR(B_, n, n, edges, B_val_, B_col_, B_row_); + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < (*A_nnz_); i++) { + while (!rMat(A_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + for (int i = 0; i < (*B_nnz_); i++) { + while (!rMat(B_, n, 0, n - 1, 0, n - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + toCSR(A_, n, n, (*A_nnz_), A_val_, A_col_, A_row_); + toCSR(B_, n, n, (*B_nnz_), B_val_, B_col_, B_row_); } @@ -117,135 +134,178 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - // Prefetch memory to device - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), - gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), - gpuDevice_, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), - gpuDevice_, s1_)); - - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), - gpuDevice_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), - gpuDevice_, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), - gpuDevice_, s2_)); -// -// cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_, -// s3_)); -// cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges, -// gpuDevice_, s3_)); -// cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges, -// gpuDevice_, s3_)); - - // Create the CSR matrices on the device - cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - - cusparseSpGEMM_createDescr(&spgemmDesc_); + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + break; + } + case gpuOffloadType::unified: { + // Prefetch memory to device + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_), + gpuDevice_, s1_)); + 
cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_), + gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_), + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_), + gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), + gpuDevice_, s2_)); + // + // cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_, + // s3_)); + // cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges, + // gpuDevice_, s3_)); + // cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges, + // gpuDevice_, s3_)); + + // Create the CSR matrices on the device + cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); + + cusparseSpGEMM_createDescr(&spgemmDesc_); + break; + } + } } /** Make a call to the BLAS Library Kernel. */ void callGemm() override { - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, NULL); - cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, buffer1_); - cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, NULL); - cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); - - if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, buffer2_) - == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { - std::cout << "Insufficient resources" << std::endl; - exit(1); - } - - int64_t rows, cols, nnz; - - cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); - (*C_nnz_) = nnz; - cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); - cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); - cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); - - cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_); - cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUDA_R_32F, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_); + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + break; + } + case gpuOffloadType::unified: { + 
cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + descrA_, descrB_, &beta, descrC_, + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, NULL); + cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); + cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, + descrA_, descrB_, &beta, descrC_, + cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, + spgemmDesc_, &buffer_size1_, buffer1_); + cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, NULL); + cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); + + if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, + &buffer_size2_, buffer2_) + == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { + std::cout << "Insufficient resources" << std::endl; + exit(1); + } + + int64_t rows, cols, nnz; + + cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); + (*C_nnz_) = nnz; + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); + + cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_); + cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + descrB_, &beta, descrC_, CUDA_R_32F, + CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_); + break; + } + } } /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(&A_val_, sizeof(T) * (*A_nnz_), - cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_col_, sizeof(int) * (*A_nnz_), - cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(&A_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId, s1_)); - - cudaCheckError(cudaMemPrefetchAsync(&B_val_, sizeof(T) * (*B_nnz_), - cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_col_, sizeof(int) * (*B_nnz_), - cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(&B_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId, s2_)); - - cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * (*C_nnz_), - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * (*C_nnz_), - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * (n_ + 1), - cudaCpuDeviceId, s3_)); - // Ensure device has finished all work. 
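
A pattern worth noting in these hunks: transfers are issued asynchronously on the per-matrix streams (s1_, s2_, s3_) and a single cudaDeviceSynchronize() fences them all before timing stops. A self-contained sketch of that fence pattern with two pinned host buffers (all names and sizes illustrative):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      const int nnz = 1024;
      float *hVal, *dVal;
      int *hCol, *dCol;
      cudaMallocHost(&hVal, nnz * sizeof(float));  // pinned, so copies can be async
      cudaMallocHost(&hCol, nnz * sizeof(int));
      cudaMalloc(&dVal, nnz * sizeof(float));
      cudaMalloc(&dCol, nnz * sizeof(int));

      cudaStream_t s1, s2;
      cudaStreamCreate(&s1);
      cudaStreamCreate(&s2);
      cudaMemcpyAsync(dVal, hVal, nnz * sizeof(float), cudaMemcpyHostToDevice, s1);
      cudaMemcpyAsync(dCol, hCol, nnz * sizeof(int), cudaMemcpyHostToDevice, s2);
      cudaDeviceSynchronize();  // one fence covers both streams

      cudaStreamDestroy(s1);
      cudaStreamDestroy(s2);
      cudaFree(dVal);
      cudaFree(dCol);
      cudaFreeHost(hVal);
      cudaFreeHost(hCol);
      printf("both async copies fenced\n");
      return 0;
    }
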
- cudaCheckError(cudaDeviceSynchronize()); + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + break; + } + case gpuOffloadType::unified: { + // Ensure all data resides on host once work has completed + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_), + cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_), + cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId, s1_)); + + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_), + cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_), + cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId, s2_)); + + cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * (*C_nnz_), + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * (*C_nnz_), + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1), + cudaCpuDeviceId, s3_)); + // Ensure device has finished all work. + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } } /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { - // Destroy the handle - cusparseDestroy(handle_); - - // Destroy streams after use - cudaCheckError(cudaStreamDestroy(s1_)); - cudaCheckError(cudaStreamDestroy(s2_)); - cudaCheckError(cudaStreamDestroy(s3_)); + if (offload_ == gpuOffloadType::unified) { + // Destroy the handle + cusparseDestroy(handle_); + + // Destroy streams after use + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); + } - cudaFree(A_); - cudaFree(B_); - cudaFree(C_); + if (offload_ == gpuOffloadType::unified) { + cudaFree(A_val_); + cudaFree(A_col_); + cudaFree(A_row_); + cudaFree(B_val_); + cudaFree(B_col_); + cudaFree(B_row_); + cudaFree(C_val_); + cudaFree(C_col_); + cudaFree(C_row_); + } } bool rMat(T* M, int n, int x1, int x2, int y1, int y2, diff --git a/include/doGemm.hh b/include/doGemm.hh index 5565fb2..0e4dcc0 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -516,23 +516,23 @@ class doGemm { time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - // Perform the GPU kernels - // - ONCE : Offload to/from GPU once before all iterations and once - // after - spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); - time_checksum_gflop gpuResult_once = gemmGpu_.compute(); - gpuResult_once.gflops = - calcGflops(flops, iterations_, gpuResult_once.runtime); - - // - ALWAYS: Offload to/from GPU every iteration - spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); - time_checksum_gflop gpuResult_always = gemmGpu_.compute(); - gpuResult_always.gflops = - calcGflops(flops, iterations_, gpuResult_always.runtime); - // - UNIFIED : data passed from host to device (and device to host) as - // needed +// // Perform the GPU kernels +// // - ONCE : Offload to/from GPU once before all iterations and once +// // after +// spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); +// time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); +// gpuResult_once.gflops = +// calcGflops(flops, iterations_, gpuResult_once.runtime); +// +// // - ALWAYS: Offload to/from GPU every iteration +// 
spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); +// time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); +// gpuResult_always.gflops = +// calcGflops(flops, iterations_, gpuResult_always.runtime); +// // - UNIFIED : data passed from host to device (and device to host) as +// // needed spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); - time_checksum_gflop gpuResult_unified = gemmGpu_.compute(); + time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); @@ -541,11 +541,11 @@ class doGemm { // Write lines to CSV file writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); - writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, - iterations_, gpuResult_once.runtime, gpuResult_once.gflops); - writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, - iterations_, gpuResult_always.runtime, - gpuResult_always.gflops); +// writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, +// iterations_, gpuResult_once.runtime, gpuResult_once.gflops); +// writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, +// iterations_, gpuResult_always.runtime, +// gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); From bcd7ae88a01ec199951162c3fdba2d41817edff9 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:23:02 +0100 Subject: [PATCH 09/38] rebasing --- cuBLAS/common.hh | 13 ++ cuBLAS/sp_gemm.hh | 576 ++++++++++++++++++++++++++++++++++------------ include/doGemm.hh | 34 +-- 3 files changed, 458 insertions(+), 165 deletions(-) diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh index 78d0270..70d58fb 100644 --- a/cuBLAS/common.hh +++ b/cuBLAS/common.hh @@ -2,6 +2,9 @@ #if defined GPU_CUBLAS +#include "cusparse.h" + +/** Macro function to check if error occurred when calling cuBLAS. */ /** Macro function to check if error occurred when calling CUDA. 
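
The cusparseCheckError macro added in this patch wraps the usual status test around each library call. Expanded once by hand, it is equivalent to the sketch below (cusparseGetErrorString is assumed available, as it is in recent cuSPARSE releases):

    #include <cusparse.h>
    #include <cstdio>

    // Sketch: explicit status handling, equivalent to one expansion of the
    // cusparseCheckError macro introduced below.
    int main() {
      cusparseHandle_t handle;
      cusparseStatus_t status = cusparseCreate(&handle);
      if (status != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "CUSPARSE error: %s\n", cusparseGetErrorString(status));
        return 1;
      }
      cusparseDestroy(handle);
      return 0;
    }
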
*/ #define cudaCheckError(f) \ do { \ @@ -22,4 +25,14 @@ } \ } while (false) +#define cusparseCheckError(f) \ + do { \ + cusparseStatus_t status = (f); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + std::cout << "CUSPARSE error: " << __FILE__ << ":" << __LINE__ << ": " \ + << cusparseGetErrorString(status) << std::endl; \ + exit(1); \ + } \ + } while (false) \ + #endif \ No newline at end of file diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index fa0e39d..0879966 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -34,12 +34,9 @@ class sp_gemm_gpu : public sp_gemm<T> { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { - std::cout << "Initialising" << std::endl; - offload_ = offload; + std::cout << "_/_/_/_/ Initialising for problem size: " << n << std::endl; - // Create a handle for cuSPARSE - cusparseCreate(&handle_); - std::cout << "Handle created" << std::endl; + offload_ = offload; if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F; @@ -52,45 +49,51 @@ class sp_gemm_gpu : public sp_gemm<T> { // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); - std::cout << "GPU device got" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); cudaCheckError(cudaStreamCreate(&s3_)); - std::cout << "Streams created" << std::endl; - if (offload_ == gpuOffloadType::unified) { - std::cout << "Into unified if statement" << std::endl; - A_num_rows_ = (int*)malloc(sizeof(int)); - A_num_cols_ = (int*)malloc(sizeof(int)); - A_nnz_ = (int*)malloc(sizeof(int)); - B_num_rows_ = (int*)malloc(sizeof(int)); - B_num_cols_ = (int*)malloc(sizeof(int)); - B_nnz_ = (int*)malloc(sizeof(int)); - C_num_rows_ = (int*)malloc(sizeof(int)); - C_num_cols_ = (int*)malloc(sizeof(int)); - C_nnz_ = (int*)malloc(sizeof(int)); - } // Work out number of edges needed to achieve target sparsity int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - (*A_nnz_) = (*B_nnz_) = edges; + A_nnz_ = B_nnz_ = edges; if (offload_ == gpuOffloadType::unified) { - std::cout << "beginning mallocs" << std::endl; - cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * (*A_nnz_))); - std::cout << "A vals vectors malloced" << std::endl; - cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * (*A_nnz_))); - std::cout << "A cols vectors malloced" << std::endl; + cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); + cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_)); cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1))); - std::cout << "A CSR vectors malloced" << std::endl; - cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * (*B_nnz_))); - cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * (*B_nnz_))); + cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * B_nnz_)); + cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * B_nnz_)); cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1))); - std::cout << "B CSR vectors malloced" << std::endl; + + cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); + C_val_ = NULL; + C_col_ = NULL; + } else { + A_val_ = (T*)malloc(sizeof(T) * A_nnz_); + A_col_ = (int*)malloc(sizeof(int) * A_nnz_); + A_row_ = (int*)malloc(sizeof(int) * (n_ + 1)); + + B_val_ = (T*)malloc(sizeof(T) * B_nnz_); + B_col_ = (int*)malloc(sizeof(int) * B_nnz_); + B_row_ = (int*)malloc(sizeof(int) * (n_ + 1)); + + C_row_ 
= (int*)malloc(sizeof(int) * (n_ + 1)); + + + cudaCheckError(cudaMalloc((void**)&A_val_dev_, sizeof(T) * A_nnz_)); + cudaCheckError(cudaMalloc((void**)&A_col_dev_, sizeof(int) * A_nnz_)); + cudaCheckError(cudaMalloc((void**)&A_row_dev_, sizeof(int) * (n_ + 1))); + + cudaCheckError(cudaMalloc((void**)&B_val_dev_, sizeof(T) * B_nnz_)); + cudaCheckError(cudaMalloc((void**)&B_col_dev_, sizeof(int) * B_nnz_)); + cudaCheckError(cudaMalloc((void**)&B_row_dev_, sizeof(int) * (n_ + 1))); + + cudaCheckError(cudaMalloc((void**)&C_row_dev_, sizeof(int) * (n_ + 1))); } // Initialise the host matricies @@ -113,75 +116,116 @@ class sp_gemm_gpu : public sp_gemm<T> { std::uniform_real_distribution<double> dist(0.0, 1.0); // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < (*A_nnz_); i++) { - while (!rMat(A_, n, 0, n - 1, 0, n - 1, + for (int i = 0; i < A_nnz_; i++) { + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} } - for (int i = 0; i < (*B_nnz_); i++) { - while (!rMat(B_, n, 0, n - 1, 0, n - 1, + for (int i = 0; i < B_nnz_; i++) { + while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} } - toCSR(A_, n, n, (*A_nnz_), A_val_, A_col_, A_row_); - toCSR(B_, n, n, (*B_nnz_), B_val_, B_col_, B_row_); - } + toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_); + + toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_); + +// std::cout << "_____Matrix A_____" << std::endl; +// printDenseMatrix(A_, n_, n_); +// std::cout << std::endl << std::endl; +// printCSR(A_val_, A_col_, A_row_, A_nnz_, n_, n_); +// +// +// std::cout << "_____Matrix B_____" << std::endl; +// printDenseMatrix(B_, n_, n_); +// std::cout << std::endl << std::endl; +// printCSR(B_val_, B_col_, B_row_, B_nnz_, n_, n_); + // Create a handle for cuSPARSE + cusparseCheckError(cusparseCreate(&handle_)); + } private: /** Perform any required steps before calling the GEMM kernel that should * be timed. 
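
preLoopRequirements() below creates the matrix descriptors that postLoopRequirements() later destroys, once per offload run. That create/use/destroy lifecycle for a single CSR descriptor can be sketched in isolation as follows (toy 4x4 diagonal data, all names illustrative):

    #include <cuda_runtime.h>
    #include <cusparse.h>
    #include <cstdio>

    int main() {
      const int n = 4, nnz = 4;
      int hRow[n + 1] = {0, 1, 2, 3, 4}, hCol[nnz] = {0, 1, 2, 3};
      float hVal[nnz] = {1.f, 2.f, 3.f, 4.f};

      int *dRow, *dCol; float* dVal;
      cudaMalloc(&dRow, sizeof(hRow));
      cudaMalloc(&dCol, sizeof(hCol));
      cudaMalloc(&dVal, sizeof(hVal));
      cudaMemcpy(dRow, hRow, sizeof(hRow), cudaMemcpyHostToDevice);
      cudaMemcpy(dCol, hCol, sizeof(hCol), cudaMemcpyHostToDevice);
      cudaMemcpy(dVal, hVal, sizeof(hVal), cudaMemcpyHostToDevice);

      cusparseSpMatDescr_t A;
      cusparseCreateCsr(&A, n, n, nnz, dRow, dCol, dVal,
                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
      // ... descriptor usable for SpGEMM/SpMV here ...
      cusparseDestroySpMat(A);

      cudaFree(dRow);
      cudaFree(dCol);
      cudaFree(dVal);
      printf("descriptor lifecycle ok\n");
      return 0;
    }
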
*/
   void preLoopRequirements() override {
+    std::cout << "\t\tPreLoop" << std::endl;
+    cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_));
     switch(offload_) {
       case gpuOffloadType::always: {
+        // Make matrix descriptors
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+                                  A_col_dev_, A_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
+                                  B_col_dev_, B_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
+                                  rType_, cType_, indType_, cudaDataType_));
         break;
       }
       case gpuOffloadType::once: {
+        cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) *
+        A_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) *
+        A_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_
+        + 1), cudaMemcpyHostToDevice, s1_));
+
+        cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) *
+        B_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) *
+        B_nnz_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_
+        + 1), cudaMemcpyHostToDevice, s1_));
+
+        // Create matrix descriptors
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+                                  A_col_dev_, A_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
+                                  B_col_dev_, B_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
+                                  rType_, cType_, indType_, cudaDataType_));
         break;
       }
       case gpuOffloadType::unified: {
         // Prefetch memory to device
-        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_,
                                             gpuDevice_, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_,
                                             gpuDevice_, s1_));
         cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1),
                                             gpuDevice_, s1_));

-        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_,
                                             gpuDevice_, s2_));
-        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_),
+        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_,
                                             gpuDevice_, s2_));
         cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1),
                                             gpuDevice_, s2_));
-        //
-        //        cudaCheckError(cudaMemPrefetchAsync(C_num_rows_, sizeof(int), gpuDevice_,
-        //        s3_));
-        //        cudaCheckError(cudaMemPrefetchAsync(C_num_cols_, sizeof(int), gpuDevice_,
-        //        s3_));
-        //        cudaCheckError(cudaMemPrefetchAsync(C_nnz_, sizeof(int), gpuDevice_,
-        //        s3_));
-        //        cudaCheckError(cudaMemPrefetchAsync(&C_val_, sizeof(T) * edges, gpuDevice_,
-        //        s3_));
-        //        cudaCheckError(cudaMemPrefetchAsync(&C_col_, sizeof(int) * edges,
-        //        gpuDevice_, s3_));
-        //        cudaCheckError(cudaMemPrefetchAsync(&C_row_, sizeof(int) * edges,
-        //        gpuDevice_, s3_));
-
-        // Create the CSR matrices on the device
-        cusparseCreateCsr(&descrA_, n_, n_, (*A_nnz_), A_row_, A_col_, A_val_,
-                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
-                          CUSPARSE_INDEX_BASE_ZERO, cudaDataType_);
-        cusparseCreateCsr(&descrB_, n_, n_, (*B_nnz_), B_row_, B_col_, B_val_,
-                          CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - cusparseCreateCsr(&descrC_, n_, n_, 0, NULL, NULL, NULL, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, cudaDataType_); - - cusparseSpGEMM_createDescr(&spgemmDesc_); + + // Make matrix descriptors + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_, A_col_, + A_val_, rType_, cType_, indType_, + cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_, B_col_, + B_val_, rType_, cType_, indType_, + cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); break; } } @@ -189,55 +233,208 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Make a call to the BLAS Library Kernel. */ void callGemm() override { + std::cout << "\t\tcallGemm" << std::endl; switch(offload_) { case gpuOffloadType::always: { + cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * + A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) * + A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s1_)); + + cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * + B_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * + B_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s1_)); + + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer1_, buffer_size1_)); + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + buffer1_)); + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer2_, buffer_size2_)); + + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, + cudaDataType_, alg_, spgemmDesc_, + &buffer_size2_, buffer2_)); + + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); + + cusparseCheckError( + cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_, + C_val_dev_)); + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s1_)); + + cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) * + 
B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) * + B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + + C_val_ = (T*)malloc(sizeof(T) * C_nnz_); + C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaDeviceSynchronize()); + + // Freeing memory + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + free(C_val_); + free(C_col_); break; } case gpuOffloadType::once: { + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer1_, buffer_size1_)); + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, alg_, + spgemmDesc_, &buffer_size1_, + buffer1_)); + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, + NULL)); + cudaCheckError(cudaMalloc((void**)&buffer2_, buffer_size2_)); + + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, buffer2_)); + + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); + + cusparseCheckError( + cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_, + C_val_dev_)); + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, descrC_, + cudaDataType_, alg_, spgemmDesc_)); + + // Freeing memory + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); break; } case gpuOffloadType::unified: { - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, NULL); - cudaCheckError(cudaMallocManaged(&buffer1_, buffer_size1_)); - cusparseSpGEMM_workEstimation(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, - descrA_, descrB_, &beta, descrC_, - cudaDataType_, CUSPARSE_SPGEMM_DEFAULT, - spgemmDesc_, &buffer_size1_, buffer1_); - cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size1_, + NULL)); + cudaCheckError(cudaMallocManaged((void**)&buffer1_, buffer_size1_)); + cusparseCheckError( + cusparseSpGEMM_workEstimation(handle_, opA_, 
opB_, &alpha, + descrA_, descrB_, &beta, + descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size1_, + buffer1_)); + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_, &buffer_size2_, + NULL)); + cudaCheckError(cudaMallocManaged((void**)&buffer2_, buffer_size2_)); + + cusparseCheckError( + cusparseSpGEMM_compute(handle_, opA_, opB_, &alpha, descrA_, descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, NULL); - cudaCheckError(cudaMallocManaged(&buffer2_, buffer_size2_)); + alg_, spgemmDesc_, &buffer_size2_, buffer2_)); - if (cusparseSpGEMM_compute(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_, - &buffer_size2_, buffer2_) - == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { - std::cout << "Insufficient resources" << std::endl; - exit(1); - } - - int64_t rows, cols, nnz; - - cusparseSpMatGetSize(descrC_, &rows, &cols, &nnz); - (*C_nnz_) = nnz; - cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnz)); - cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnz)); - cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1))); - - cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_); - cusparseSpGEMM_copy(handle_, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA_, - descrB_, &beta, descrC_, CUDA_R_32F, - CUSPARSE_SPGEMM_DEFAULT, spgemmDesc_); + cusparseCheckError( + cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_)); + + cusparseCheckError( + cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_)); + cusparseCheckError( + cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, + descrB_, &beta, descrC_, cudaDataType_, + alg_, spgemmDesc_)); + + + cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_, + cudaCpuDeviceId, s3_)); + cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_, + cudaCpuDeviceId, s3_)); + + // Freeing memory + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); + cudaCheckError(cudaFree(C_val_)); + cudaCheckError(cudaFree(C_col_)); break; } } @@ -246,33 +443,63 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Perform any required steps after calling the GEMM kernel that should * be timed. 
*/ void postLoopRequirements() override { + std::cout << "\t\tPostLoop" << std::endl; + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); + // Destroying descriptors + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); switch(offload_) { case gpuOffloadType::always: { break; } case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) * + A_nnz_, cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s1_)); + + cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) * + B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) * + B_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + + C_val_ = (T*)malloc(sizeof(T) * C_nnz_); + C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * + (n_ + 1), cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaDeviceSynchronize()); + + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + free(C_val_); + free(C_col_); break; } case gpuOffloadType::unified: { // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * (*A_nnz_), + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * (*A_nnz_), + cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_, cudaCpuDeviceId, s1_)); cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * (*B_nnz_), + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_, cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * (*B_nnz_), + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_, cudaCpuDeviceId, s2_)); cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * (*C_nnz_), - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * (*C_nnz_), - cudaCpuDeviceId, s3_)); cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s3_)); // Ensure device has finished all work. @@ -285,26 +512,39 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. 
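 * Note the split below: buffers created with cudaMallocManaged (the
 * unified path) must still be released with cudaFree(), while the other
 * offload paths pair free() on the host copies with cudaFree() on the
 * corresponding *_dev_ mirrors.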
*/ void postCallKernelCleanup() override { - if (offload_ == gpuOffloadType::unified) { - // Destroy the handle - cusparseDestroy(handle_); + std::cout << "\t\tPostCall" << std::endl << std::endl; + // Destroy the handle + cusparseCheckError(cusparseDestroy(handle_)); + + // Destroy streams after use + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); - // Destroy streams after use - cudaCheckError(cudaStreamDestroy(s1_)); - cudaCheckError(cudaStreamDestroy(s2_)); - cudaCheckError(cudaStreamDestroy(s3_)); - } if (offload_ == gpuOffloadType::unified) { - cudaFree(A_val_); - cudaFree(A_col_); - cudaFree(A_row_); - cudaFree(B_val_); - cudaFree(B_col_); - cudaFree(B_row_); - cudaFree(C_val_); - cudaFree(C_col_); - cudaFree(C_row_); + cudaCheckError(cudaFree(A_val_)); + cudaCheckError(cudaFree(A_col_)); + cudaCheckError(cudaFree(A_row_)); + cudaCheckError(cudaFree(B_val_)); + cudaCheckError(cudaFree(B_col_)); + cudaCheckError(cudaFree(B_row_)); + cudaCheckError(cudaFree(C_row_)); + } else { + free(A_val_); + free(A_col_); + free(A_row_); + free(B_val_); + free(B_col_); + free(B_row_); + free(C_row_); + cudaCheckError(cudaFree(A_val_dev_)); + cudaCheckError(cudaFree(A_col_dev_)); + cudaCheckError(cudaFree(A_row_dev_)); + cudaCheckError(cudaFree(B_val_dev_)); + cudaCheckError(cudaFree(B_col_dev_)); + cudaCheckError(cudaFree(B_row_dev_)); + cudaCheckError(cudaFree(C_row_dev_)); } } @@ -356,13 +596,10 @@ class sp_gemm_gpu : public sp_gemm<T> { void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, int* row_ptr) { int nnz_encountered = 0; - int prev_row_ptr = 0; for (int row = 0; row < n_row; row++) { - if (nnz_encountered >= nnz) break; - row_ptr[row] = prev_row_ptr; + row_ptr[row] = nnz_encountered; int nnz_row = 0; for (int col = 0; col < n_col; col++) { - if (nnz_encountered >= nnz) break; if (dense[(row * n_col) + col] != 0.0) { nnz_row++; col_index[nnz_encountered] = col; @@ -370,10 +607,41 @@ class sp_gemm_gpu : public sp_gemm<T> { nnz_encountered++; } } - prev_row_ptr += nnz_row; } + row_ptr[n_row] = nnz_encountered; } + + // ToDo -- the two following functons are useful for debugging. I'm + // keeping them in to that end, though they are not used by the benchmark + // itself + void printDenseMatrix(T* M, int rows, int cols) { + for (int row = 0; row < rows; row++) { + std::cout << "| "; + for (int col = 0; col < cols; col++) { + std::cout << M[(row * cols) + col] << " | "; + } + std::cout << std::endl; + } + } + + void printCSR(T* values, int* col_indices, int* row_pointers, int nnz, + int rows, int cols) { + std::cout << "\tRow pointers__" << std::endl; + for (int p = 0; p < (rows + 1); p++) { + std::cout << row_pointers[p] << ", "; + } + std::cout << std::endl << "\tColumn Indices__" << std::endl; + for (int i = 0; i < nnz; i++) { + std::cout << col_indices[i] << ", "; + } + std::cout << std::endl << "\tValues__" << std::endl; + for (int v = 0; v < nnz; v++) { + std::cout << values[v] << ", "; + } + std::cout << std::endl; + } + /** Handle used when calling cuBLAS. 
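 * (Strictly a cuSPARSE context: a cusparseHandle_t created with
 * cusparseCreate() and torn down with cusparseDestroy().)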
*/ cusparseHandle_t handle_; @@ -396,29 +664,34 @@ class sp_gemm_gpu : public sp_gemm<T> { T* A_val_; int* A_col_; int* A_row_; - int* A_num_rows_; - int* A_num_cols_; - int* A_nnz_; + int64_t A_num_rows_; + int64_t A_num_cols_; + int64_t A_nnz_; T* B_val_; int* B_col_; int* B_row_; - int* B_num_rows_; - int* B_num_cols_; - int* B_nnz_; + int64_t B_num_rows_; + int64_t B_num_cols_; + int64_t B_nnz_; T* C_val_; int* C_col_; int* C_row_; - int* C_num_rows_; - int* C_num_cols_; - int*C_nnz_; + int64_t C_num_rows_; + int64_t C_num_cols_; + int64_t C_nnz_; /** CSR format vectors for matrices A, B and C on the device. */ - int* A_num_rows_dev_, A_num_cols_dev_, A_nnz_dev_, B_num_rows_dev_, - B_num_cols_dev_, B_nnz_dev_, C_num_rows_dev_, C_num_cols_dev_, C_nnz_dev_; - T* A_val_dev_, B_val_dev_, C_val_dev_; - int* A_col_dev_, A_row_dev_, B_col_dev_, B_row_dev_, C_col_dev_, C_row_dev_; + T* A_val_dev_; + T* B_val_dev_; + T* C_val_dev_; + int* A_col_dev_; + int* A_row_dev_; + int* B_col_dev_; + int* B_row_dev_; + int* C_col_dev_; + int* C_row_dev_; /** The constant value Alpha. */ const T alpha = ALPHA; @@ -439,6 +712,13 @@ class sp_gemm_gpu : public sp_gemm<T> { size_t buffer_size2_ = 0; void* buffer1_ = NULL; void* buffer2_ = NULL; + + cusparseOperation_t opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseOperation_t opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpGEMMAlg_t alg_ = CUSPARSE_SPGEMM_DEFAULT; + cusparseIndexType_t rType_ = CUSPARSE_INDEX_32I; + cusparseIndexType_t cType_ = CUSPARSE_INDEX_32I; + cusparseIndexBase_t indType_ = CUSPARSE_INDEX_BASE_ZERO; }; } // namespace gpu #endif \ No newline at end of file diff --git a/include/doGemm.hh b/include/doGemm.hh index 0e4dcc0..9a66329 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -517,20 +517,20 @@ class doGemm { cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // // Perform the GPU kernels + // - ALWAYS: Offload to/from GPU every iteration + spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); + time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); // // - ONCE : Offload to/from GPU once before all iterations and once // // after -// spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); -// time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); -// gpuResult_once.gflops = -// calcGflops(flops, iterations_, gpuResult_once.runtime); -// -// // - ALWAYS: Offload to/from GPU every iteration -// spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); -// time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); -// gpuResult_always.gflops = -// calcGflops(flops, iterations_, gpuResult_always.runtime); -// // - UNIFIED : data passed from host to device (and device to host) as -// // needed + spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); + time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); gpuResult_unified.gflops = @@ -541,11 +541,11 @@ class doGemm { // Write lines to CSV file writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); -// writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, -// iterations_, 
gpuResult_once.runtime, gpuResult_once.gflops); -// writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, -// iterations_, gpuResult_always.runtime, -// gpuResult_always.gflops); + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, + iterations_, gpuResult_once.runtime, gpuResult_once.gflops); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, + iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); From 2ffee16635466c3315f7c1cf075846c190041581 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 26 Mar 2024 12:55:10 +0000 Subject: [PATCH 10/38] All implemented and running. No checksum at the end --- cuBLAS/sp_gemm.hh | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 0879966..fbd08fd 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -325,10 +325,12 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaDeviceSynchronize()); // Freeing memory - cudaCheckError(cudaFree(buffer1_)); - cudaCheckError(cudaFree(buffer2_)); cudaCheckError(cudaFree(C_val_dev_)); cudaCheckError(cudaFree(C_col_dev_)); + cudaCheckError(cudaFree(buffer1_)); + cudaCheckError(cudaFree(buffer2_)); + buffer_size1_ = 0; + buffer_size2_ = 0; free(C_val_); free(C_col_); break; @@ -380,8 +382,12 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaDataType_, alg_, spgemmDesc_)); // Freeing memory + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); + buffer_size1_ = 0; + buffer_size2_ = 0; break; } case gpuOffloadType::unified: { @@ -414,6 +420,8 @@ class sp_gemm_gpu : public sp_gemm<T> { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); + if (C_val_ != NULL) cudaCheckError(cudaFree(C_val_)); + if (C_val_ != NULL) cudaCheckError(cudaFree(C_col_)); cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_)); @@ -425,16 +433,11 @@ class sp_gemm_gpu : public sp_gemm<T> { alg_, spgemmDesc_)); - cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_, - cudaCpuDeviceId, s3_)); - cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_, - cudaCpuDeviceId, s3_)); - // Freeing memory cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); - cudaCheckError(cudaFree(C_val_)); - cudaCheckError(cudaFree(C_col_)); + buffer_size1_ = 0; + buffer_size2_ = 0; break; } } @@ -468,20 +471,9 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s2_)); - C_val_ = (T*)malloc(sizeof(T) * C_nnz_); - C_col_ = (int*)malloc(sizeof(int) * C_nnz_); - cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * - C_nnz_, cudaMemcpyDeviceToHost, s3_)); - cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * - C_nnz_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaDeviceSynchronize()); - - cudaCheckError(cudaFree(C_val_dev_)); - cudaCheckError(cudaFree(C_col_dev_)); - free(C_val_); - free(C_col_); break; } case gpuOffloadType::unified: { @@ -675,8 +667,8 @@ class 
sp_gemm_gpu : public sp_gemm<T> { int64_t B_num_cols_; int64_t B_nnz_; - T* C_val_; - int* C_col_; + T* C_val_ = NULL; + int* C_col_ = NULL; int* C_row_; int64_t C_num_rows_; int64_t C_num_cols_; From 064ec5756f4b524d45e8bc2f94dbdf82412375d5 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 26 Mar 2024 12:57:45 +0000 Subject: [PATCH 11/38] Removing print statements --- cuBLAS/sp_gemm.hh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index fbd08fd..01c6edb 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -34,11 +34,8 @@ class sp_gemm_gpu : public sp_gemm<T> { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { - std::cout << "_/_/_/_/ Initialising for problem size: " << n << std::endl; - offload_ = offload; - if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F; else if (std::is_same_v<T, double>) cudaDataType_ = CUDA_R_64F; else { @@ -151,7 +148,6 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - std::cout << "\t\tPreLoop" << std::endl; cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); switch(offload_) { case gpuOffloadType::always: { @@ -233,7 +229,6 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Make a call to the BLAS Library Kernel. */ void callGemm() override { - std::cout << "\t\tcallGemm" << std::endl; switch(offload_) { case gpuOffloadType::always: { cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * @@ -446,7 +441,6 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - std::cout << "\t\tPostLoop" << std::endl; cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); @@ -504,7 +498,6 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { - std::cout << "\t\tPostCall" << std::endl << std::endl; // Destroy the handle cusparseCheckError(cusparseDestroy(handle_)); From 88a053f2ea565e1753d671c4ddcee9ba45a80c3b Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 29 Mar 2024 12:35:53 +0000 Subject: [PATCH 12/38] Removing print statements --- cuBLAS/sp_gemm.hh | 116 +++++++++++++++++++++++++++++----------------- include/doGemm.hh | 20 ++++---- 2 files changed, 84 insertions(+), 52 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 01c6edb..db9cf29 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -24,7 +24,7 @@ class sp_gemm_gpu : public sp_gemm<T> { using sp_gemm<T>::C_; using sp_gemm<T>::offload_; - // ToDo -- just unified implemented so far. Fill in Always and Once later + // ToDo -- No checksum for sparse yet. Nedd to do /** Initialise the required data structures. 
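 * Inputs are generated dense and converted to the CSR triple
 * (vals, col_index, row_ptr) by toCSR(). Worked example for a 3x3 input
 * with rows {5,0,0}, {0,8,3}, {0,0,6}: vals = {5, 8, 3, 6},
 * col_index = {0, 1, 2, 2}, row_ptr = {0, 1, 3, 4}. The target nnz per
 * input matrix is 1 + n_*n_*(1 - sparsity), so n_ = 1024 at sparsity
 * 0.99 stores about 10,486 values.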
* `offload` refers to the data offload type: @@ -42,7 +42,7 @@ class sp_gemm_gpu : public sp_gemm<T> { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } - n_ = n; + n_ = n * 20; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); @@ -93,6 +93,10 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaMalloc((void**)&C_row_dev_, sizeof(int) * (n_ + 1))); } + C_mem_allocated_always_ = false; + C_mem_allocated_once_ = false; + C_mem_allocated_unified_ = false; + // Initialise the host matricies // cusparseSpGEMM() works on CSR format only. This helpfully makes our // sparse matrix format decision for us! @@ -148,21 +152,9 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); + switch(offload_) { case gpuOffloadType::always: { - // Make matrix descriptors - cusparseCheckError( - cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, - A_col_dev_, A_val_dev_, rType_, cType_, - indType_, cudaDataType_)); - cusparseCheckError( - cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, - B_col_dev_, B_val_dev_, rType_, cType_, - indType_, cudaDataType_)); - cusparseCheckError( - cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, - rType_, cType_, indType_, cudaDataType_)); break; } case gpuOffloadType::once: { @@ -174,11 +166,14 @@ class sp_gemm_gpu : public sp_gemm<T> { + 1), cudaMemcpyHostToDevice, s1_)); cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ - + 1), cudaMemcpyHostToDevice, s1_)); + + 1), cudaMemcpyHostToDevice, s2_)); + + cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s3_)); // Craete matrix descriptors cusparseCheckError( @@ -225,6 +220,7 @@ class sp_gemm_gpu : public sp_gemm<T> { break; } } + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_)); } /** Make a call to the BLAS Library Kernel. 
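 * Every offload path follows the same cuSPARSE SpGEMM recipe:
 * cusparseSpGEMM_workEstimation is called twice (first with a NULL
 * buffer to query buffer_size1_, then again with the allocated
 * buffer1_), cusparseSpGEMM_compute twice in the same query-then-run
 * style for buffer2_, then cusparseSpMatGetSize to learn C's nnz so its
 * arrays can be allocated, and finally cusparseCsrSetPointers plus
 * cusparseSpGEMM_copy to materialise C.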
*/ @@ -239,16 +235,27 @@ class sp_gemm_gpu : public sp_gemm<T> { + 1), cudaMemcpyHostToDevice, s1_)); cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * - B_nnz_, cudaMemcpyHostToDevice, s1_)); + B_nnz_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ - + 1), cudaMemcpyHostToDevice, s1_)); + + 1), cudaMemcpyHostToDevice, s2_)); + + cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_ + + 1), cudaMemcpyHostToDevice, s3_)); + // Make matrix descriptors cusparseCheckError( - cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - alg_, spgemmDesc_)); + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + A_col_dev_, A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, + B_col_dev_, B_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); cusparseCheckError( cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, @@ -280,10 +287,10 @@ class sp_gemm_gpu : public sp_gemm<T> { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); - cusparseCheckError( - cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, - &C_nnz_)); - + if (C_mem_allocated_always_) { + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + } cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); @@ -309,8 +316,14 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + if (C_mem_allocated_always_) { + free(C_val_); + free(C_col_); + } C_val_ = (T*)malloc(sizeof(T) * C_nnz_); C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + C_mem_allocated_always_ = true; + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * C_nnz_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * @@ -320,22 +333,13 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaDeviceSynchronize()); // Freeing memory - cudaCheckError(cudaFree(C_val_dev_)); - cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); buffer_size1_ = 0; buffer_size2_ = 0; - free(C_val_); - free(C_col_); break; } case gpuOffloadType::once: { - cusparseCheckError( - cusparseSpGEMM_copy(handle_, opA_, opB_, &alpha, descrA_, - descrB_, &beta, descrC_, cudaDataType_, - alg_, spgemmDesc_)); - cusparseCheckError( cusparseSpGEMM_workEstimation(handle_, opA_, opB_, &alpha, descrA_, descrB_, &beta, @@ -365,8 +369,13 @@ class sp_gemm_gpu : public sp_gemm<T> { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); + if (C_mem_allocated_once_) { + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); + } cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); + C_mem_allocated_once_ = true; cusparseCheckError( cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_, @@ -377,8 +386,6 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaDataType_, alg_, spgemmDesc_)); // Freeing memory - 
cudaCheckError(cudaFree(C_val_dev_)); - cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(buffer1_)); cudaCheckError(cudaFree(buffer2_)); buffer_size1_ = 0; @@ -415,10 +422,14 @@ class sp_gemm_gpu : public sp_gemm<T> { cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, &C_nnz_)); - if (C_val_ != NULL) cudaCheckError(cudaFree(C_val_)); - if (C_val_ != NULL) cudaCheckError(cudaFree(C_col_)); + if (C_mem_allocated_unified_) { + cudaCheckError(cudaFree(C_val_)); + cudaCheckError(cudaFree(C_col_)); + } + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_)); cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_)); + C_mem_allocated_unified_ = true; cusparseCheckError( cusparseCsrSetPointers(descrC_, C_row_, C_col_, C_val_)); @@ -445,7 +456,6 @@ class sp_gemm_gpu : public sp_gemm<T> { // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); cusparseCheckError(cusparseDestroySpMat(descrB_)); - cusparseCheckError(cusparseDestroySpMat(descrC_)); switch(offload_) { case gpuOffloadType::always: { break; @@ -465,12 +475,19 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + C_val_ = (T*)malloc(sizeof(T) * C_nnz_); + C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * + C_nnz_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaDeviceSynchronize()); break; } case gpuOffloadType::unified: { + cusparseCheckError(cusparseDestroySpMat(descrC_)); // Ensure all data resides on host once work has completed cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, cudaCpuDeviceId, s1_)); @@ -486,6 +503,10 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s2_)); +// cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_, +// cudaCpuDeviceId, s3_)); +// cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_, +// cudaCpuDeviceId, s3_)); cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s3_)); // Ensure device has finished all work. 
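// Aside: cudaCheckError / cusparseCheckError come from the project's
// common headers, which these patches do not touch. A minimal sketch of
// the assumed pattern -- illustrative only, the real macros may differ:

#define cudaCheckError(call)                                            \
  do {                                                                  \
    cudaError_t err = (call);                                           \
    if (err != cudaSuccess) {                                           \
      std::cout << "CUDA error " << cudaGetErrorString(err) << " at "   \
                << __FILE__ << ":" << __LINE__ << std::endl;            \
      exit(1);                                                          \
    }                                                                   \
  } while (false)

#define cusparseCheckError(call)                                        \
  do {                                                                  \
    cusparseStatus_t stat = (call);                                     \
    if (stat != CUSPARSE_STATUS_SUCCESS) {                              \
      std::cout << "cuSPARSE error " << cusparseGetErrorString(stat)    \
                << " at " << __FILE__ << ":" << __LINE__ << std::endl;  \
      exit(1);                                                          \
    }                                                                   \
  } while (false)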
@@ -506,7 +527,6 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaStreamDestroy(s2_)); cudaCheckError(cudaStreamDestroy(s3_)); - if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaFree(A_val_)); cudaCheckError(cudaFree(A_col_)); @@ -514,6 +534,8 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaFree(B_val_)); cudaCheckError(cudaFree(B_col_)); cudaCheckError(cudaFree(B_row_)); + cudaCheckError(cudaFree(C_val_)); + cudaCheckError(cudaFree(C_col_)); cudaCheckError(cudaFree(C_row_)); } else { free(A_val_); @@ -522,6 +544,8 @@ class sp_gemm_gpu : public sp_gemm<T> { free(B_val_); free(B_col_); free(B_row_); + free(C_val_); + free(C_col_); free(C_row_); cudaCheckError(cudaFree(A_val_dev_)); cudaCheckError(cudaFree(A_col_dev_)); @@ -529,6 +553,8 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaFree(B_val_dev_)); cudaCheckError(cudaFree(B_col_dev_)); cudaCheckError(cudaFree(B_row_dev_)); + cudaCheckError(cudaFree(C_val_dev_)); + cudaCheckError(cudaFree(C_col_dev_)); cudaCheckError(cudaFree(C_row_dev_)); } } @@ -678,6 +704,10 @@ class sp_gemm_gpu : public sp_gemm<T> { int* C_col_dev_; int* C_row_dev_; + bool C_mem_allocated_always_; + bool C_mem_allocated_once_; + bool C_mem_allocated_unified_; + /** The constant value Alpha. */ const T alpha = ALPHA; diff --git a/include/doGemm.hh b/include/doGemm.hh index 9a66329..8743314 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -516,25 +516,27 @@ class doGemm { time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); -// // Perform the GPU kernels + // Perform the GPU kernels + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); + time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + // - ALWAYS: Offload to/from GPU every iteration spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); -// // - ONCE : Offload to/from GPU once before all iterations and once -// // after + // - ONCE : Offload to/from GPU once before all iterations and once + // after spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); - // - UNIFIED : data passed from host to device (and device to host) as - // needed - spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); - time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); - gpuResult_unified.gflops = - calcGflops(flops, iterations_, gpuResult_unified.runtime); // ToDo -- non-default GPU operations From 5b04a2c93e88ff4438770cfb9828ce681e364c92 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Mon, 1 Apr 2024 09:59:01 +0100 Subject: [PATCH 13/38] rebasing --- cuBLAS/sp_gemm.hh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index db9cf29..0848bb6 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -155,6 +155,18 @@ class sp_gemm_gpu : public sp_gemm<T> { switch(offload_) { case gpuOffloadType::always: { + // Make matrix descriptors + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + 
A_col_dev_, A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, + B_col_dev_, B_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + cusparseCheckError( + cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, + rType_, cType_, indType_, cudaDataType_)); break; } case gpuOffloadType::once: { From 23d318b7e066902bae676bf438f4141746fe79dc Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:26:37 +0100 Subject: [PATCH 14/38] rebasing --- include/doGemm.hh | 44 ++++++++++++++---------- include/main.hh | 2 +- oneMKL/CPU/sp_gemm.hh | 79 +++++++++++++++++++++++++++++++++++++++++++ src/main.cc | 3 +- 4 files changed, 108 insertions(+), 20 deletions(-) create mode 100644 oneMKL/CPU/sp_gemm.hh diff --git a/include/doGemm.hh b/include/doGemm.hh index 8743314..8153651 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -267,9 +267,7 @@ class doGemm { if (doCPU_ && doGPU_) { // Print offload results to stdout printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); - } #endif - // Square x Short and Wide // Re-initialise offload threshold structures & previous results cpuGpu_always_ = cpuGpu_offloadThreshold(); @@ -295,7 +293,7 @@ class doGemm { } #endif - // Square sparse matrix - sparse matrix multiplication +// Square sparse matrix - sparse matrix multiplication cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); @@ -309,6 +307,12 @@ class doGemm { } // Close file csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && dpGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square"); + } +#endif } private: @@ -512,14 +516,20 @@ class doGemm { const uint64_t flops = calcFlops(N, N, N); std::string kernelName = getKernelName(); - spGemmCpu_.initialise(N, sparsity); - time_checksum_gflop cpuResult = spGemmCpu_.compute(); - cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - - // Perform the GPU kernels - +#if CPU_ENABLED + if (doCPU_) { + spGemmCpu_.initialise(N, sparsity); + time_checksum_gflop cpuResult = spGemmCpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, + cpuResult.runtime, cpuResult.gflops); + } +#endif +#if GPU_ENABLED + // Perform the GPU kernels // - UNIFIED : data passed from host to device (and device to host) as // needed + if (doGPU_) { spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); gpuResult_unified.gflops = @@ -536,13 +546,9 @@ class doGemm { time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); - - // ToDo -- non-default GPU operations // Write lines to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, @@ -551,6 +557,10 @@ class doGemm { writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); + + } +#endif + } /** A function for calculating FLOPs 
performed by a GEMM. @@ -589,7 +599,7 @@ class doGemm { } /** Print to stdout the offload thresholds. */ - void printOffloadThreshold(std::string problemName) const { + void printOffloadThreshold(const std::string& problemName) const { std::vector<std::string> header = { "Device", "M", "N", "K", "Total Prob. Size (KiB)", "GFLOP/s", "CPU GFLOP/s"}; @@ -686,16 +696,14 @@ class doGemm { #if CPU_ENABLED /** The GEMM CPU kernel. */ cpu::gemm_cpu<T> gemmCpu_; + cpu::sp_gemm_cpu<T> spGemmCpu_; #endif - cpu::sp_gemm_cpu<T> spGemmCpu_; - #if GPU_ENABLED /** The GEMM GPU kernel. */ gpu::gemm_gpu<T> gemmGpu_; -#endif - gpu::sp_gemm_gpu<T> spGemmGpu_; +#endif /** The point at which offloading to GPU (offload once) becomes worthwhile. */ cpuGpu_offloadThreshold cpuGpu_once_; diff --git a/include/main.hh b/include/main.hh index cc0bb8f..f12ebcb 100644 --- a/include/main.hh +++ b/include/main.hh @@ -15,4 +15,4 @@ void printBenchmarkConfig(const int iters, const int upperLimit); int parseInt(const char* str); /** A function which parsen the runtime arguments. */ -void getParameters(int argc, char* argv[]); \ No newline at end of file +void getParameters(int argc, char** argv); \ No newline at end of file diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh new file mode 100644 index 0000000..847006b --- /dev/null +++ b/oneMKL/CPU/sp_gemm.hh @@ -0,0 +1,79 @@ +#pragma once + +#ifdef CPU_ONEMKL +#include <mkl.h> + +#include <algorithm> + +#include "../../include/kernels/CPU/sp_gemm.hh" +#include "../../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template <typename T> +class sp_gemm_cpu : public sp_gemm<T> { + public: + using sp_gemm<T>::sp_gemm; + using sp_gemm<T>::initInputMatrices; + using sp_gemm<T>::callConsume; + using sp_gemm<T>::m_; + using sp_gemm<T>::n_; + using sp_gemm<T>::k_; + using sp_gemm<T>::A_; + using sp_gemm<T>::B_; + using sp_gemm<T>::C_; + + /** Initialise the required data structures. */ + void initialise(int m, int n, int k) { + m_ = m; + n_ = n; + k_ = k; + + A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64); + B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64); + C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64); + + // Initialise the matricies + initInputMatrices(); + } + + private: + /** Make call to the GEMM kernel. */ + void callGemm() override { + if constexpr (std::is_same_v<T, float>) { + cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_, + (float)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_), + (float)BETA, C_, std::max(1, m_)); + } else if constexpr (std::is_same_v<T, double>) { + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_, + (double)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_), + (double)BETA, C_, std::max(1, m_)); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported." + << std::endl; + exit(1); + } + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override {} + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override {} + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. 
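+ * Memory obtained with mkl_malloc() must be released with mkl_free()
+ * rather than plain free(); mkl_free_buffers() additionally releases
+ * MKL's internal per-thread scratch buffers.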
*/ + void postCallKernelCleanup() override { + mkl_free_buffers(); + mkl_free(A_); + mkl_free(B_); + mkl_free(C_); + } +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/src/main.cc b/src/main.cc index 38e2b5a..a4eb55b 100644 --- a/src/main.cc +++ b/src/main.cc @@ -1,6 +1,7 @@ #include "../include/main.hh" int iters = 10; +int startDim = 1; int upperLimit = 128; bool sgemm = true; bool dgemm = true; @@ -115,7 +116,7 @@ int parseInt(const char* str) { return strlen(next) ? -1 : value; } -void getParameters(int argc, char* argv[]) { +void getParameters(int argc, char** argv) { for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i")) { if (++i >= argc || (iters = parseInt(argv[i])) < 0) { From be9094c3c28399ac44658d92941b4923323850f5 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:32:57 +0100 Subject: [PATCH 15/38] rebasing --- createGflopsGraphs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/createGflopsGraphs.py b/createGflopsGraphs.py index 0ed7772..d323162 100644 --- a/createGflopsGraphs.py +++ b/createGflopsGraphs.py @@ -199,7 +199,7 @@ plt.margins(x=0.01, y=0.01) leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) - for obj in leg.legendHandles: + for obj in leg.legend_handles: obj.set_linewidth(3.0) obj.set_markersize(15.0) obj.set_markeredgewidth(3.0) From 7cfa7be9e278995be6d50a1ad00b9146b3996f79 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 3 Apr 2024 10:22:51 +0100 Subject: [PATCH 16/38] Tidying up spGEMM classes to remove duplicated code --- cuBLAS/sp_gemm.hh | 90 ++------------------------------- include/kernels/CPU/sp_gemm.hh | 72 ++------------------------ include/kernels/gemm.hh | 92 ++++++++++++++++++++++++++++++++++ oneMKL/CPU/sp_gemm.hh | 9 ++-- 4 files changed, 102 insertions(+), 161 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 0848bb6..992b018 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -18,6 +18,8 @@ template <typename T> class sp_gemm_gpu : public sp_gemm<T> { public: using sp_gemm<T>::sp_gemm; + using sp_gemm<T>::initInputMatricesSparse; + using sp_gemm<T>::toCSR; using sp_gemm<T>::n_; using sp_gemm<T>::A_; using sp_gemm<T>::B_; @@ -55,8 +57,7 @@ class sp_gemm_gpu : public sp_gemm<T> { // Work out number of edges needed to achieve target sparsity - int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); - A_nnz_ = B_nnz_ = edges; + A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity)); if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); @@ -105,28 +106,7 @@ class sp_gemm_gpu : public sp_gemm<T> { // Set initial values to 0 A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); - for (int i = 0; i < (n_ * n_); i++) { - A_[i] = 0.0; - B_[i] = 0.0; - } - - // Random number generator objects for use in descent - std::default_random_engine gen; - gen.seed(std::chrono::system_clock::now() - .time_since_epoch().count()); - std::uniform_real_distribution<double> dist(0.0, 1.0); - - // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < A_nnz_; i++) { - while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } - for (int i = 0; i < B_nnz_; i++) { - while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } + initInputMatricesSparse(sparsity); toCSR(A_, n_, n_, 
A_nnz_, A_val_, A_col_, A_row_); @@ -571,68 +551,6 @@ class sp_gemm_gpu : public sp_gemm<T> { } } - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, - std::uniform_real_distribution<double> dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } - } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); - } - } - return true; - } - - void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, - int* row_ptr) { - int nnz_encountered = 0; - for (int row = 0; row < n_row; row++) { - row_ptr[row] = nnz_encountered; - int nnz_row = 0; - for (int col = 0; col < n_col; col++) { - if (dense[(row * n_col) + col] != 0.0) { - nnz_row++; - col_index[nnz_encountered] = col; - vals[nnz_encountered] = dense[(row * n_col) + col]; - nnz_encountered++; - } - } - } - row_ptr[n_row] = nnz_encountered; - } // ToDo -- the two following functons are useful for debugging. 
I'm diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index 3de5ea5..6d9d011 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -11,6 +11,8 @@ namespace cpu { class sp_gemm : public ::gemm<T> { public: using ::gemm<T>::gemm; + using ::gemm<T>::initInputMatricesSparse; + using ::gemm<T>::toCSR; using ::gemm<T>::m_; using ::gemm<T>::n_; using ::gemm<T>::k_; @@ -27,78 +29,10 @@ namespace cpu { B_ = (T*)malloc(sizeof(T) * n_ * n_); C_ = (T*)malloc(sizeof(T) * n_ * n_); - // Set initial values to 0 - for (int i = 0; i < (n_ * n_); i++) { - A_[i] = 0.0; - B_[i] = 0.0; - } - - // Random number generator objects for use in descent - std::default_random_engine gen; - gen.seed(std::chrono::system_clock::now() - .time_since_epoch().count()); - std::uniform_real_distribution<double> dist(0.0, 1.0); - - // Work out number of edges needed to achieve target sparsity - int edges = 1 + (int) (n * n * (1 - sparsity)); - - // Initialise the matrices - // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < edges; i++) { - while (!rMat(A_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - while (!rMat(B_, n, 0, n - 1, 0, n - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } + initInputMatricesSparse(sparsity); } private: - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, - std::uniform_real_distribution<double> dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / - 100.0) - 50.0); - return true; - } - } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); - } - } - return true; - } /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. 
*/ void postCallKernelCleanup() { diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 4eda90f..59a9898 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -4,6 +4,7 @@ #include <chrono> #include <cmath> #include <limits> +#include <random> #include "../utilities.hh" @@ -86,9 +87,100 @@ class gemm { } } + void initInputMatricesSparse(float sparsity) { + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + B_[i] = 0.0; + } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution<double> dist(0.0, 1.0); + + int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + for (int i = 0; i < edges; i++) { + while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, + 0.45, 0.22, 0.22, + &gen, dist, false)) {} + } + } + /** Call the extern consume() function. */ void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); } + /** Recursive function to populate sparse matrices */ + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, + float a, float b, float c, std::default_random_engine* gen, + std::uniform_real_distribution<double> dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + if (abs(M[(y1 * n) + x1]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + + void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, + int* row_ptr) { + int nnz_encountered = 0; + for (int row = 0; row < n_row; row++) { + row_ptr[row] = nnz_encountered; + int nnz_row = 0; + for (int col = 0; col < n_col; col++) { + if (dense[(row * n_col) + col] != 0.0) { + nnz_row++; + col_index[nnz_encountered] = col; + vals[nnz_encountered] = dense[(row * n_col) + col]; + nnz_encountered++; + } + } + } + row_ptr[n_row] = nnz_encountered; + } + /** The number of iterations to perform per problem size. 
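 * compute() re-runs the kernel this many times on the same operands and
 * reports the total elapsed time, which calcGflops() then combines with
 * the per-call FLOP count.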
*/ const int iterations_; diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh index 847006b..5ac6a70 100644 --- a/oneMKL/CPU/sp_gemm.hh +++ b/oneMKL/CPU/sp_gemm.hh @@ -14,20 +14,17 @@ template <typename T> class sp_gemm_cpu : public sp_gemm<T> { public: using sp_gemm<T>::sp_gemm; - using sp_gemm<T>::initInputMatrices; + using sp_gemm<T>::initInputMatricesSparse; + using sp_gemm<T>::toCSR; using sp_gemm<T>::callConsume; - using sp_gemm<T>::m_; using sp_gemm<T>::n_; - using sp_gemm<T>::k_; using sp_gemm<T>::A_; using sp_gemm<T>::B_; using sp_gemm<T>::C_; /** Initialise the required data structures. */ - void initialise(int m, int n, int k) { - m_ = m; + void initialise(int n, float sparsity) { n_ = n; - k_ = k; A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64); B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64); From 30d384e22573067f0b32ee7aeb30811a44b39781 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:39:46 +0100 Subject: [PATCH 17/38] rebasing --- cuBLAS/sp_gemm.hh | 17 +++++++-- include/doGemm.hh | 82 +++++++++++++++++++++++------------------ include/kernels/gemm.hh | 49 +++++++++--------------- src/main.cc | 4 +- 4 files changed, 80 insertions(+), 72 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 992b018..aa095f8 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -36,6 +36,7 @@ class sp_gemm_gpu : public sp_gemm<T> { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { + std::cout << "___________Initialising, problem size = " << n << std::endl; offload_ = offload; if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F; @@ -46,9 +47,11 @@ class sp_gemm_gpu : public sp_gemm<T> { } n_ = n * 20; + std::cout << "\tGetting device" << std::endl; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); + std::cout << "\tMaking streams" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); @@ -59,6 +62,7 @@ class sp_gemm_gpu : public sp_gemm<T> { // Work out number of edges needed to achieve target sparsity A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity)); + std::cout << "\tMallocing" << std::endl; if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_)); @@ -106,8 +110,11 @@ class sp_gemm_gpu : public sp_gemm<T> { // Set initial values to 0 A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); + + std::cout << "\tInitialising start matrices" << std::endl; initInputMatricesSparse(sparsity); + std::cout << "\tConverting to CSR" << std::endl; toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_); toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_); @@ -132,7 +139,7 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ void preLoopRequirements() override { - + std::cout << "\t\tpre loop" << std::endl; switch(offload_) { case gpuOffloadType::always: { // Make matrix descriptors @@ -217,6 +224,7 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Make a call to the BLAS Library Kernel. 
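 * For the `always` offload type the host-to-device copies and the
 * cusparseCreateCsr descriptor setup happen inside this call, so
 * per-iteration data movement is deliberately part of the timed region.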
*/ void callGemm() override { + std::cout << "\t\tGEMM" << std::endl; switch(offload_) { case gpuOffloadType::always: { cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * @@ -444,6 +452,7 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { + std::cout << "\t\tpost loop" << std::endl; cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); @@ -511,6 +520,7 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { + std::cout << "\t\tcleaning up" << std::endl; // Destroy the handle cusparseCheckError(cusparseDestroy(handle_)); @@ -519,6 +529,9 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaStreamDestroy(s2_)); cudaCheckError(cudaStreamDestroy(s3_)); + free(A_); + free(B_); + if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaFree(A_val_)); cudaCheckError(cudaFree(A_col_)); @@ -551,8 +564,6 @@ class sp_gemm_gpu : public sp_gemm<T> { } } - - // ToDo -- the two following functons are useful for debugging. I'm // keeping them in to that end, though they are not used by the benchmark // itself diff --git a/include/doGemm.hh b/include/doGemm.hh index 8153651..f4ec053 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -34,13 +34,16 @@ class doGemm { public: doGemm(const std::string csvDir, const int iters, const int startDim, const int upperLimit, const bool cpuEnabled = true, - const bool gpuEnabled = true) + const bool gpuEnabled = true, const bool doDense = true, + const bool doSparse = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), doCPU_(cpuEnabled), - doGPU_(gpuEnabled) + doGPU_(gpuEnabled), + doDense_(dense), + doSparse_(sparse), #if CPU_ENABLED , gemmCpu_(iterations_), @@ -59,27 +62,28 @@ class doGemm { /** Run all problem types and write data to CSV files. */ void collectData() { - // Square Problem Sizes... - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_square_square_M=N=K.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = dim, K = dim; - callDenseKernels(csvFile, dim, dim, dim); - } - // Close file - csvFile.close(); + if (doDense_) { + // Square Problem Sizes... 
+ // Re-initialise offload threshold structures + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_square_M=N=K.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = dim, K = dim; + callDenseKernels(csvFile, dim, dim, dim); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Square (M=N=K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Square (M=N=K)"); + } #endif // Rectangular Problem Sizes: @@ -267,6 +271,7 @@ class doGemm { if (doCPU_ && doGPU_) { // Print offload results to stdout printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); + } #endif // Square x Short and Wide // Re-initialise offload threshold structures & previous results @@ -292,27 +297,28 @@ class doGemm { printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); } #endif + } -// Square sparse matrix - sparse matrix multiplication - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + - "_sparse_square.csv"); - if (upperLimit_ >= 32) { - for (int dim = 1; dim <= upperLimit_; dim++) { - const int N = dim; - callSparseKernels(csvFile, N, 0.99); + if (doSparse_) { // Square sparse matrix - sparse matrix multiplication + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + + "_sparse_square.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.99); + } } - } - // Close file - csvFile.close(); + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && dpGPU_) { + if (doCPU_ && doGPU_) { // Print offload results to stdout printOffloadThreshold("Sparse Square"); } #endif + } } private: @@ -693,6 +699,10 @@ class doGemm { /** Whether the GPU kernels should be run. */ const bool doGPU_ = true; + /** Whether we should run dense and or sparse kernels */ + const bool doDense_; + const bool doSparse_; + #if CPU_ENABLED /** The GEMM CPU kernel. 
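 * Together with its sparse counterpart spGemmCpu_, this is only built
 * when CPU_ENABLED is defined and only exercised when doCPU_ (and the
 * matching doDense_ / doSparse_ flag) is set.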
*/ cpu::gemm_cpu<T> gemmCpu_; diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 59a9898..3ffc0d7 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -103,14 +103,8 @@ class gemm { // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { - while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} - } - for (int i = 0; i < edges; i++) { - while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, - 0.45, 0.22, 0.22, - &gen, dist, false)) {} + rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); + rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); } } @@ -118,23 +112,18 @@ class gemm { void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); } /** Recursive function to populate sparse matrices */ - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, - float a, float b, float c, std::default_random_engine* gen, + void rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, + float c, std::default_random_engine* gen, std::uniform_real_distribution<double> dist, bool bin) { // If a 1x1 submatrix, then add an edge and return out if (x1 >= x2 && y1 >= y2) { - if (abs(M[(y1 * n) + x1]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); - return true; - } + return; } else { // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); + int xMidPoint = (x1 == x2) ? x1 : x1 + floor((x2 - x1) / 2); + int yMidPoint = (y1 == y2) ? y1 : y1 + floor((y2 - y1) / 2); // ToDo -- add some noise to these values between iterations float newA = a; @@ -142,25 +131,23 @@ class gemm { float newC = c; // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height + // There are some ugly ternary operators here to avoid going out of + // bounds in the edge case that we are already at 1 width or 1 height float randomNum = dist(*gen); if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); + rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, gen, dist, + bin); } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); + rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, gen, dist, + bin); } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); + rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen, + dist, bin); } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? 
yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); + rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC, gen, + dist, bin); } } - return true; } void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, diff --git a/src/main.cc b/src/main.cc index a4eb55b..268b628 100644 --- a/src/main.cc +++ b/src/main.cc @@ -37,14 +37,14 @@ int main(int argc, char** argv) { // SGEMM Comparison std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl; doGemm<float> sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); + doGpu, sgemm, sp_sgemm); sgemm.collectData(); std::cout << "Finished!" << std::endl; // DGEMM Comparison std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl; doGemm<double> dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); + doGpu, dgemm, sp_dgemm); dgemm.collectData(); std::cout << "Finished!" << std::endl; From cc8e2a86347ca35b598b462724b5c3c71fb9a659 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:43:02 +0100 Subject: [PATCH 18/38] rebasing --- cuBLAS/sp_gemm.hh | 16 +++------------- include/doGemm.hh | 4 ++-- include/kernels/gemm.hh | 34 ++++++++++++++++++++-------------- src/main.cc | 32 ++++++++++++++++++-------------- 4 files changed, 43 insertions(+), 43 deletions(-) diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index aa095f8..2c787d9 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -36,7 +36,6 @@ class sp_gemm_gpu : public sp_gemm<T> { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int n, float sparsity) override { - std::cout << "___________Initialising, problem size = " << n << std::endl; offload_ = offload; if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F; @@ -45,13 +44,11 @@ class sp_gemm_gpu : public sp_gemm<T> { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } - n_ = n * 20; + n_ = n; - std::cout << "\tGetting device" << std::endl; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); - std::cout << "\tMaking streams" << std::endl; // Initialise 3 streams to asynchronously move data between host and device cudaCheckError(cudaStreamCreate(&s1_)); cudaCheckError(cudaStreamCreate(&s2_)); @@ -62,7 +59,6 @@ class sp_gemm_gpu : public sp_gemm<T> { // Work out number of edges needed to achieve target sparsity A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity)); - std::cout << "\tMallocing" << std::endl; if (offload_ == gpuOffloadType::unified) { cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_)); cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_)); @@ -111,13 +107,11 @@ class sp_gemm_gpu : public sp_gemm<T> { A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); - std::cout << "\tInitialising start matrices" << std::endl; initInputMatricesSparse(sparsity); - std::cout << "\tConverting to CSR" << std::endl; - toCSR(A_, n_, n_, A_nnz_, A_val_, A_col_, A_row_); + toCSR(A_, n_, n_, A_val_, A_col_, A_row_); - toCSR(B_, n_, n_, B_nnz_, B_val_, B_col_, B_row_); + toCSR(B_, n_, n_, B_val_, B_col_, B_row_); // std::cout << "_____Matrix A_____" << std::endl; @@ -139,7 +133,6 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Perform any required steps before calling the GEMM kernel that should * be timed. 
*/ void preLoopRequirements() override { - std::cout << "\t\tpre loop" << std::endl; switch(offload_) { case gpuOffloadType::always: { // Make matrix descriptors @@ -224,7 +217,6 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Make a call to the BLAS Library Kernel. */ void callGemm() override { - std::cout << "\t\tGEMM" << std::endl; switch(offload_) { case gpuOffloadType::always: { cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * @@ -452,7 +444,6 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - std::cout << "\t\tpost loop" << std::endl; cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); // Destroying descriptors cusparseCheckError(cusparseDestroySpMat(descrA_)); @@ -520,7 +511,6 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { - std::cout << "\t\tcleaning up" << std::endl; // Destroy the handle cusparseCheckError(cusparseDestroy(handle_)); diff --git a/include/doGemm.hh b/include/doGemm.hh index f4ec053..53bbb54 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -42,8 +42,8 @@ class doGemm { upperLimit_(upperLimit), doCPU_(cpuEnabled), doGPU_(gpuEnabled), - doDense_(dense), - doSparse_(sparse), + doDense_(doDense), + doSparse_(doSparse) #if CPU_ENABLED , gemmCpu_(iterations_), diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 3ffc0d7..230c7d3 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -103,8 +103,10 @@ class gemm { // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { - rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); - rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false); + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, + false)) {} + while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, + false)){} } } @@ -112,14 +114,18 @@ class gemm { void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); } /** Recursive function to populate sparse matrices */ - void rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, float c, std::default_random_engine* gen, std::uniform_real_distribution<double> dist, bool bin) { // If a 1x1 submatrix, then add an edge and return out if (x1 >= x2 && y1 >= y2) { - M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / + if (M[(int) (y1 * n) + x1] == 0) { + M[(int) (y1 * n) + x1] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); - return; + return true; + } else { + return false; + } } else { // Divide up the matrix int xMidPoint = (x1 == x2) ? 
x1 : x1 + floor((x2 - x1) / 2); @@ -135,22 +141,22 @@ class gemm { // bounds in the edge case that we are already at 1 width or 1 height float randomNum = dist(*gen); if (randomNum < a) { - rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, gen, dist, - bin); + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, + gen, dist, bin); } else if (randomNum < (a + b)) { - rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, gen, dist, - bin); + return rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, + gen, dist, bin); } else if (randomNum < (a + b + c)) { - rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen, - dist, bin); + return rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen, + dist, bin); } else { - rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC, gen, - dist, bin); + return rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC, + gen, dist, bin); } } } - void toCSR(T* dense, int n_col, int n_row, int nnz, T* vals, int* col_index, + void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index, int* row_ptr) { int nnz_encountered = 0; for (int row = 0; row < n_row; row++) { diff --git a/src/main.cc b/src/main.cc index 268b628..06fd48e 100644 --- a/src/main.cc +++ b/src/main.cc @@ -3,10 +3,10 @@ int iters = 10; int startDim = 1; int upperLimit = 128; -bool sgemm = true; -bool dgemm = true; -bool sp_sgemm = true; -bool sp_dgemm = true; +bool doSgemm = true; +bool doDgemm = true; +bool doSp_sgemm = true; +bool doSp_dgemm = true; bool doCpu = CPU_ENABLED; bool doGpu = GPU_ENABLED; @@ -37,14 +37,14 @@ int main(int argc, char** argv) { // SGEMM Comparison std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl; doGemm<float> sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu, sgemm, sp_sgemm); + doGpu, doSgemm, doSp_sgemm); sgemm.collectData(); std::cout << "Finished!" << std::endl; // DGEMM Comparison std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl; doGemm<double> dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu, dgemm, sp_dgemm); + doGpu, doDgemm, doSp_dgemm); dgemm.collectData(); std::cout << "Finished!" 
<< std::endl;
@@ -146,28 +146,28 @@ void getParameters(int argc, char** argv) {
     } else if (!strcmp(argv[i], "--no_gpu")) {
       doGpu = false;
     } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) {
-      sgemm = dgemm = sp_sgemm = sp_dgemm = false;
+      doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false;
       std::string kernelList = argv[++i];
       if (kernelList.find("sp-sgemm") != std::string::npos) {
-        sp_sgemm = true;
+        doSp_sgemm = true;
         if (kernelList.find("sgemm") != std::string::npos &&
             kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) {
-          sgemm = true;
+          doSgemm = true;
         }
       } else if (kernelList.find("sgemm") != std::string::npos) {
-        sgemm = true;
+        doSgemm = true;
       }
       if (kernelList.find("sp-dgemm") != std::string::npos) {
-        sp_dgemm = true;
+        doSp_dgemm = true;
         if (kernelList.find("dgemm") != std::string::npos &&
             kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) {
-          dgemm = true;
+          doDgemm = true;
        }
       } else if (kernelList.find("dgemm") != std::string::npos) {
-        dgemm = true;
+        doDgemm = true;
       }

-      if (!sgemm && !dgemm && !sp_sgemm && !sp_dgemm) {
+      if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) {
         std::cout << "ERROR - no implemented kernels in list" << std::endl;
         exit(1);
       }
@@ -200,6 +200,10 @@
     std::cout << "  -d  --dimension_limit D        Max value of M, N, K is D "
                  "(default: " << upperLimit << ")" << std::endl;
+    std::cout << "  -k  --kernels <kernels>        Comma-separated list of "
+                 "kernels to be run. Options are sgemm, dgemm, sp-sgemm, "
+                 "sp-dgemm (default: sgemm,dgemm,sp-sgemm,sp-dgemm)" <<
+                 std::endl;
     std::cout << std::endl;
     exit(0);
   } else {

From de56ae19b2934221195fdd4b020f0d33f97879a5 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com>
Date: Fri, 11 Oct 2024 15:44:12 +0100
Subject: [PATCH 19/38] rebasing

---
 cuBLAS/sp_gemm.hh       | 27 +++++++++++++++++++--------
 include/doGemm.hh       |  2 +-
 include/kernels/gemm.hh | 38 +++++++++++++++++++++-----------------
 3 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh
index 2c787d9..8bed12b 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/sp_gemm.hh
@@ -26,7 +26,7 @@ class sp_gemm_gpu : public sp_gemm<T> {
   using sp_gemm<T>::C_;
   using sp_gemm<T>::offload_;

-  // ToDo -- No checksum for sparse yet. Nedd to do
+  // ToDo -- No checksum for sparse yet. Need to do

   /** Initialise the required data structures.
    * `offload` refers to the data offload type:
@@ -44,7 +44,7 @@
       std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
       exit(1);
     }
-    n_ = n;
+    n_ = 100 * n;

     // Get device identifier
     cudaCheckError(cudaGetDevice(&gpuDevice_));
@@ -133,6 +133,7 @@
   /** Perform any required steps before calling the GEMM kernel that should
    * be timed. */
   void preLoopRequirements() override {
+    cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_));
     switch(offload_) {
       case gpuOffloadType::always: {
         // Make matrix descriptors
@@ -212,13 +213,17 @@
         break;
       }
     }
-    cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDesc_));
   }

   /** Make a call to the BLAS Library Kernel.
*/ void callGemm() override { switch(offload_) { case gpuOffloadType::always: { + if (C_mem_allocated_always_) { + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); + } cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * A_nnz_, cudaMemcpyHostToDevice, s1_)); cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) * @@ -235,6 +240,7 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_ + 1), cudaMemcpyHostToDevice, s3_)); + cudaCheckError(cudaDeviceSynchronize()); // Make matrix descriptors cusparseCheckError( @@ -444,10 +450,6 @@ class sp_gemm_gpu : public sp_gemm<T> { /** Perform any required steps after calling the GEMM kernel that should * be timed. */ void postLoopRequirements() override { - cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); - // Destroying descriptors - cusparseCheckError(cusparseDestroySpMat(descrA_)); - cusparseCheckError(cusparseDestroySpMat(descrB_)); switch(offload_) { case gpuOffloadType::always: { break; @@ -476,10 +478,14 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); + break; } case gpuOffloadType::unified: { - cusparseCheckError(cusparseDestroySpMat(descrC_)); // Ensure all data resides on host once work has completed cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, cudaCpuDeviceId, s1_)); @@ -503,9 +509,14 @@ class sp_gemm_gpu : public sp_gemm<T> { cudaCpuDeviceId, s3_)); // Ensure device has finished all work. cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseDestroySpMat(descrA_)); + cusparseCheckError(cusparseDestroySpMat(descrB_)); + cusparseCheckError(cusparseDestroySpMat(descrC_)); break; } } + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDesc_)); } /** Do any necessary cleanup (free pointers, close library handles, etc.) diff --git a/include/doGemm.hh b/include/doGemm.hh index 53bbb54..b89abee 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -307,7 +307,7 @@ class doGemm { "_sparse_square.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.99); + callSparseKernels(csvFile, dim, 0.9999); } } // Close file diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 230c7d3..2a971a0 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -106,7 +106,7 @@ class gemm { while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, - false)){} + false)) {} } } @@ -119,17 +119,19 @@ class gemm { std::uniform_real_distribution<double> dist, bool bin) { // If a 1x1 submatrix, then add an edge and return out if (x1 >= x2 && y1 >= y2) { - if (M[(int) (y1 * n) + x1] == 0) { - M[(int) (y1 * n) + x1] = (bin) ? 
1.0 : (((rand() % 10000) /
-                                       100.0) - 50.0);
-      return;
+      // Needed to avoid overflow segfaults with large problem sizes
+      uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1);
+      if (abs(M[index]) > 0.1) {
+        return false;
+      } else {
+        // Add 1.0 if this is a binary graph, and a random real number otherwise
+        M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0);
+        return true;
+      }
     } else {
       // Divide up the matrix
-      int xMidPoint = (x1 == x2) ? x1 : x1 + floor((x2 - x1) / 2);
-      int yMidPoint = (y1 == y2) ? y1 : y1 + floor((y2 - y1) / 2);
+      int xMidPoint = x1 + floor((x2 - x1) / 2);
+      int yMidPoint = y1 + floor((y2 - y1) / 2);

       // ToDo -- add some noise to these values between iterations
       float newA = a;
@@ -137,23 +139,25 @@
       float newC = c;

       // Work out which quarter to recurse into
-      // There are some ugly ternary operators here to avoid going out of
-      // bounds in the edge case that we are already at 1 width or 1 height
+      // There are some ugly ternary operators here to avoid going out of bounds in the edge case
+      // that we are already at 1 width or 1 height
       float randomNum = dist(*gen);
       if (randomNum < a) {
-        rMat(M, n, x1, xMidPoint, y1, yMidPoint, newA, newB, newC, gen, dist,
-             bin);
+        return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
+                    newA, newB, newC, gen, dist, bin);
       } else if (randomNum < (a + b)) {
-        rMat(M, n, xMidPoint, x2, y1, yMidPoint, newA, newB, newC, gen, dist,
-             bin);
+        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
+                    newA, newB, newC, gen, dist, bin);
       } else if (randomNum < (a + b + c)) {
-        rMat(M, n, x1, xMidPoint, yMidPoint, y2, newA, newB, newC, gen,
-             dist, bin);
+        return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
+                    newA, newB, newC, gen, dist, bin);
       } else {
-        return rMat(M, n, xMidPoint, x2, yMidPoint, y2, newA, newB, newC,
+        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
+                    ((yMidPoint < y2) ?
yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, gen, dist, bin); } } + return true; } void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index, From b972c23e4058c5d5e541b6d3f3e3424dc185f7b0 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:49:45 +0100 Subject: [PATCH 20/38] rebasing --- src/main.cc | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/main.cc b/src/main.cc index 06fd48e..51d1cf1 100644 --- a/src/main.cc +++ b/src/main.cc @@ -146,26 +146,26 @@ void getParameters(int argc, char** argv) { } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { - doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false; - std::string kernelList = argv[++i]; - if (kernelList.find("sp-sgemm") != std::string::npos) { - doSp_sgemm = true; - if (kernelList.find("sgemm") != std::string::npos && - kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) { - doSgemm = true; - } - } else if (kernelList.find("sgemm") != std::string::npos) { - doSgemm = true; - } - if (kernelList.find("sp-dgemm") != std::string::npos) { - doSp_dgemm = true; - if (kernelList.find("dgemm") != std::string::npos && - kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) { - doDgemm = true; - } - } else if (kernelList.find("dgemm") != std::string::npos) { - doDgemm = true; - } + doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false; + std::string kernelList = argv[++i]; + if (kernelList.find("sp-sgemm") != std::string::npos) { + doSp_sgemm = true; + if (kernelList.find("sgemm") != std::string::npos && + kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) { + doSgemm = true; + } + } else if (kernelList.find("sgemm") != std::string::npos) { + doSgemm = true; + } + if (kernelList.find("sp-dgemm") != std::string::npos) { + doSp_dgemm = true; + if (kernelList.find("dgemm") != std::string::npos && + kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) { + doDgemm = true; + } + } else if (kernelList.find("dgemm") != std::string::npos) { + doDgemm = true; + } if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) { std::cout << "ERROR - no implemented kernels in list" << std::endl; From 1f5f2ddebf774b9bd35b52ab29ef02cca6065ff3 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:50:03 +0100 Subject: [PATCH 21/38] rebasing --- calculateOffloadThreshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/calculateOffloadThreshold.py b/calculateOffloadThreshold.py index 38c2646..43028c0 100644 --- a/calculateOffloadThreshold.py +++ b/calculateOffloadThreshold.py @@ -165,7 +165,7 @@ def printResults(once:offloadThreshold, always:offloadThreshold, unified:offload gpuAlways.M = 0 gpuAlways.N = 0 gpuAlways.K = 0 - if(gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])): + if("gemm" in kernel and gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])): # Do check to see if this is a momentary drop that we should ignore if (prevGpuUgflops <= float(cpu[8])) and (float(gpuLines[2].split(',')[8]) <= float(cpu[8])): gpuUnified.cpuGflops = 0.0 From b06250c0ca7a8d14c2904d69a70da24f89824e5d Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:50:37 +0100 Subject: [PATCH 22/38] rebasing --- AOCL/sp_gemm.hh | 62 ++++++++++ cuBLAS/common.hh | 53 +++++++-- cuBLAS/sp_gemm.hh | 4 +- 
include/doGemm.hh | 4 +- include/kernels/CPU/sp_gemm.hh | 3 +- include/kernels/gemm.hh | 25 +++- oneMKL/CPU/sp_gemm.hh | 201 +++++++++++++++++++++++++++++---- 7 files changed, 320 insertions(+), 32 deletions(-) create mode 100644 AOCL/sp_gemm.hh diff --git a/AOCL/sp_gemm.hh b/AOCL/sp_gemm.hh new file mode 100644 index 0000000..3c6b5c0 --- /dev/null +++ b/AOCL/sp_gemm.hh @@ -0,0 +1,62 @@ +#pragma once + +#ifdef CPU_AOCL +#include <blis.h> + +#include "../include/kernels/CPU/gemm.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template <typename T> +class gemm_cpu : public gemm<T> { + public: + using gemm<T>::gemm; + using gemm<T>::callConsume; + using gemm<T>::m_; + using gemm<T>::n_; + using gemm<T>::k_; + using gemm<T>::A_; + using gemm<T>::B_; + using gemm<T>::C_; + + private: + /** Make call to the GEMM kernel. */ + void callGemm() override { + if constexpr (std::is_same_v<T, float>) { + bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, + rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), + &beta, C_, rowStride, std::max(1, m_)); + } else if constexpr (std::is_same_v<T, double>) { + bli_dgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, + rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), + &beta, C_, rowStride, std::max(1, m_)); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for AOCL CPU GEMM kernel not supported." + << std::endl; + exit(1); + } + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override {} + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override {} + + /** The constant value Alpha. */ + T alpha = ALPHA; + + /** The constant value Beta. */ + T beta = BETA; + + /** The distance in elements to the next column. */ + const int rowStride = 1; +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh index 70d58fb..c8086db 100644 --- a/cuBLAS/common.hh +++ b/cuBLAS/common.hh @@ -16,13 +16,52 @@ } while (false) /** Macro function to check if error occurred when calling cuBLAS. 
*/ -#define cublasCheckError(f) \ - do { \ - if (cublasStatus_t e = (f); e != CUBLAS_STATUS_SUCCESS) { \ - std::cout << "CUBLAS error: " << __FILE__ << ":" << __LINE__ << ": " \ - << cublasGetStatusString(e) << std::endl; \ - exit(1); \ - } \ +#define cublasCheckError(f) \ + do { \ + switch (f) { \ + case CUBLAS_STATUS_SUCCESS: \ + break; \ + case CUBLAS_STATUS_NOT_INITIALIZED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_NOT_INITIALIZED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_ALLOC_FAILED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_ALLOC_FAILED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_INVALID_VALUE: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_INVALID_VALUE" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_ARCH_MISMATCH: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_ARCH_MISMATCH" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_MAPPING_ERROR: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_MAPPING_ERROR" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_EXECUTION_FAILED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_EXECUTION_FAILED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_INTERNAL_ERROR: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_INTERNAL_ERROR" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_NOT_SUPPORTED: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_NOT_SUPPORTED" << std::endl; \ + exit(1); \ + case CUBLAS_STATUS_LICENSE_ERROR: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": CUBLAS_STATUS_LICENSE_ERROR" << std::endl; \ + exit(1); \ + default: \ + std::cout << "CUBLAS error: " << __FILE__ << ": " << __LINE__ \ + << ": other error not in switch statement" << std::endl; \ + exit(1); \ + } \ } while (false) #define cusparseCheckError(f) \ diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index 8bed12b..d849d22 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -109,9 +109,9 @@ class sp_gemm_gpu : public sp_gemm<T> { initInputMatricesSparse(sparsity); - toCSR(A_, n_, n_, A_val_, A_col_, A_row_); + toCSR_int(A_, n_, n_, A_val_, A_col_, A_row_); - toCSR(B_, n_, n_, B_val_, B_col_, B_row_); + toCSR_int(B_, n_, n_, B_val_, B_col_, B_row_); // std::cout << "_____Matrix A_____" << std::endl; diff --git a/include/doGemm.hh b/include/doGemm.hh index b89abee..e264273 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -303,8 +303,8 @@ class doGemm { cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); - csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + - "_sparse_square.csv"); + std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + "_sparse_square.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { callSparseKernels(csvFile, dim, 0.9999); diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index 6d9d011..60778e7 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -1,5 +1,6 @@ #pragma once +#ifdef CPU_ONEMKL #include "../gemm.hh" #include <random> @@ -41,4 +42,4 @@ namespace cpu { free(C_); } }; -} // namespace cpu \ No newline at end of file +} // 
namespace cpu
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index 2a971a0..d97fc8c 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -1,5 +1,9 @@
 #pragma once

+#ifdef CPU_ONEMKL
+#include <mkl.h>
+#endif
+
 #include <algorithm>
 #include <chrono>
 #include <cmath>
@@ -160,7 +164,7 @@ class gemm {
     return true;
   }

-  void toCSR(T* dense, int n_col, int n_row, T* vals, int* col_index,
+  void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index,
              int* row_ptr) {
     int nnz_encountered = 0;
     for (int row = 0; row < n_row; row++) {
@@ -178,6 +182,25 @@
     row_ptr[n_row] = nnz_encountered;
   }

+#ifdef CPU_ONEMKL
+  void toCSR_mkl(T* dense, int n_col, int n_row, T* vals, MKL_INT* col_index,
+                 MKL_INT* row_ptr) {
+    int nnz_encountered = 0;
+    for (int row = 0; row < n_row; row++) {
+      row_ptr[row] = (MKL_INT)nnz_encountered;
+      int nnz_row = 0;
+      for (int col = 0; col < n_col; col++) {
+        if (dense[(row * n_col) + col] != 0.0) {
+          nnz_row++;
+          col_index[nnz_encountered] = (MKL_INT)col;
+          vals[nnz_encountered] = dense[(row * n_col) + col];
+          nnz_encountered++;
+        }
+      }
+    }
+    row_ptr[n_row] = (MKL_INT)nnz_encountered;
+  }
+#endif

   /** The number of iterations to perform per problem size. */
   const int iterations_;
diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh
index 5ac6a70..0b4e32b 100644
--- a/oneMKL/CPU/sp_gemm.hh
+++ b/oneMKL/CPU/sp_gemm.hh
@@ -24,33 +24,146 @@ class sp_gemm_cpu : public sp_gemm<T> {

   /** Initialise the required data structures. */
   void initialise(int n, float sparsity) {
-    n_ = n;
-
     A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64);
     B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64);
     C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64);

+    n_ = n * 100;
+    nnz_ = (1 + (int)(n_ * n_ * (1 - sparsity)));
+
+    values_A_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN);
+    columns_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN);
+    rowIndex_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN);
+
+    values_B_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN);
+    columns_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN);
+    rowIndex_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN);
+
+    x_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);
+    y_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);
+    rslt_mv_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);
+    rslt_mv_trans_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN);

     // Initialise the matrices
-    initInputMatrices();
+    initInputMatricesSparse(sparsity);
+
+    descr_type_gen_.type = SPARSE_MATRIX_TYPE_GENERAL;
+
+    // Transfer from dense to CSR format
+    toCSR_mkl(A_, n_, n_, values_A_, columns_A_, rowIndex_A_);
+    toCSR_mkl(B_, n_, n_, values_B_, columns_B_, rowIndex_B_);
+
+    // ToDo -- Set values for x and y (which are vectors of length n_?)
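+    // [Editorial sketch] One possible answer to the ToDo above, assuming
+    // x_ and y_ are meant to be random probe vectors of length n_ for the
+    // verification performed in callGemm():
+    //   for (int i = 0; i < n_; i++) {
+    //     x_[i] = (T)((rand() % 100) / 10.0);
+    //     y_[i] = (T)((rand() % 100) / 10.0);
+    //   }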
+
+    if constexpr (std::is_same_v<T, float>) {
+      CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrA_,
+                                                    SPARSE_INDEX_BASE_ZERO, n_,
+                                                    n_, rowIndex_A_,
+                                                    rowIndex_A_+1, columns_A_,
+                                                    values_A_),
+                            "Error after MKL_SPARSE_S_CREATE_CSR for csrA\n");
+      CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrB_,
+                                                    SPARSE_INDEX_BASE_ZERO, n_,
+                                                    n_, rowIndex_B_,
+                                                    rowIndex_B_+1, columns_B_,
+                                                    values_B_),
+                            "Error after MKL_SPARSE_S_CREATE_CSR for csrB\n");
+    } else if constexpr (std::is_same_v<T, double>) {
+      CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrA_,
+                                                    SPARSE_INDEX_BASE_ZERO, n_,
+                                                    n_, rowIndex_A_,
+                                                    rowIndex_A_+1, columns_A_,
+                                                    values_A_),
+                            "Error after MKL_SPARSE_D_CREATE_CSR for csrA\n");
+      CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrB_,
+                                                    SPARSE_INDEX_BASE_ZERO, n_,
+                                                    n_, rowIndex_B_,
+                                                    rowIndex_B_+1, columns_B_,
+                                                    values_B_),
+                            "Error after MKL_SPARSE_D_CREATE_CSR for csrB\n");
+    } else {
+      std::cout << "ERROR - Datatype for OneMKL CPU spGEMM kernel not "
+                   "supported." << std::endl;
+      exit(1);
+    }
+
+    CALL_AND_CHECK_STATUS(mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE,
+                                          csrA_, csrB_, &csrC_),
+                          "Error after MKL_SPARSE_SPMM\n");
+
+    // ToDo -- check that transpose is what I want here
+    CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrA_,
+                                                 SPARSE_OPERATION_TRANSPOSE,
+                                                 descr_type_gen_, 1),
+                          "Error after MKL_SPARSE_SET_MV_HINT with csrA_\n");
+    CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrB_,
+                                                 SPARSE_OPERATION_NON_TRANSPOSE,
+                                                 descr_type_gen_, 1),
+                          "Error after MKL_SPARSE_SET_MV_HINT with csrB_\n");
+    CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrC_,
+                                                 SPARSE_OPERATION_NON_TRANSPOSE,
+                                                 descr_type_gen_, 1),
+                          "Error after MKL_SPARSE_SET_MV_HINT with csrC_\n");
+
+    CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrA_),
+                          "Error after MKL_SPARSE_OPTIMIZE with csrA_\n");
+    CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrB_),
+                          "Error after MKL_SPARSE_OPTIMIZE with csrB_\n");
+    CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrC_),
+                          "Error after MKL_SPARSE_OPTIMIZE with csrC_\n");
   }

  private:
   /** Make call to the GEMM kernel. */
   void callGemm() override {
     if constexpr (std::is_same_v<T, float>) {
-      cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_,
-                  (float)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_),
-                  (float)BETA, C_, std::max(1, m_));
-    } else if constexpr (std::is_same_v<T, double>) {
-      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_,
-                  (double)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_),
-                  (double)BETA, C_, std::max(1, m_));
-    } else {
-      // Un-specialised class will not do any work - print error and exit.
-      std::cout << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported."
-                << std::endl;
-      exit(1);
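+      // [Editorial note] The sparse path below verifies C = A * B with a
+      // randomised probe rather than a dense comparison, assuming x_ and y_
+      // hold probe vectors: left_ = y . (C * x) and
+      // right_ = (B * x) . (A^T * y) both equal y^T * A * B * x, so they
+      // should agree up to rounding whenever C is correct. residual_ below
+      // measures that agreement.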
+      CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_NON_TRANSPOSE,
+                                            1.0, csrC_, descr_type_gen_, x_,
+                                            0.0, rslt_mv_),
+                            "Error after MKL_SPARSE_S_MV for csrC_ * x_\n");
+      left_ = cblas_sdot(n_, rslt_mv_, 1, y_, 1);
+
+      CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_NON_TRANSPOSE,
+                                            1.0, csrB_, descr_type_gen_, x_,
+                                            0.0, rslt_mv_),
+                            "Error after MKL_SPARSE_S_MV for csrB_ * x_\n");
+      CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_TRANSPOSE, 1.0,
+                                            csrA_, descr_type_gen_, y_, 0.0,
+                                            rslt_mv_trans_),
+                            "Error after MKL_SPARSE_S_MV for csrA_ * y_\n");
+      right_ = cblas_sdot(n_, rslt_mv_, 1, rslt_mv_trans_, 1);
+
+      residual_ = fabs(left_ - right_) / (fabs(left_) + 1);
+
+      CALL_AND_CHECK_STATUS(mkl_sparse_s_export_csr(csrC_, &indexing_,
+                                                    &rows_, &cols_,
+                                                    &pointerB_C_,
+                                                    &pointerE_C_,
+                                                    &columns_C_, &values_C_),
+                            "Error after MKL_SPARSE_S_EXPORT_CSR\n");
+    } else if constexpr (std::is_same_v<T, double>) {
+      CALL_AND_CHECK_STATUS(mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE,
+                                            1.0, csrC_, descr_type_gen_, x_,
+                                            0.0, rslt_mv_),
+                            "Error after MKL_SPARSE_D_MV for csrC_ * x_\n");
+      left_ = cblas_ddot(n_, rslt_mv_, 1, y_, 1);
+
+      CALL_AND_CHECK_STATUS(mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE,
+                                            1.0, csrB_, descr_type_gen_, x_,
+                                            0.0, rslt_mv_),
+                            "Error after MKL_SPARSE_D_MV for csrB_ * x_\n");
+      CALL_AND_CHECK_STATUS(mkl_sparse_d_mv(SPARSE_OPERATION_TRANSPOSE, 1.0,
+                                            csrA_, descr_type_gen_, y_, 0.0,
+                                            rslt_mv_trans_),
+                            "Error after MKL_SPARSE_D_MV for csrA_ * y_\n");
+      right_ = cblas_ddot(n_, rslt_mv_, 1, rslt_mv_trans_, 1);
+
+      residual_ = fabs(left_ - right_) / (fabs(left_) + 1);
+
+      CALL_AND_CHECK_STATUS(mkl_sparse_d_export_csr(csrC_, &indexing_,
+                                                    &rows_, &cols_,
+                                                    &pointerB_C_,
+                                                    &pointerE_C_,
+                                                    &columns_C_, &values_C_),
+                            "Error after MKL_SPARSE_D_EXPORT_CSR\n");
     }
+
     // Ensure compiler doesn't optimise away the work being done
     callConsume();
   }
@@ -66,11 +179,61 @@ class sp_gemm_cpu : public sp_gemm<T> {
   /** Do any necessary cleanup (free pointers, close library handles, etc.)
    * after Kernel has been called. */
   void postCallKernelCleanup() override {
-    mkl_free_buffers();
-    mkl_free(A_);
-    mkl_free(B_);
-    mkl_free(C_);
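+    // [Editorial note] pointerB_C_, pointerE_C_, columns_C_ and values_C_
+    // come from mkl_sparse_?_export_csr and, to my understanding, point at
+    // memory owned by the csrC_ handle, so they are released by
+    // mkl_sparse_destroy(csrC_) below rather than by mkl_free().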
+ if (mkl_sparse_destroy(csrA_) != SPARSE_STATUS_SUCCESS) { + printf("Error after MKL_SPARSE_DESTROY, csrA_\n"); + fflush(0); + status = 1; + } + + mkl_free(values_A_); + mkl_free(columns_A_); + mkl_free(rowIndex_A_); + + if (mkl_sparse_destroy(csrB_) != SPARSE_STATUS_SUCCESS) { + printf("Error after MKL_SPARSE_DESTROY, csrB_\n"); + fflush(0); + status = 1; + } + + mkl_free(values_B_); + mkl_free(columns_B_); + mkl_free(rowIndex_B_); } + + int nnz_; + + MKL_INT* columns_A_; + MKL_INT* columns_B_; + MKL_INT* columns_C_; + MKL_INT* rowIndex_A_; + MKL_INT* rowIndex_B_; + MKL_INT* pointerB_C_; + MKL_INT* pointerE_C_; + + T* rslt_mv_; + T* rslt_mv_trans_; + T* x_; + T* y_; + + T left_, right_, residual_; + MKL_INT rows_, cols_, i_, j_, ii_, status_; + + sparse_index_base_t indexing_; + struct matrix_descr descr_type_gen_; + sparse_matrix_t csrA_, csrB_, csrC_; }; } // namespace cpu #endif \ No newline at end of file From 42bdc5846d6a5bac4f3270d62b258e0d021757aa Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 21 Aug 2024 11:05:52 +0100 Subject: [PATCH 23/38] Adding AOCL files --- AOCL/gemm.hh | 1 + AOCL/sp_gemm.hh | 32 ++++- ArmPL/sp_gemm.hh | 231 +++++++++++++++++++++++++++++++++ NVPL/sp_gemv.hh | 117 +++++++++++++++++ include/kernels/CPU/sp_gemm.hh | 71 +++++++++- include/kernels/gemm.hh | 22 ++++ 6 files changed, 464 insertions(+), 10 deletions(-) create mode 100644 ArmPL/sp_gemm.hh create mode 100644 NVPL/sp_gemv.hh diff --git a/AOCL/gemm.hh b/AOCL/gemm.hh index 3c6b5c0..f418bdc 100644 --- a/AOCL/gemm.hh +++ b/AOCL/gemm.hh @@ -23,6 +23,7 @@ class gemm_cpu : public gemm<T> { private: /** Make call to the GEMM kernel. */ void callGemm() override { + if constexpr (std::is_same_v<T, float>) { bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), diff --git a/AOCL/sp_gemm.hh b/AOCL/sp_gemm.hh index 3c6b5c0..4fc178b 100644 --- a/AOCL/sp_gemm.hh +++ b/AOCL/sp_gemm.hh @@ -28,9 +28,16 @@ class gemm_cpu : public gemm<T> { rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), &beta, C_, rowStride, std::max(1, m_)); } else if constexpr (std::is_same_v<T, double>) { - bli_dgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, - rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), - &beta, C_, rowStride, std::max(1, m_)); + // Todo -- base? + aoclsparse_create_dscr(&A_csr_, base, n_, n_, nnz_, cst_row_ptr_A_.data + (), csr_col_ind_A_.data(), csr_val_A_.data()); + aoclsparse_create_dscr(&B_csr_, base, n_, n_, nnz_, cst_row_ptr_B_.data + (), csr_col_ind_B_.data(), csr_val_B_.data()); + + aoclsparse_spmm(aoclsparse_operation_none, A_csr_, B_csr_, &C_csr_); + aoclsparse_export_dcsr(C_csr_, &base, &C_M_, &C_N_, &nnz_C_, + &csr_row_ptr_C_, &csr_col_ind_C_, (void**) + &csr_val_C_); } else { // Un-specialised class will not do any work - print error and exit. std::cout << "ERROR - Datatype for AOCL CPU GEMM kernel not supported." @@ -57,6 +64,25 @@ class gemm_cpu : public gemm<T> { /** The distance in elements to the next column. 
  /** The distance in elements to the next column. */
  const int rowStride = 1;
+
+  aoclsparse_matrix A_csr_;
+  aoclsparse_int* csr_row_ptr_A_;
+  aoclsparse_int* csr_col_ind_A_;
+  T* csr_val_A_;
+
+  aoclsparse_matrix B_csr_;
+  aoclsparse_int* csr_row_ptr_B_;
+  aoclsparse_int* csr_col_ind_B_;
+  T* csr_val_B_;
+
+  aoclsparse_matrix C_csr_;
+  aoclsparse_int* csr_row_ptr_C_;
+  aoclsparse_int* csr_col_ind_C_;
+  T* csr_val_C_;
+  aoclsparse_int C_M_;
+  aoclsparse_int C_N_;
+  aoclsparse_int nnz_C_;
+
+  aoclsparse_status status;
};
}  // namespace cpu
#endif
\ No newline at end of file
diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh
new file mode 100644
index 0000000..aba5814
--- /dev/null
+++ b/ArmPL/sp_gemm.hh
@@ -0,0 +1,231 @@
+#pragma once
+
+#ifdef CPU_ARMPL
+#include <stdio.h>
+#include <stdlib.h>
+#include <armpl.h>
+#include <omp.h>
+
+#include <algorithm>
+
+#include "../include/kernels/CPU/sp_gemm.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+/** A class for GEMM CPU BLAS kernels. */
+template <typename T>
+class sp_gemm_cpu : public sp_gemm<T> {
+ public:
+  using sp_gemm<T>::sp_gemm;
+  using sp_gemm<T>::callConsume;
+  using sp_gemm<T>::m_;
+  using sp_gemm<T>::n_;
+  using sp_gemm<T>::k_;
+  using sp_gemm<T>::A_;
+  using sp_gemm<T>::B_;
+  using sp_gemm<T>::C_;
+
+ private:
+  /** Make call to the GEMM kernel. */
+  void callGemm() override {
+
+    /**
+     * Flow of ARMPL Sparse LA:
+     *
+     * 1. Create sparse matrix objects: armpl_spmat_create_csr[sdcz]()
+     *
+     * 2. Supply hints on usage: armpl_spmat_hint()
+     *
+     * 3. Optimise for SpMV: armpl_spmv_optimize()
+     *
+     * 4. Solve SpMV case: armpl_spmv_exec_[sdcz]()
+     *
+     * 5. Destroy sparse matrix object: armpl_spmat_destroy()
+     *
+     * In addition, users can choose to update a set of non-zero values using
+     * armpl_spmat_update_[sdcz]()
+     */
+
+    // Todo -- See if using armpl_spmat_hint can improve performance here.
+    //  If so, follow with optimisation functions
+
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = armpl_spmm_exec_s(transA,
+                                  transB,
+                                  alpha,
+                                  A_armpl_,
+                                  B_armpl_,
+                                  beta,
+                                  C_armpl_);
+    } else if constexpr (std::is_same_v<T, double>) {
+      status_ = armpl_spmm_exec_d(transA,
+                                  transB,
+                                  alpha,
+                                  A_armpl_,
+                                  B_armpl_,
+                                  beta,
+                                  C_armpl_);
+    } else {
+      // Un-specialised class will not do any work - print error and exit.
+      std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported."
+                << std::endl;
+      exit(1);
+    }
+
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    // Ensure compiler doesn't optimise away the work being done
+    callConsume();
+  }
+
+  /** Perform any required steps before calling the GEMM kernel that should
+   * be timed. */
+  void preLoopRequirements() override {}
+
+  /** Perform any required steps after calling the GEMM kernel that should
+   * be timed. */
+  void postLoopRequirements() override {
+    status_ = armpl_spmat_destroy(A_armpl_);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_destroy(B_armpl_);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_destroy(C_armpl_);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+  }
+
+  /** The constant value Alpha. */
+  const T alpha = ALPHA;
+
+  /** The constant value Beta. */
+  const T beta = BETA;
+
+  armpl_status_t status_;
+
+  void toCSR() {
+    n_armpl_ = n_;
+    // ToDo -- check whether flags_ is correct!
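+    // [Editorial note] A flags value of 0 requests default creation
+    // behaviour; my reading of the ArmPL docs is that the library then takes
+    // an internal copy of the CSR arrays, whereas a flag such as
+    // ARMPL_SPARSE_CREATE_NOCOPY would avoid the copy at the cost of keeping
+    // the row/col/val buffers alive until the handles are destroyed.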
+    flags_ = 0;
+
+    // Move A to CSR
+    A_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
+    A_armpl_col_index_ = new armpl_int_t[nnz_];
+    A_vals_ = new T[nnz_];
+    int nnz_encountered = 0;
+    for (int row = 0; row < n_; row++) {
+      A_armpl_row_ptr_[row] = nnz_encountered;
+      for (int col = 0; col < n_; col++) {
+        if (A_[(row * n_) + col] != 0.0) {
+          A_armpl_col_index_[nnz_encountered] = col;
+          A_vals_[nnz_encountered] = A_[(row * n_) + col];
+          nnz_encountered++;
+        }
+      }
+    }
+
+    // Move B to CSR
+    B_armpl_row_ptr_ = new armpl_int_t[n_ + 1];
+    B_armpl_col_index_ = new armpl_int_t[nnz_];
+    B_vals_ = new T[nnz_];
+    nnz_encountered = 0;
+    for (int row = 0; row < n_; row++) {
+      B_armpl_row_ptr_[row] = nnz_encountered;
+      for (int col = 0; col < n_; col++) {
+        if (B_[(row * n_) + col] != 0.0) {
+          B_armpl_col_index_[nnz_encountered] = col;
+          B_vals_[nnz_encountered] = B_[(row * n_) + col];
+          nnz_encountered++;
+        }
+      }
+    }
+
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = armpl_spmat_create_csr_s(A_armpl_,
+                                         n_armpl_,
+                                         n_armpl_,
+                                         A_armpl_row_ptr_,
+                                         A_armpl_col_index_,
+                                         A_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+
+      status_ = armpl_spmat_create_csr_s(B_armpl_,
+                                         n_armpl_,
+                                         n_armpl_,
+                                         B_armpl_row_ptr_,
+                                         B_armpl_col_index_,
+                                         B_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    } else if constexpr (std::is_same_v<T, double>) {
+      status_ = armpl_spmat_create_csr_d(A_armpl_,
+                                         n_armpl_,
+                                         n_armpl_,
+                                         A_armpl_row_ptr_,
+                                         A_armpl_col_index_,
+                                         A_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+
+      status_ = armpl_spmat_create_csr_d(B_armpl_,
+                                         n_armpl_,
+                                         n_armpl_,
+                                         B_armpl_row_ptr_,
+                                         B_armpl_col_index_,
+                                         B_vals_,
+                                         flags_);
+      if (status_ != ARMPL_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    }
+  }
+
+  armpl_int_t flags_;
+
+  armpl_int_t n_armpl_;
+
+  armpl_int_t* A_armpl_row_ptr_;
+  armpl_int_t* A_armpl_col_index_;
+  armpl_int_t* B_armpl_row_ptr_;
+  armpl_int_t* B_armpl_col_index_;
+  armpl_int_t* C_armpl_row_ptr_;
+  armpl_int_t* C_armpl_col_index_;
+
+  T* A_vals_;
+  T* B_vals_;
+  T* C_vals_;
+
+  armpl_spmat_t* A_armpl_;
+  armpl_spmat_t* B_armpl_;
+  armpl_spmat_t* C_armpl_;
+
+  sparse_hint_value transA = ARMPL_SPARSE_OPERATION_NOTRANS;
+  sparse_hint_value transB = ARMPL_SPARSE_OPERATION_NOTRANS;
+
+};
+}  // namespace cpu
+#endif
\ No newline at end of file
diff --git a/NVPL/sp_gemv.hh b/NVPL/sp_gemv.hh
new file mode 100644
index 0000000..d04f6b8
--- /dev/null
+++ b/NVPL/sp_gemv.hh
@@ -0,0 +1,117 @@
+/**
+ * ToDo -- This is all currently written for GEMM, but NVPL does not support
+ * GEMM, so this needs to be adjusted to spmv -- which is supported
+ */
+
+#pragma once
+
+#ifdef CPU_NVPL
+#include <nvpl_sparse.h>
+
+#include "../include/kernels/CPU/sp_gemm.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+/** A class for GEMM CPU BLAS kernels. */
+template <typename T>
+class sp_gemm_cpu : public sp_gemm<T> {
+ public:
+  using sp_gemm<T>::sp_gemm;
+  using sp_gemm<T>::callConsume;
+  using sp_gemm<T>::m_;
+  using sp_gemm<T>::n_;
+  using sp_gemm<T>::k_;
+  using sp_gemm<T>::A_;
+  using sp_gemm<T>::B_;
+  using sp_gemm<T>::C_;
+
+ private:
+  /** Make call to the GEMM kernel. */
+  void callGemm() override {
+
+    // Ensure compiler doesn't optimise away the work being done
+    callConsume();
+  }
+
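+  // [Editorial sketch] Per the ToDo at the top of this file, callGemm()
+  // should eventually perform a sparse matrix-vector product. Following the
+  // cuSPARSE-style generic API that NVPL Sparse mirrors, the rough shape
+  // (names assumed, not verified against the NVPL headers) would be: create
+  // dense vector descriptors for x and y, query a work-buffer size, then run
+  // the SpMV with op_, alpha and beta.
+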
+  /** Perform any required steps before calling the GEMM kernel that should
+   * be timed. */
+  void preLoopRequirements() override {
+    // Set type enum
+    if constexpr (std::is_same_v<T, float>) {
+      type_ = NVPL_SPARSE_R_32F;
+    } else if constexpr (std::is_same_v<T, double>) {
+      type_ = NVPL_SPARSE_R_64F;
+    } else {
+      // Un-specialised class will not do any work - print error and exit.
+      std::cout << "ERROR - Datatype for NVPL sparse GEMM kernel not supported."
+                << std::endl;
+      exit(1);
+    }
+    status_ = nvpl_sparse_create(&handle_);
+    // Todo -- error check
+
+    // Todo -- Make const?
+    status_ = nvpl_sparse_create_csr(A_nvpl_, n_, n_, nnz_, A_row_ptr_nvpl_,
+                                     A_col_index_nvpl_, A_vals_nvpl_,
+                                     index_type_, index_type_, base_, type_);
+
+    status_ = nvpl_sparse_create_csr(B_nvpl_, n_, n_, nnz_, B_row_ptr_nvpl_,
+                                     B_col_index_nvpl_, B_vals_nvpl_,
+                                     index_type_, index_type_, base_, type_);
+    // Todo -- error check
+  }
+
+  /** Perform any required steps after calling the GEMM kernel that should
+   * be timed. */
+  void postLoopRequirements() override {
+    status_ = nvpl_sparse_destroy(handle_);
+    // Todo -- error check
+    status_ = nvpl_sparse_destroy_sp_mat(A_nvpl_);
+    status_ = nvpl_sparse_destroy_sp_mat(B_nvpl_);
+    status_ = nvpl_sparse_destroy_sp_mat(C_nvpl_);
+  }
+
+  /** The constant value Alpha. */
+  T alpha = ALPHA;
+
+  /** The constant value Beta. */
+  T beta = BETA;
+
+  /**
+   * Sparse metadata
+   */
+  nvpl_sparse_status_t status_;
+  nvpl_sparse_handle_t handle_;
+  nvpl_sparse_data_type_t type_;
+
+  nvpl_sparse_operation_t op_ = NVPL_SPARSE_OPERATION_NON_TRANSPOSE;
+  nvpl_sparse_index_base_t base_ = NVPL_SPARSE_INDEX_BASE_ZERO;
+  nvpl_sparse_format_t format_ = NVPL_SPARSE_FORMAT_CSR;
+  nvpl_sparse_order_t order_ = NVPL_SPARSE_ORDER_COL;
+  nvpl_sparse_index_type_t index_type_ = NVPL_SPARSE_INDEX_64I;
+
+  /**
+   * Sparse matrix descriptors
+   */
+  nvpl_sparse_sp_mat_descr_t* A_nvpl_;
+  nvpl_sparse_sp_mat_descr_t* B_nvpl_;
+  nvpl_sparse_sp_mat_descr_t* C_nvpl_;
+
+  void* A_row_ptr_nvpl_;
+  void* B_row_ptr_nvpl_;
+  void* C_row_ptr_nvpl_;
+  void* A_col_index_nvpl_;
+  void* B_col_index_nvpl_;
+  void* C_col_index_nvpl_;
+  void* A_vals_nvpl_;
+  void* B_vals_nvpl_;
+  void* C_vals_nvpl_;
+};
+}  // namespace cpu
+#endif
\ No newline at end of file
diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
index 60778e7..72fd5dc 100644
--- a/include/kernels/CPU/sp_gemm.hh
+++ b/include/kernels/CPU/sp_gemm.hh
@@ -1,9 +1,9 @@
 #pragma once

-#ifdef CPU_ONEMKL
 #include "../gemm.hh"

 #include <random>
+#include <memory>

 namespace cpu {

@@ -25,21 +25,78 @@
    /** Initialise the required data structures. */
    virtual void initialise(int n, double sparsity, bool binary = false) {
      n_ = n;
+      sparsity_ = sparsity;
+
+      // Note that the below should be the same as the edges calculation
+      // used in the initInputMatricesSparse function.  If changed here,
+      // change there
+      nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity_));

      A_ = (T*)malloc(sizeof(T) * n_ * n_);
      B_ = (T*)malloc(sizeof(T) * n_ * n_);
      C_ = (T*)malloc(sizeof(T) * n_ * n_);

-      initInputMatricesSparse(sparsity);
+      initInputMatricesSparse(sparsity_);
+
+      toCSR();
    }

   private:
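+    // [Editorial example] For a 3x3 matrix {{5,0,0},{0,0,7},{0,2,0}} the
+    // toCSR() below would produce vals = {5, 7, 2}, col_index = {0, 2, 1}
+    // and row_ptr = {0, 1, 2, 3}: row r's non-zeros occupy positions
+    // [row_ptr[r], row_ptr[r + 1]) of the vals/col_index arrays.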
    /** Do any necessary cleanup (free pointers, close library handles, etc.)
     * after Kernel has been called. */
-  void postCallKernelCleanup() {
-    free(A_);
-    free(B_);
-    free(C_);
-  }
+    void postCallKernelCleanup() {
+      free(A_);
+      free(B_);
+      free(C_);
+    }
+
+    void toCSR() {
+      // Move A to CSR
+      A_row_ptr_ = new int[n_ + 1];
+      A_col_index_ = new int[nnz_];
+      A_vals_ = new T[nnz_];
+      int nnz_encountered = 0;
+      for (int row = 0; row < n_; row++) {
+        A_row_ptr_[row] = nnz_encountered;
+        for (int col = 0; col < n_; col++) {
+          if (A_[(row * n_) + col] != 0.0) {
+            A_col_index_[nnz_encountered] = col;
+            A_vals_[nnz_encountered] = A_[(row * n_) + col];
+            nnz_encountered++;
+          }
+        }
+      }
+      // Close off the final row so that row r always spans
+      // [A_row_ptr_[r], A_row_ptr_[r + 1])
+      A_row_ptr_[n_] = nnz_encountered;
+
+      // Move B to CSR
+      B_row_ptr_ = new int[n_ + 1];
+      B_col_index_ = new int[nnz_];
+      B_vals_ = new T[nnz_];
+      nnz_encountered = 0;
+      for (int row = 0; row < n_; row++) {
+        B_row_ptr_[row] = nnz_encountered;
+        for (int col = 0; col < n_; col++) {
+          if (B_[(row * n_) + col] != 0.0) {
+            B_col_index_[nnz_encountered] = col;
+            B_vals_[nnz_encountered] = B_[(row * n_) + col];
+            nnz_encountered++;
+          }
+        }
+      }
+      B_row_ptr_[n_] = nnz_encountered;
+    }
+
+    double sparsity_;
+
+    int nnz_;
+
+    int* A_row_ptr_;
+    int* A_col_index_;
+    int* B_row_ptr_;
+    int* B_col_index_;
+    int* C_row_ptr_;
+    int* C_col_index_;
+    T* A_vals_;
+    T* B_vals_;
+    T* C_vals_;
+
+};
}  // namespace cpu
diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh
index d97fc8c..d357734 100644
--- a/include/kernels/gemm.hh
+++ b/include/kernels/gemm.hh
@@ -91,6 +91,9 @@ class gemm {
     }
   }

+  // Note that the below should be the same as the nnz calculation
+  // used in the cpu initialise functions.  If changed here,
+  // change there
   void initInputMatricesSparse(float sparsity) {
     for (int i = 0; i < (n_ * n_); i++) {
       A_[i] = 0.0;
@@ -200,6 +203,25 @@
     }
     row_ptr[n_row] = (MKL_INT)nnz_encountered;
   }
+#endif
+#ifdef CPU_AOCL
+  void toCSR_aocl(T* dense, int n_col, int n_row, T* vals, aoclsparse_int*
+                  col_index, aoclsparse_int* row_ptr) {
+    int nnz_encountered = 0;
+    for (int row = 0; row < n_row; row++) {
+      row_ptr[row] = (aoclsparse_int)nnz_encountered;
+      int nnz_row = 0;
+      for (int col = 0; col < n_col; col++) {
+        if (dense[(row * n_col) + col] != 0.0) {
+          nnz_row++;
+          col_index[nnz_encountered] = (aoclsparse_int)col;
+          vals[nnz_encountered] = dense[(row * n_col) + col];
+          nnz_encountered++;
+        }
+      }
+    }
+    row_ptr[n_row] = (aoclsparse_int)nnz_encountered;
+  }
+#endif

  /** The number of iterations to perform per problem size.
*/ const int iterations_; From 521cbf3d1f4f5369813732e46be11fd019a09241 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 1 Oct 2024 12:00:19 +0100 Subject: [PATCH 24/38] Working changes --- .DS_Store | Bin 0 -> 8196 bytes .idea/GPU-BLAS-Offload-Benchmark.iml | 2 + .idea/codeStyles/codeStyleConfig.xml | 5 + .idea/misc.xml | 6 + .idea/modules.xml | 8 + .idea/vcs.xml | 6 + .idea/workspace.xml | 541 +++++++++++++++++++++++++++ ArmPL/sp_gemm.hh | 271 ++++++++++++-- DefaultCPU/sp_gemm.hh | 55 --- DefaultGPU/sp_gemm.hh | 54 --- Makefile | 2 +- NVPL/sp_gemv.hh | 117 ------ createGflopsGraphs.py | 5 + cuBLAS/sp_gemm.hh | 9 +- cuBLAS/sp_gemv.hh | 261 +++++++++++++ include/.DS_Store | Bin 0 -> 6148 bytes include/doGemm.hh | 46 ++- include/kernels/.DS_Store | Bin 0 -> 6148 bytes include/kernels/CPU/sp_gemm.hh | 23 +- include/kernels/CPU/sp_gemv.hh | 47 +++ include/kernels/GPU/sp_gemm.hh | 3 +- include/kernels/GPU/sp_gemv.hh | 28 ++ include/kernels/gemm.hh | 4 + include/kernels/gemv.hh | 79 ++++ 24 files changed, 1278 insertions(+), 294 deletions(-) create mode 100644 .DS_Store create mode 100644 .idea/GPU-BLAS-Offload-Benchmark.iml create mode 100644 .idea/codeStyles/codeStyleConfig.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 .idea/workspace.xml delete mode 100644 DefaultCPU/sp_gemm.hh delete mode 100644 DefaultGPU/sp_gemm.hh delete mode 100644 NVPL/sp_gemv.hh create mode 100644 cuBLAS/sp_gemv.hh create mode 100644 include/.DS_Store create mode 100644 include/kernels/.DS_Store create mode 100644 include/kernels/CPU/sp_gemv.hh create mode 100644 include/kernels/GPU/sp_gemv.hh diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5e3f9bcf14470d249e0f7fdd3125325b2078e4a9 GIT binary patch literal 8196 zcmeHMO>7fK6n<lyc%x911A(YNVx^ucnbc_j1vu1k3<v=wibIOjRH-*>5{vbYWjjq# z6v^tL67*83J#*!VfCCrQLnUrVRV9ilZorKzhpMOY%}<<}wVfjZF%#{)*?Hf4Gw;ot zH<>jNvEq6-Ni<GG9v(qQM^N=Cf-jz)lF~h&filQbv^rT{;q^uj(seiwI1e}vI1e}v zI1jun9>AF`iZ|fgS6o*+4>%9JmmU!L!N((LLDP<+GIgMmR{+oqx@AFoR5U<+O$(ZK z6a@!`DN#@*%Jdb3DRK1s8duP?qo@)mrY|2%kIeK9g~`#O-<NP=1w~!$Jm5Ug>j80h z&(H*|QjOZy{XN^dWAf^}R0<*<pwD}B?7_W1{&_fK^TS>FjWy%jz=wH=(jJUkqmZgp zu|}pZAKP4W?5W(s++S*JL%z;;Mt?b-`giJyoSlKN#;0Gz_!*j^i!@8;&qjPj+lKVP z{sV8~e^~?!^PHh3)oCt?ME^jfZPCz@t;e+J*6HxtuYc<Kc1JQT>W{E3l6miATA>O> zsMk?fs14s<q$~J~h~qt^1zLkGRce89^oCf223@l~kXlXHH3p4BN@+Z!^);a74Eih3 zu!eRYO>{e<X-LkkNax|ZvOOyMc{gmZ(Q4>5x*s&6TC1JUKVhkKX3tR8%X%Z;x8*gy zQEpe->#bs?`Hgs6;5-Vp+m&FkR^3=0-9O9YcBK|qn^K?_RsmW1x)z6gqsZ6euq9>7 zis21=!^@)wH#d(==KQ1i>8+f<dHjof&Ob7BY-?-S8~gP0lk>}0qk7D*WBMpepHTFH zdhgaZ(6Y?8L!>EAFpF;nN$}&P6E9TQConsKzd!%cfv0^icA&`6w{(18ZpIOh#iEP3 zdq@Ti1khm$WY`4uGRdI7X>5-yHgSw)jUa=~Y@^vH(6|fQ_QBm(KqvH>UU>HW^xof< zg*~VpKWy?7xuYrpBv6(oSRRAX2tx5JlE5kYipr=buxWmvwrxe~Zy?Q-;L!zy{Z(v< zE3iK5v08+(X>|tL7kd*(dNzR@!lsO&^#YwsCL5WSOr0LKb_3X$`fjJRNZ%%YnC4;M z43(f=*jcAAVWo%wQzDDa&9Sn5^=A$x&}pQACau^yMYOPeMzm;@z3!K9L6_#3>;2Pj z-IeTech<R=YppMJpwyMY9!M+JxVZnH8vOl#umZ<%9&jFb=RLqOrE;kVGq(S8Nm?ZC z+CHA|@Q5OC9Yv)GDtR3TmDh35?hl4&`+y3Wf~Fls(S!2FKLm7tpmgWIJO5<}zL(W& D3B5za literal 0 HcmV?d00001 diff --git a/.idea/GPU-BLAS-Offload-Benchmark.iml b/.idea/GPU-BLAS-Offload-Benchmark.iml new file mode 100644 index 0000000..190534e --- /dev/null +++ b/.idea/GPU-BLAS-Offload-Benchmark.iml @@ -0,0 +1,2 @@ +<?xml version="1.0" encoding="UTF-8"?> +<module classpath="External" external.linked.project.id="GPU-BLAS-Offload-Benchmark" 
external.linked.project.path="$MODULE_DIR$" external.root.project.path="$MODULE_DIR$" external.system.id="Makefile" type="CPP_MODULE" version="4" /> \ No newline at end of file diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml new file mode 100644 index 0000000..a55e7a1 --- /dev/null +++ b/.idea/codeStyles/codeStyleConfig.xml @@ -0,0 +1,5 @@ +<component name="ProjectCodeStyleConfiguration"> + <state> + <option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" /> + </state> +</component> \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..830d3c8 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="MakefileWorkspace"> + <contentRoot DIR="$PROJECT_DIR$" /> + </component> +</project> \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..eff3984 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="ProjectModuleManager"> + <modules> + <module fileurl="file://$PROJECT_DIR$/.idea/GPU-BLAS-Offload-Benchmark.iml" filepath="$PROJECT_DIR$/.idea/GPU-BLAS-Offload-Benchmark.iml" /> + </modules> + </component> +</project> \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="VcsDirectoryMappings"> + <mapping directory="" vcs="Git" /> + </component> +</project> \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..b954508 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,541 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="AutoImportSettings"> + <option name="autoReloadType" value="SELECTIVE" /> + </component> + <component name="CMakeRunConfigurationManager"> + <generated> + <config projectName="GPU-BLAS-Offload-Benchmark" targetName="all" /> + <config projectName="GPU-BLAS-Offload-Benchmark" targetName="gpu-blob" /> + </generated> + </component> + <component name="CMakeSettings"> + <configurations> + <configuration PROFILE_NAME="Debug" ENABLED="true" CONFIG_NAME="Debug" /> + </configurations> + </component> + <component name="ChangeListManager"> + <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding AOCL files" /> + <option name="SHOW_DIALOG" value="false" /> + <option name="HIGHLIGHT_CONFLICTS" value="true" /> + <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> + <option name="LAST_RESOLUTION" value="IGNORE" /> + </component> + <component name="ClangdSettings"> + <option name="clangTidyViaClangd" value="false" /> + <option name="formatViaClangd" value="false" /> + </component> + <component name="ExternalProjectsData"> + <projectState path="$PROJECT_DIR$"> + <ProjectState /> + </projectState> + </component> + <component name="Git.Settings"> + <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" /> + <option name="UPDATE_TYPE" value="REBASE" /> + </component> + <component name="MakefileLocalSettings"> + <option name="availableProjects"> + <map> + <entry> + <key> + <ExternalProjectPojo> + <option name="name" value="GPU-BLAS-Offload-Benchmark" /> + <option name="path" value="$PROJECT_DIR$" /> + </ExternalProjectPojo> + </key> + <value> + 
<list> + <ExternalProjectPojo> + <option name="name" value="GPU-BLAS-Offload-Benchmark" /> + <option name="path" value="$PROJECT_DIR$" /> + </ExternalProjectPojo> + </list> + </value> + </entry> + </map> + </option> + <option name="projectSyncType"> + <map> + <entry key="$PROJECT_DIR$" value="RE_IMPORT" /> + </map> + </option> + </component> + <component name="MarkdownSettingsMigration"> + <option name="stateVersion" value="1" /> + </component> + <component name="OCResolveContextSettings"> + <option name="configuration" value="$PROJECT_DIR$/src/main.cc" /> + </component> + <component name="ProjectApplicationVersion"> + <option name="ide" value="CLion" /> + <option name="majorVersion" value="2023" /> + <option name="minorVersion" value="3" /> + </component> + <component name="ProjectColorInfo">{ + "associatedIndex": 2 +}</component> + <component name="ProjectId" id="2bAwYDqoTyLBV0DE8xYqkQ0FEw0" /> + <component name="ProjectViewState"> + <option name="hideEmptyMiddlePackages" value="true" /> + <option name="showLibraryContents" value="true" /> + </component> + <component name="PropertiesComponent">{ + "keyToString": { + "C/C++ File.main.cc.executor": "Run", + "RunOnceActivity.OpenProjectViewOnStart": "true", + "RunOnceActivity.ShowReadmeOnStart": "true", + "RunOnceActivity.cidr.known.project.marker": "true", + "RunOnceActivity.readMode.enableVisualFormatting": "true", + "cf.advertisement.text.has.clang-format": "true", + "cf.first.check.clang-format": "false", + "cidr.known.project.marker": "true", + "git-widget-placeholder": "sparse", + "last_opened_file_path": "/Users/no22498/Documents/GPU-BLAS-Offload-Benchmark", + "node.js.detected.package.eslint": "true", + "node.js.detected.package.tslint": "true", + "node.js.selected.package.eslint": "(autodetect)", + "node.js.selected.package.tslint": "(autodetect)", + "nodejs_package_manager_path": "npm", + "settings.editor.selected.configurable": "preferences.lookFeel", + "structure.view.defaults.are.configured": "true", + "vue.rearranger.settings.migration": "true" + } +}</component> + <component name="RecentsManager"> + <key name="MoveFile.RECENT_KEYS"> + <recent name="$PROJECT_DIR$/CSV_Results" /> + </key> + </component> + <component name="RunManager" selected="C/C++ File.main.cc"> + <configuration name="all" type="CLionNativeAppRunConfigurationType" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" EMULATE_TERMINAL="false" PASS_PARENT_ENVS_2="true" PROJECT_NAME="GPU-BLAS-Offload-Benchmark" TARGET_NAME="all" CONFIG_NAME="all" version="1"> + <method v="2"> + <option name="CLION.COMPOUND.BUILD" enabled="true" /> + </method> + </configuration> + <configuration name="gpu-blob" type="CLionNativeAppRunConfigurationType" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" EMULATE_TERMINAL="false" PASS_PARENT_ENVS_2="true" PROJECT_NAME="GPU-BLAS-Offload-Benchmark" TARGET_NAME="gpu-blob" CONFIG_NAME="gpu-blob" version="1"> + <method v="2"> + <option name="CLION.COMPOUND.BUILD" enabled="true" /> + </method> + </configuration> + <configuration name="main.cc" type="CppFileRunConfiguration" factoryName="CppFileRunConfiguration" temporary="true" REDIRECT_INPUT="false" ELEVATE="false" USE_EXTERNAL_CONSOLE="false" EMULATE_TERMINAL="false" PASS_PARENT_ENVS_2="true" PROJECT_NAME="GPU-BLAS-Offload-Benchmark" TARGET_NAME="main.cc" CONFIG_NAME="main.cc"> + <option name="sourceFile" value="src/main.cc" /> + <method v="2"> + <option name="com.jetbrains.cidr.cpp.runfile.CppFileBuildBeforeRunTaskProvider$BasicBuildBeforeRunTask" 
enabled="true" /> + </method> + </configuration> + <list> + <item itemvalue="C/C++ File.main.cc" /> + <item itemvalue="Native Application.all" /> + <item itemvalue="Native Application.gpu-blob" /> + </list> + <recent_temporary> + <list> + <item itemvalue="C/C++ File.main.cc" /> + </list> + </recent_temporary> + </component> + <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" /> + <component name="TaskManager"> + <task active="true" id="Default" summary="Default task"> + <changelist id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="" /> + <created>1705671236426</created> + <option name="number" value="Default" /> + <option name="presentableId" value="Default" /> + <updated>1705671236426</updated> + <workItem from="1705671237559" duration="4602000" /> + <workItem from="1706262352145" duration="10830000" /> + <workItem from="1706520146967" duration="113000" /> + <workItem from="1706524361669" duration="11224000" /> + <workItem from="1706871479435" duration="19313000" /> + <workItem from="1707150032379" duration="1154000" /> + <workItem from="1707218344676" duration="510000" /> + <workItem from="1707218861842" duration="7823000" /> + <workItem from="1707568200980" duration="5614000" /> + <workItem from="1708954563821" duration="751000" /> + <workItem from="1708955322064" duration="16518000" /> + <workItem from="1709217936554" duration="14897000" /> + <workItem from="1709904670690" duration="598000" /> + <workItem from="1710146767066" duration="2251000" /> + <workItem from="1710157491483" duration="1263000" /> + <workItem from="1710158763389" duration="2993000" /> + <workItem from="1710161850416" duration="103978000" /> + <workItem from="1711446443157" duration="118701000" /> + <workItem from="1715785109710" duration="13531000" /> + <workItem from="1716389199190" duration="1275000" /> + <workItem from="1716897681894" duration="598000" /> + <workItem from="1716899034743" duration="1217000" /> + <workItem from="1716981059825" duration="14000" /> + <workItem from="1722246444109" duration="2990000" /> + <workItem from="1722496439084" duration="24843000" /> + <workItem from="1723101242209" duration="21225000" /> + <workItem from="1724244974273" duration="40294000" /> + <workItem from="1726568120590" duration="8508000" /> + <workItem from="1726828018604" duration="38592000" /> + </task> + <task id="LOCAL-00001" summary="trivial changes"> + <option name="closed" value="true" /> + <created>1706261672580</created> + <option name="number" value="00001" /> + <option name="presentableId" value="LOCAL-00001" /> + <option name="project" value="LOCAL" /> + <updated>1706261672580</updated> + </task> + <task id="LOCAL-00002" summary="Adding sparse algorithm"> + <option name="closed" value="true" /> + <created>1706568127804</created> + <option name="number" value="00002" /> + <option name="presentableId" value="LOCAL-00002" /> + <option name="project" value="LOCAL" /> + <updated>1706568127804</updated> + </task> + <task id="LOCAL-00003" summary="Integrating algorithm with benchmark"> + <option name="closed" value="true" /> + <created>1706881882900</created> + <option name="number" value="00003" /> + <option name="presentableId" value="LOCAL-00003" /> + <option name="project" value="LOCAL" /> + <updated>1706881882900</updated> + </task> + <task id="LOCAL-00004" summary="Adding commandline options to select only sparse or dense kernels"> + <option name="closed" 
value="true" /> + <created>1707233768599</created> + <option name="number" value="00004" /> + <option name="presentableId" value="LOCAL-00004" /> + <option name="project" value="LOCAL" /> + <updated>1707233768599</updated> + </task> + <task id="LOCAL-00005" summary="Changes"> + <option name="closed" value="true" /> + <created>1709208672718</created> + <option name="number" value="00005" /> + <option name="presentableId" value="LOCAL-00005" /> + <option name="project" value="LOCAL" /> + <updated>1709208672718</updated> + </task> + <task id="LOCAL-00006" summary="Changes"> + <option name="closed" value="true" /> + <created>1709211130948</created> + <option name="number" value="00006" /> + <option name="presentableId" value="LOCAL-00006" /> + <option name="project" value="LOCAL" /> + <updated>1709211130948</updated> + </task> + <task id="LOCAL-00007" summary="Adding sparse kernel to doGemm"> + <option name="closed" value="true" /> + <created>1709217956669</created> + <option name="number" value="00007" /> + <option name="presentableId" value="LOCAL-00007" /> + <option name="project" value="LOCAL" /> + <updated>1709217956669</updated> + </task> + <task id="LOCAL-00008" summary="Adding matrix type enum class"> + <option name="closed" value="true" /> + <created>1709218027209</created> + <option name="number" value="00008" /> + <option name="presentableId" value="LOCAL-00008" /> + <option name="project" value="LOCAL" /> + <updated>1709218027209</updated> + </task> + <task id="LOCAL-00009" summary="changes"> + <option name="closed" value="true" /> + <created>1709368112577</created> + <option name="number" value="00009" /> + <option name="presentableId" value="LOCAL-00009" /> + <option name="project" value="LOCAL" /> + <updated>1709368112577</updated> + </task> + <task id="LOCAL-00010" summary="changes"> + <option name="closed" value="true" /> + <created>1709368228167</created> + <option name="number" value="00010" /> + <option name="presentableId" value="LOCAL-00010" /> + <option name="project" value="LOCAL" /> + <updated>1709368228167</updated> + </task> + <task id="LOCAL-00011" summary="adding command line kernel selection"> + <option name="closed" value="true" /> + <created>1709582619984</created> + <option name="number" value="00011" /> + <option name="presentableId" value="LOCAL-00011" /> + <option name="project" value="LOCAL" /> + <updated>1709582619984</updated> + </task> + <task id="LOCAL-00012" summary="adding command line kernel selection"> + <option name="closed" value="true" /> + <created>1710157174669</created> + <option name="number" value="00012" /> + <option name="presentableId" value="LOCAL-00012" /> + <option name="project" value="LOCAL" /> + <updated>1710157174669</updated> + </task> + <task id="LOCAL-00013" summary="Adding basic sparse multiplication kernel for default CPU and GPU"> + <option name="closed" value="true" /> + <created>1710172355530</created> + <option name="number" value="00013" /> + <option name="presentableId" value="LOCAL-00013" /> + <option name="project" value="LOCAL" /> + <updated>1710172355530</updated> + </task> + <task id="LOCAL-00014" summary="Implementing cuSPARSE kernel"> + <option name="closed" value="true" /> + <created>1710337387217</created> + <option name="number" value="00014" /> + <option name="presentableId" value="LOCAL-00014" /> + <option name="project" value="LOCAL" /> + <updated>1710337387217</updated> + </task> + <task id="LOCAL-00015" summary="Trying to work out CSR malloc bug"> + <option name="closed" value="true" /> + 
<created>1710338720376</created> + <option name="number" value="00015" /> + <option name="presentableId" value="LOCAL-00015" /> + <option name="project" value="LOCAL" /> + <updated>1710338720376</updated> + </task> + <task id="LOCAL-00016" summary="Trying to work out CSR malloc bug"> + <option name="closed" value="true" /> + <created>1710338867534</created> + <option name="number" value="00016" /> + <option name="presentableId" value="LOCAL-00016" /> + <option name="project" value="LOCAL" /> + <updated>1710338867534</updated> + </task> + <task id="LOCAL-00017" summary="cuSPARSE unified memory implementation"> + <option name="closed" value="true" /> + <created>1710853559721</created> + <option name="number" value="00017" /> + <option name="presentableId" value="LOCAL-00017" /> + <option name="project" value="LOCAL" /> + <updated>1710853559721</updated> + </task> + <task id="LOCAL-00018" summary="Now compiles"> + <option name="closed" value="true" /> + <created>1711026531002</created> + <option name="number" value="00018" /> + <option name="presentableId" value="LOCAL-00018" /> + <option name="project" value="LOCAL" /> + <updated>1711026531002</updated> + </task> + <task id="LOCAL-00019" summary="Now compiles"> + <option name="closed" value="true" /> + <created>1711026902576</created> + <option name="number" value="00019" /> + <option name="presentableId" value="LOCAL-00019" /> + <option name="project" value="LOCAL" /> + <updated>1711026902576</updated> + </task> + <task id="LOCAL-00020" summary="Now compiles with fewer runtime errors"> + <option name="closed" value="true" /> + <created>1711361513432</created> + <option name="number" value="00020" /> + <option name="presentableId" value="LOCAL-00020" /> + <option name="project" value="LOCAL" /> + <updated>1711361513432</updated> + </task> + <task id="LOCAL-00021" summary="Implementing other offload types - still some runtime errors"> + <option name="closed" value="true" /> + <created>1711453016707</created> + <option name="number" value="00021" /> + <option name="presentableId" value="LOCAL-00021" /> + <option name="project" value="LOCAL" /> + <updated>1711453016707</updated> + </task> + <task id="LOCAL-00022" summary="All implemented and running. 
No checksum at the end"> + <option name="closed" value="true" /> + <created>1711457712445</created> + <option name="number" value="00022" /> + <option name="presentableId" value="LOCAL-00022" /> + <option name="project" value="LOCAL" /> + <updated>1711457712445</updated> + </task> + <task id="LOCAL-00023" summary="Removing print statements"> + <option name="closed" value="true" /> + <created>1711457867311</created> + <option name="number" value="00023" /> + <option name="presentableId" value="LOCAL-00023" /> + <option name="project" value="LOCAL" /> + <updated>1711457867311</updated> + </task> + <task id="LOCAL-00024" summary="All three offload types working for large problem sizes"> + <option name="closed" value="true" /> + <created>1711715754311</created> + <option name="number" value="00024" /> + <option name="presentableId" value="LOCAL-00024" /> + <option name="project" value="LOCAL" /> + <updated>1711715754311</updated> + </task> + <task id="LOCAL-00025" summary="Removing print statements"> + <option name="closed" value="true" /> + <created>1711715920815</created> + <option name="number" value="00025" /> + <option name="presentableId" value="LOCAL-00025" /> + <option name="project" value="LOCAL" /> + <updated>1711715920815</updated> + </task> + <task id="LOCAL-00026" summary="Superficial changes"> + <option name="closed" value="true" /> + <created>1711961476350</created> + <option name="number" value="00026" /> + <option name="presentableId" value="LOCAL-00026" /> + <option name="project" value="LOCAL" /> + <updated>1711961476350</updated> + </task> + <task id="LOCAL-00027" summary="rebasing"> + <option name="closed" value="true" /> + <created>1711961618074</created> + <option name="number" value="00027" /> + <option name="presentableId" value="LOCAL-00027" /> + <option name="project" value="LOCAL" /> + <updated>1711961618074</updated> + </task> + <task id="LOCAL-00028" summary="rebasing"> + <option name="closed" value="true" /> + <created>1711961836984</created> + <option name="number" value="00028" /> + <option name="presentableId" value="LOCAL-00028" /> + <option name="project" value="LOCAL" /> + <updated>1711961836984</updated> + </task> + <task id="LOCAL-00029" summary="rebasing"> + <option name="closed" value="true" /> + <created>1711961942373</created> + <option name="number" value="00029" /> + <option name="presentableId" value="LOCAL-00029" /> + <option name="project" value="LOCAL" /> + <updated>1711961942374</updated> + </task> + <task id="LOCAL-00030" summary="Fixing after rebase"> + <option name="closed" value="true" /> + <created>1712057111636</created> + <option name="number" value="00030" /> + <option name="presentableId" value="LOCAL-00030" /> + <option name="project" value="LOCAL" /> + <updated>1712057111636</updated> + </task> + <task id="LOCAL-00031" summary="Tidying up spGEMM classes to remove duplicated code"> + <option name="closed" value="true" /> + <created>1712136173732</created> + <option name="number" value="00031" /> + <option name="presentableId" value="LOCAL-00031" /> + <option name="project" value="LOCAL" /> + <updated>1712136173732</updated> + </task> + <task id="LOCAL-00032" summary="Fixing py script to accomodate new kernels"> + <option name="closed" value="true" /> + <created>1712141872451</created> + <option name="number" value="00032" /> + <option name="presentableId" value="LOCAL-00032" /> + <option name="project" value="LOCAL" /> + <updated>1712141872451</updated> + </task> + <task id="LOCAL-00033" summary="Fixing memory bug. 
Implementing --kernels flag"> + <option name="closed" value="true" /> + <created>1712153668999</created> + <option name="number" value="00033" /> + <option name="presentableId" value="LOCAL-00033" /> + <option name="project" value="LOCAL" /> + <updated>1712153668999</updated> + </task> + <task id="LOCAL-00034" summary="Getting rid of print statements"> + <option name="closed" value="true" /> + <created>1712222760735</created> + <option name="number" value="00034" /> + <option name="presentableId" value="LOCAL-00034" /> + <option name="project" value="LOCAL" /> + <updated>1712222760735</updated> + </task> + <task id="LOCAL-00035" summary="WIP"> + <option name="closed" value="true" /> + <created>1712311301376</created> + <option name="number" value="00035" /> + <option name="presentableId" value="LOCAL-00035" /> + <option name="project" value="LOCAL" /> + <updated>1712311301376</updated> + </task> + <task id="LOCAL-00036" summary="Finalising"> + <option name="closed" value="true" /> + <created>1713959722407</created> + <option name="number" value="00036" /> + <option name="presentableId" value="LOCAL-00036" /> + <option name="project" value="LOCAL" /> + <updated>1713959722407</updated> + </task> + <task id="LOCAL-00037" summary="Rebasing"> + <option name="closed" value="true" /> + <created>1715161012243</created> + <option name="number" value="00037" /> + <option name="presentableId" value="LOCAL-00037" /> + <option name="project" value="LOCAL" /> + <updated>1715161012243</updated> + </task> + <task id="LOCAL-00038" summary="Rebasing"> + <option name="closed" value="true" /> + <created>1715161090646</created> + <option name="number" value="00038" /> + <option name="presentableId" value="LOCAL-00038" /> + <option name="project" value="LOCAL" /> + <updated>1715161090646</updated> + </task> + <task id="LOCAL-00039" summary="Adding AOCL files"> + <option name="closed" value="true" /> + <created>1716198459677</created> + <option name="number" value="00039" /> + <option name="presentableId" value="LOCAL-00039" /> + <option name="project" value="LOCAL" /> + <updated>1716198459677</updated> + </task> + <task id="LOCAL-00040" summary="Adding AOCL files"> + <option name="closed" value="true" /> + <created>1724234752813</created> + <option name="number" value="00040" /> + <option name="presentableId" value="LOCAL-00040" /> + <option name="project" value="LOCAL" /> + <updated>1724234752813</updated> + </task> + <option name="localTasksCounter" value="41" /> + <servers /> + </component> + <component name="TypeScriptGeneratedFilesManager"> + <option name="version" value="3" /> + </component> + <component name="Vcs.Log.Tabs.Properties"> + <option name="TAB_STATES"> + <map> + <entry key="MAIN"> + <value> + <State /> + </value> + </entry> + </map> + </option> + </component> + <component name="VcsManagerConfiguration"> + <MESSAGE value="Adding sparse kernel to doGemm" /> + <MESSAGE value="Adding matrix type enum class" /> + <MESSAGE value="changes" /> + <MESSAGE value="adding command line kernel selection" /> + <MESSAGE value="Adding basic sparse multiplication kernel for default CPU and GPU" /> + <MESSAGE value="Implementing cuSPARSE kernel" /> + <MESSAGE value="Trying to work out CSR malloc bug" /> + <MESSAGE value="cuSPARSE unified memory implementation" /> + <MESSAGE value="Now compiles" /> + <MESSAGE value="Now compiles with fewer runtime errors" /> + <MESSAGE value="Implementing other offload types - still some runtime errors" /> + <MESSAGE value="All implemented and running. 
No checksum at the end" /> + <MESSAGE value="All three offload types working for large problem sizes" /> + <MESSAGE value="Removing print statements" /> + <MESSAGE value="Superficial changes" /> + <MESSAGE value="rebasing" /> + <MESSAGE value="Fixing after rebase" /> + <MESSAGE value="Tidying up spGEMM classes to remove duplicated code" /> + <MESSAGE value="Fixing py script to accomodate new kernels" /> + <MESSAGE value="Fixing memory bug. Implementing --kernels flag" /> + <MESSAGE value="Getting rid of print statements" /> + <MESSAGE value="WIP" /> + <MESSAGE value="Finalising" /> + <MESSAGE value="Rebasing" /> + <MESSAGE value="Adding AOCL files" /> + <option name="LAST_COMMIT_MESSAGE" value="Adding AOCL files" /> + </component> +</project> \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index aba5814..47b0bf9 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -16,7 +16,7 @@ namespace cpu { template <typename T> class sp_gemm_cpu : public sp_gemm<T> { public: - using sp_gemm<T>::gemm; + using sp_gemm<T>::sp_gemm; using sp_gemm<T>::callConsume; using sp_gemm<T>::m_; using sp_gemm<T>::n_; @@ -24,6 +24,7 @@ class sp_gemm_cpu : public sp_gemm<T> { using sp_gemm<T>::A_; using sp_gemm<T>::B_; using sp_gemm<T>::C_; + using sp_gemm<T>::nnz_; private: /** Make call to the GEMM kernel. */ @@ -52,22 +53,23 @@ class sp_gemm_cpu : public sp_gemm<T> { - if (std::is_same_v<T, float>) { - status_ = armpl_spmm_exec_s(transA, - transB, + if constexpr (std::is_same_v<T, float>) { + status_ = armpl_spmm_exec_s(transA_, + transB_, alpha, - A_armpl_, - B_armpl, + *A_armpl_, + *B_armpl_, beta, - C_armpl_); + *B_armpl_); } else if constexpr (std::is_same_v<T, double>) { - status_ = armpl_spmm_exec_d(transA, - transB, + std::cout << "About to execute dgemm" << std::endl; + status_ = armpl_spmm_exec_d(transA_, + transB_, alpha, - A_armpl_, - B_armpl, + *A_armpl_, + *B_armpl_, beta, - C_armpl_); + *B_armpl_); } else { // Un-specialised class will not do any work - print error and exit. std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." @@ -85,26 +87,42 @@ class sp_gemm_cpu : public sp_gemm<T> { /** Perform any required steps before calling the GEMM kernel that should * be timed. */ - void preLoopRequirements() override {} + void preLoopRequirements() override { + // Need to put A_ and B_ into A_armpl_ and B_armpl_ + // ToDo -- Error catching + toCSR_armpl(); +// std::cout << "toCSR_armpl() wrapped up without a problem" << std::endl; + } /** Perform any required steps after calling the GEMM kernel that should * be timed. 
*/ void postLoopRequirements() override { - status_ = armpl_spmat_destroy(A_armpl_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - status_ = armpl_spmat_destroy(B_armpl_); + status_ = armpl_spmat_destroy(*A_armpl_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } - status_ = armpl_spmat_destroy(C_armpl_); + status_ = armpl_spmat_destroy(*B_armpl_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } +// status_ = armpl_spmat_destroy(*C_armpl_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } + +// delete [] A_armpl_row_ptr_; +// delete [] A_armpl_col_index_; +// delete [] A_vals_; +// delete [] B_armpl_row_ptr_; +// delete [] B_armpl_col_index_; +// delete [] B_vals_; +// delete [] C_armpl_row_ptr_; +// delete [] C_armpl_col_index_; +// delete [] C_vals_; + } /** The constant value Alpha. */ @@ -117,8 +135,7 @@ class sp_gemm_cpu : public sp_gemm<T> { armpl_spmat_t armpl_A, armpl_B, armpl_C; - @override - void toCSR() { + void toCSR_armpl() { n_armpl_ = n_; // ToDo -- check whether flags_ is correct! flags_ = 0; @@ -127,85 +144,265 @@ class sp_gemm_cpu : public sp_gemm<T> { A_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; A_armpl_col_index_ = new armpl_int_t[nnz_]; A_vals_ = new T[nnz_]; + A_armpl_row_ptr_[0] = 0; + int nnz_encountered = 0; +// std::cout << "About to load A into csr" << std::endl; for (int row = 0; row < n_; row++) { - A_armpl_row_ptr_[row] = nnz_encountered; +// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl; + A_armpl_row_ptr_[row + 1] = nnz_encountered; for (int col = 0; col < n_; col++) { if (A_[(row * n_) + col] != 0.0) { +// std::cout << "\t\tCol " << col << " = " << A_[(row * n_) + col] << +// std::endl; A_armpl_col_index_[nnz_encountered] = col; - A_vals_[nnz_encountered] = A_[(row * n_) + col]; + A_vals_[nnz_encountered] = static_cast<T>(A_[(row * n_) + col]); nnz_encountered++; +// std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl; } } } +// std::cout << "___A =" << std::endl << "\t\t["; +// for (int i = 0; i < (n_ + 1); i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << A_armpl_row_ptr_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << A_armpl_col_index_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << A_vals_[i]; +// } +// std::cout << "]" << std::endl; + + +// std::cout << "About to load B into csr" << std::endl; + // Move B to CSR B_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; B_armpl_col_index_ = new armpl_int_t[nnz_]; B_vals_ = new T[nnz_]; + B_armpl_row_ptr_[0] = 0; + nnz_encountered = 0; for (int row = 0; row < n_; row++) { - B_armpl_row_ptr_[row] = nnz_encountered; +// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << +// std::endl; + B_armpl_row_ptr_[row + 1] = nnz_encountered; for (int col = 0; col < n_; col++) { if (B_[(row * n_) + col] != 0.0) { +// std::cout << "\t\tCol " << col << " = " << B_[(row * n_) + col] << std::endl; B_armpl_col_index_[nnz_encountered] = col; - B_vals_[nnz_encountered] = B_[(row * n_) + col]; + B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]); nnz_encountered++; +// std::cout << "\tnnz_encountered = " << 
nnz_encountered << std::endl; } } } +// std::cout << "___B =" << std::endl << "\t\t["; +// for (int i = 0; i < (n_ + 1); i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << B_armpl_row_ptr_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << B_armpl_col_index_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << B_vals_[i]; +// } +// std::cout << "]" << std::endl; + + +// // Move B to CSR +// C_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; +// C_armpl_col_index_ = new armpl_int_t[nnz_]; +// C_vals_ = new T[nnz_]; +// C_armpl_row_ptr_[0] = 0; +// +// nnz_encountered = 0; +//// std::cout << "About to load C into csr" << std::endl; +// for (int row = 0; row < n_; row++) { +//// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl; +// C_armpl_row_ptr_[row + 1] = nnz_encountered; +// for (int col = 0; col < n_; col++) { +// if (A_[(row * n_) + col] != 0.0) { +// C_armpl_col_index_[nnz_encountered] = col; +// C_vals_[nnz_encountered] = A_[(row * n_) + col]; +// nnz_encountered++; +//// std::cout << "\t\tCol " << col << " = " << C_vals_[nnz_encountered] << +//// std::endl; +//// std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl; +// } +// } +// } + +// std::cout << "___C =" << std::endl << "\t\t["; +// for (int i = 0; i < (n_ + 1); i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << C_armpl_row_ptr_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << C_armpl_col_index_[i]; +// } +// std::cout << "]" << std::endl << "\t\t["; +// for (int i = 0; i < nnz_; i++) { +// if (i != 0) { +// std::cout << ", "; +// } +// std::cout << C_vals_[i]; +// } +// std::cout << "]" << std::endl; + + + +// std::cout << "Loading csr A into armpl storage formats" << std::endl; + if constexpr (std::is_same_v<T, float>) { + std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl; + std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof + (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0]; + for (int i = 1; i < (n_ + 1); i++) { + std::cout << ", " << A_armpl_row_ptr_[i]; + } + std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " << + sizeof(A_armpl_col_index_[0]) << ") = [" << + A_armpl_col_index_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_armpl_col_index_[i]; + } + std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof + (A_vals_[0]) << ") = [" << A_vals_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_vals_[i]; + } + std::cout << "]" << std::endl << "flags: " << flags_ << std::endl; - if (std::is_sam_v<T, float>) { status_ = armpl_spmat_create_csr_s(A_armpl_, n_armpl_, n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } +// std::cout << "Loading csr C into armpl storage formats" << std::endl; +// status_ = armpl_spmat_create_csr_s(C_armpl_, +// n_armpl_, +// n_armpl_, +// C_armpl_row_ptr_, +// C_armpl_col_index_, +// C_vals_, +// flags_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } + +// std::cout << "Loading csr B into armpl storage formats" << std::endl; status_ = 
armpl_spmat_create_csr_s(B_armpl_, n_armpl_, n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } - } else if (std::is_same_v<T, double>) { + } else if constexpr (std::is_same_v<T, double>) { + std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl; + std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof + (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0]; + for (int i = 1; i < (n_ + 1); i++) { + std::cout << ", " << A_armpl_row_ptr_[i]; + } + std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " << + sizeof(A_armpl_col_index_[0]) << ") = [" << + A_armpl_col_index_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_armpl_col_index_[i]; + } + std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof + (A_vals_[0]) << ") = [" << A_vals_[0]; + for (int i = 1; i < nnz_; i++) { + std::cout << ", " << A_vals_[i]; + } + std::cout << "]" << std::endl << "flags: " << flags_ << std::endl; + + + std::cout << "About to create CSR A (double)" << std::endl; status_ = armpl_spmat_create_csr_d(A_armpl_, n_armpl_, n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } +// std::cout << "Loading csr C into armpl storage formats" << std::endl; +// status_ = armpl_spmat_create_csr_d(C_armpl_, +// n_armpl_, +// n_armpl_, +// C_armpl_row_ptr_, +// C_armpl_col_index_, +// C_vals_, +// flags_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } + +// std::cout << "Loading csr B into armpl storage formats" << std::endl; + std::cout << "About to create CSR B (double)" << std::endl; status_ = armpl_spmat_create_csr_d(B_armpl_, n_armpl_, n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, - flags); + flags_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } } - +// std::cout << "Okay, all matrices made!!" << std::endl; } armpl_int_t flags_; @@ -219,12 +416,16 @@ class sp_gemm_cpu : public sp_gemm<T> { armpl_int_t* C_armpl_row_ptr_; armpl_int_t* C_armpl_col_index_; + T* A_vals_; + T* B_vals_; + T* C_vals_; + armpl_spmat_t* A_armpl_; armpl_spmat_t* B_armpl_; armpl_spmat_t* C_armpl_; - sparse_hint_value transA = ARMPL_SPARSE_OPERATION_NOTRANS; - sparse_hint_value transB = ARMPL_SPARSE_OPERATION_NOTRANS; + armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS; + armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS; }; } // namespace cpu diff --git a/DefaultCPU/sp_gemm.hh b/DefaultCPU/sp_gemm.hh deleted file mode 100644 index d7ecb37..0000000 --- a/DefaultCPU/sp_gemm.hh +++ /dev/null @@ -1,55 +0,0 @@ -#pragma once - -#if defined CPU_DEFAULT - -#include "../include/kernels/CPU/sp_gemm.hh" -#include "../include/utilities.hh" - -namespace cpu { -/** A class for GEMM CPU BLAS kernels. */ -template <typename T> -class sp_gemm_cpu : public sp_gemm<T> { - public: - using sp_gemm<T>::sp_gemm; - using sp_gemm<T>::callConsume; - using sp_gemm<T>::m_; - using sp_gemm<T>::n_; - using sp_gemm<T>::k_; - using sp_gemm<T>::A_; - using sp_gemm<T>::B_; - using sp_gemm<T>::C_; - - private: - /** Perform the GEMM kernel. */ - void callGemm() override { - /** A naive implementation of a column-major GEMM. Alpha and Beta are always - * 1 and 0 respectively. - * Operation takes the form of C[M,N] = A[M,K] * B[K,N]. 
- * callConsume() is required to ensure that the compiler does not optimise - * away this function. */ - int x, y, z; - T acc; - for (x = 0; x < m_; x++) { - for (y = 0; y < n_; y++) { - acc = 0.0; - for (z = 0; z < k_; z++) { - acc += A_[z * m_ + x] * B_[y * k_ + z]; - } - C_[y * m_ + x] = acc; - } - } - // Ensure compiler doesn't optimise away the work being done - callConsume(); - } - - /** Perform any required steps before calling the GEMM kernel that should - * be timed. */ - void preLoopRequirements() override {} - - /** Perform any required steps after calling the GEMM kernel that should - * be timed. */ - void postLoopRequirements() override {} -}; - -} // namespace cpu -#endif diff --git a/DefaultGPU/sp_gemm.hh b/DefaultGPU/sp_gemm.hh deleted file mode 100644 index 2a9f478..0000000 --- a/DefaultGPU/sp_gemm.hh +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#if defined GPU_DEFAULT - -#include <cmath> - -#include "../include/kernels/GPU/sp_gemm.hh" -#include "../include/utilities.hh" - -namespace gpu { -/** A class for GEMM GPU BLAS kernels. */ -template <typename T> -class sp_gemm_gpu : public sp_gemm<T> { - public: - using sp_gemm<T>::sp_gemm; - - /** Call the BLAS kernel n times, with 1 warmup run. - * Returns the time elapsed for n BLAS calls in seconds. */ - time_checksum_gflop compute() { - // Override function in base `kernel` class as DefaultGPU should do nothing. - return {INFINITY, INFINITY, 0.0}; - } - - /** Initialise the required data structures. */ - void initialise(gpuOffloadType offload, int n, float sparsity) override { - // Default GPU implementation - do nothing. - } - - private: - /** Make a call to the BLAS Library Kernel. */ - void callGemm() override { - // Default GPU implementation - do nothing. - } - - /** Perform any required steps before calling the GEMM kernel that should - * be timed. */ - void preLoopRequirements() override { - // Default GPU implementation - do nothing. - } - - /** Perform any required steps after calling the GEMM kernel that should - * be timed. */ - void postLoopRequirements() override { - // Default GPU implementation - do nothing. - } - - /** Do any necessary cleanup (free pointers, close library handles, etc.) - * after Kernel has been called. */ - void postCallKernelCleanup() override { - // Default GPU implementation - do nothing. - } -}; -} // namespace gpu -#endif \ No newline at end of file diff --git a/Makefile b/Makefile index bff0add..e5091e0 100644 --- a/Makefile +++ b/Makefile @@ -170,7 +170,7 @@ $(warning GPU_LIB not set (use CUBLAS, ONEMKL, ROCBLAS). No GPU kernels will be else ifeq ($(GPU_LIB), CUBLAS) # Do cuBLAS stuff ifeq ($(COMPILER), NVIDIA) -override CXXFLAGS += -cudalib=cublas +override CXXFLAGS += -cudalib=cublas -lcusparse_static else $(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):) $(info $(TAB)$(TAB)Add `CXXFLAGS=-L<NVHPC_DIR>/.../math_libs/lib64 -L<NVHPC_DIR>/.../cuda/lib64` to make command) diff --git a/NVPL/sp_gemv.hh b/NVPL/sp_gemv.hh deleted file mode 100644 index d04f6b8..0000000 --- a/NVPL/sp_gemv.hh +++ /dev/null @@ -1,117 +0,0 @@ -/** - * ToDo -- This is all currently written for GEMM, but NVPL does not support - * GEMM, so this needs to be adjusted to spmv -- which is supported - */ - - - - - -#pragma once - -#ifdef CPU_NVPL -#include <nvpl_sparse.h> - -#include "../include/kernels/CPU/gemm.hh" -#include "../include/utilities.hh" - -namespace cpu { -/** A class for GEMM CPU BLAS kernels. 
*/ -template <typename T> -class sp_gemm_cpu : public sp_gemm<T> { - public: - using sp_gemm<T>::gemm; - using sp_gemm<T>::callConsume; - using sp_gemm<T>::m_; - using sp_gemm<T>::n_; - using sp_gemm<T>::k_; - using sp_gemm<T>::A_; - using sp_gemm<T>::B_; - using sp_gemm<T>::C_; - - private: - /** Make call to the GEMM kernel. */ - void callGemm() override { - - // Ensure compiler doesn't optimise away the work being done - callConsume(); - } - - /** Perform any required steps before calling the GEMM kernel that should - * be timed. */ - void preLoopRequirements() override { - // Set type enum - if constexpr (std::is_same_v<T, float>) { - type_ = NVPL_SPARSE_R_32F; - } else if constexpr (std::is_same_v<T, double>) { - type_ = NVPL_SPARSE_R_64F; - } else { - // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for NVPL sparse GEMM kernel not supported." - << std::endl; - exit(1); - } - status_ = nvpl_sparse_create(&handle_); - // Todo -- error check - - // Todo -- Make const? - status_ = nvpl_sparse_create_csr(A_nvpl_, n_, n_, nnz_, A_row_ptr_nvpl_, - A_col_index_nvpl_, A_vals_nvpl_, - index_type_, index_type_, base_, type_); - - status_ = nvpl_sparse_create_csr(B_nvpl_, n_, n_, nnz_, B_row_ptr_nvpl_, - B_col_index_nvpl_, B_vals_nvpl_, - index_type_, index_type_, base_, type_); - // Todo -- error check - - - } - - /** Perform any required steps after calling the GEMM kernel that should - * be timed. */ - void postLoopRequirements() override { - status_ = nvpl_sparse_destroy(handle_); - // Todo -- error check - status_ = nvpl_sparse_destroy_sp_mat(A_nvpl_); - status_ = nvpl_sparse_destroy_sp_mat(B_nvpl_); - status_ = nvpl_sparse_destroy_sp_mat(C_nvpl_); - } - - /** The constant value Alpha. */ - T alpha = ALPHA; - - /** The constant value Beta. 
*/ - T beta = BETA; - - /** - * Sparse metadata - */ - nvpl_sparse_status_t status_; - nvpl_sparse_handle_t handle_; - nvpl_sparse_data_type_t type_; - - nvpl_sparse_operation_t op_ = NVPL_SPARSE_OPERATION_NON_TRANSPOSE; - nvpl_sparse_index_base_t base_ = NVPL_SPARSE_INDEX_BASE_ZERO; - nvpl_sparse_format_t format_ = NVPL_SPARSE_FORMAT_CSR; - nvpl_sparse_order_t order_ = NVPL_SPARSE_ORDER_COL; - nvpl_sparse_index_type_t index_type_ = NVPL_SPARSE_INDEX_64I; - - /** - * Sparse matrix descriptors - */ - nvpl_sparse_sp_mat_descr_t* A_nvpl_; - nvpl_sparse_sp_mat_descr_t* B_nvpl_; - nvpl_sparse_sp_mat_descr_t* C_nvpl_; - - void* A_row_ptr_nvpl_; - void* B_row_ptr_nvpl_; - void* C_row_ptr_nvpl_; - void* A_col_idnex_nvpl_; - void* B_col_idnex_nvpl_; - void* C_col_idnex_nvpl_; - void* A_vals_nvpl_; - void* B_vals_nvpl_; - void* C_vals_nvpl_; -}; -} // namespace cpu -#endif \ No newline at end of file diff --git a/createGflopsGraphs.py b/createGflopsGraphs.py index d323162..07ac243 100644 --- a/createGflopsGraphs.py +++ b/createGflopsGraphs.py @@ -123,6 +123,11 @@ inputTypeStr = "Square x Short-Wide (M=K=32, N)" for j in range(0, len(mnk)): xVals.append(mnk[j][1]) + elif "_sparse_square" in gemmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Sparse square matrices" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) else: # File not supported so go to next file continue diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/sp_gemm.hh index d849d22..b5e8d93 100644 --- a/cuBLAS/sp_gemm.hh +++ b/cuBLAS/sp_gemm.hh @@ -1,8 +1,7 @@ #pragma once #ifdef GPU_CUBLAS -#include "cusparse.h" -#include <cublas_v2.h> +#include <cusparse_v2.h> #include <cuda_runtime.h> #include <type_traits> #include <random> @@ -13,13 +12,13 @@ #include "common.hh" namespace gpu { -/** A class for GEMM GPU BLAS kernels. */ +/** A class for sparse GEMM GPU BLAS kernels. */ template <typename T> class sp_gemm_gpu : public sp_gemm<T> { public: using sp_gemm<T>::sp_gemm; using sp_gemm<T>::initInputMatricesSparse; - using sp_gemm<T>::toCSR; + using sp_gemm<T>::toCSR_int; using sp_gemm<T>::n_; using sp_gemm<T>::A_; using sp_gemm<T>::B_; @@ -44,7 +43,7 @@ class sp_gemm_gpu : public sp_gemm<T> { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } - n_ = 100 * n; + n_ = n; // Get device identifier cudaCheckError(cudaGetDevice(&gpuDevice_)); diff --git a/cuBLAS/sp_gemv.hh b/cuBLAS/sp_gemv.hh new file mode 100644 index 0000000..8027746 --- /dev/null +++ b/cuBLAS/sp_gemv.hh @@ -0,0 +1,261 @@ +//#pragma once +// +//#ifdef GPU_CUBLAS +//#include <cusparse_v2.h> +//#include <cuda.h> +//#include <cublas_v2.h> +//#include <cuda_runtime.h> +//#include <type_traits> +//#include <random> +//#include <iostream> +// +//#include "../include/kernels/GPU/sp_gemv.hh" +//#include "../include/utilities.hh" +//#include "common.hh" +// +//namespace gpu { +///** A class for sparse GEMV GPU BLAS kernels. 
*/ +//template <typename T> +//class gemv_gpu : public gemv<T> { +// public: +// using gemv<T>::gemv; +// using gemv<T>::initInputMatrixVector; +// using gemv<T>::m_; +// using gemv<T>::n_; +// using gemv<T>::A_; +// using gemv<T>::x_; +// using gemv<T>::y_; +// using gemv<T>::offload_; +// using gemv<T>::vecIncrement_; +// +// ~gemv_gpu() { +// if (alreadyInitialised_) { +// // Destroy the handle +// cublasCheckError(cublasDestroy(handle_)); +// +// // Destroy streams after use +// cudaCheckError(cudaStreamDestroy(s1_)); +// cudaCheckError(cudaStreamDestroy(s2_)); +// cudaCheckError(cudaStreamDestroy(s3_)); +// } +// } +// +// /** Initialise the required data structures. +// * `offload` refers to the data offload type: +// * - Once: Move data from host to device before all iterations & move from +// * device to host after all iterations +// * - Always: Move data from host to device and device to host each iteration +// * - Unified: Initialise data as unified memory; no data movement semantics +// * required */ +// void initialise(gpuOffloadType offload, int m, int n) override { +// if (!alreadyInitialised_) { +// alreadyInitialised_ = true; +// // Perform set-up which doesn't need to happen every problem size change. +// // Create a handle for CUBLAS +// cublasCheckError(cublasCreate(&handle_)); +// +// // Get device identifier +// cudaCheckError(cudaGetDevice(&gpuDevice_)); +// +// // Initialise 3 streams to asynchronously move data between host and +// // device +// cudaCheckError(cudaStreamCreate(&s1_)); +// cudaCheckError(cudaStreamCreate(&s2_)); +// cudaCheckError(cudaStreamCreate(&s3_)); +// } +// +// offload_ = offload; +// m_ = m; +// n_ = n; +// +// if (offload_ == gpuOffloadType::unified) { +// cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * n_)); +// cudaCheckError(cudaMallocManaged(&x_, sizeof(T) * n_)); +// cudaCheckError(cudaMallocManaged(&y_, sizeof(T) * m_)); +// } else { +// // Allocate matrices on host +// cudaCheckError(cudaMallocHost((void**)&A_, sizeof(T) * m_ * n_)); +// cudaCheckError(cudaMallocHost((void**)&x_, sizeof(T) * n_)); +// cudaCheckError(cudaMallocHost((void**)&y_, sizeof(T) * m_)); +// // Allocate matrices on device +// cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * n_)); +// cudaCheckError(cudaMalloc((void**)&x_device_, sizeof(T) * n_)); +// cudaCheckError(cudaMalloc((void**)&y_device_, sizeof(T) * m_)); +// } +// +// // Initialise the host data structures +// initInputMatrixVector(); +// } +// +// private: +// /** Perform any required steps before calling the GEMV kernel that should +// * be timed. */ +// void preLoopRequirements() override { +// switch (offload_) { +// case gpuOffloadType::always: { +// // Offload data each iteration - no requirements +// break; +// } +// case gpuOffloadType::once: { +// // Offload input data from host to the device. 
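+//        // (The three copies below are issued on separate streams s1_, s2_
+//        // and s3_, so they may overlap. The later cuBLAS call runs on the
+//        // legacy default stream, which implicitly waits for work in
+//        // blocking streams created with cudaStreamCreate, so no explicit
+//        // synchronisation is required before callGemv().)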
+// cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, +// cudaMemcpyHostToDevice, s1_)); +// cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_, +// cudaMemcpyHostToDevice, s2_)); +// cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_, +// cudaMemcpyHostToDevice, s3_)); +// break; +// } +// case gpuOffloadType::unified: { +// // Prefetch input data to device +// cudaCheckError( +// cudaMemPrefetchAsync(A_, sizeof(T) * m_ * n_, gpuDevice_, s1_)); +// cudaCheckError( +// cudaMemPrefetchAsync(x_, sizeof(T) * n_, gpuDevice_, s2_)); +// cudaCheckError( +// cudaMemPrefetchAsync(y_, sizeof(T) * m_, gpuDevice_, s3_)); +// break; +// } +// } +// } +// +// /** Make a call to the BLAS Library Kernel. */ +// void callGemv() override { +// switch (offload_) { +// case gpuOffloadType::always: { +// // Offload input data from host to the device. +// cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, +// cudaMemcpyHostToDevice, s1_)); +// cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_, +// cudaMemcpyHostToDevice, s2_)); +// cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_, +// cudaMemcpyHostToDevice, s3_)); +// // Call cuBLAS GEMV kernel +// if constexpr (std::is_same_v<T, float>) { +// cublasCheckError(cublasSgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } else if constexpr (std::is_same_v<T, double>) { +// cublasCheckError(cublasDgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } +// // Offload output data from device to host +// cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_, +// cudaMemcpyDeviceToHost, s3_)); +// // Ensure device has finished all work. +// cudaCheckError(cudaDeviceSynchronize()); +// break; +// } +// case gpuOffloadType::once: { +// // Call cuBLAS GEMV kernel +// if constexpr (std::is_same_v<T, float>) { +// cublasCheckError(cublasSgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } else if constexpr (std::is_same_v<T, double>) { +// cublasCheckError(cublasDgemv( +// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), +// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); +// } +// break; +// } +// case gpuOffloadType::unified: { +// // Call cuBLAS GEMV kernel +// if constexpr (std::is_same_v<T, float>) { +// cublasCheckError(cublasSgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_, +// std::max(1, m_), x_, vecIncrement_, +// &beta, y_, vecIncrement_)); +// } else if constexpr (std::is_same_v<T, double>) { +// cublasCheckError(cublasDgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_, +// std::max(1, m_), x_, vecIncrement_, +// &beta, y_, vecIncrement_)); +// } +// break; +// } +// } +// } +// +// /** Perform any required steps after calling the GEMV kernel that should +// * be timed. */ +// void postLoopRequirements() override { +// switch (offload_) { +// case gpuOffloadType::always: { +// // Offload data each iteration - no requirements +// break; +// } +// case gpuOffloadType::once: { +// // Offload output data from device to host +// cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_, +// cudaMemcpyDeviceToHost, s3_)); +// // Ensure device has finished all work. 
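+//        // (cudaDeviceSynchronize() waits on every stream; synchronising
+//        // s3_ alone would also be enough for y_ to be valid on the host,
+//        // since the device-to-host copy of y_ was issued on s3_.)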
+//        cudaCheckError(cudaDeviceSynchronize());
+//        break;
+//      }
+//      case gpuOffloadType::unified: {
+//        // Ensure all output data resides on host once work has completed
+//        cudaCheckError(
+//            cudaMemPrefetchAsync(y_, sizeof(T) * m_, cudaCpuDeviceId, s3_));
+//        // Ensure device has finished all work.
+//        cudaCheckError(cudaDeviceSynchronize());
+//        break;
+//      }
+//    }
+//  }
+//
+//  /** Do any necessary cleanup (free pointers, close library handles, etc.)
+//   * after Kernel has been called. */
+//  void postCallKernelCleanup() override {
+//    if (offload_ == gpuOffloadType::unified) {
+//      cudaFree(A_);
+//      cudaFree(x_);
+//      cudaFree(y_);
+//    } else {
+//      // Free the memory held on host and device
+//      cudaFreeHost((void*)A_);
+//      cudaFreeHost((void*)x_);
+//      cudaFreeHost((void*)y_);
+//      cudaFree(A_device_);
+//      cudaFree(x_device_);
+//      cudaFree(y_device_);
+//    }
+//  }
+//
+//  /** Whether the initialise function has been called before. */
+//  bool alreadyInitialised_ = false;
+//
+//  /** Handle used when calling cuBLAS. */
+//  cublasHandle_t handle_;
+//
+//  /** CUDA Stream 1 - used to asynchronously move data between host and device.
+//   */
+//  cudaStream_t s1_;
+//
+//  /** CUDA Stream 2 - used to asynchronously move data between host and device.
+//   */
+//  cudaStream_t s2_;
+//
+//  /** CUDA Stream 3 - used to asynchronously move data between host and device.
+//   */
+//  cudaStream_t s3_;
+//
+//  /** The ID of the target GPU Device. */
+//  int gpuDevice_;
+//
+//  /** Input matrix A, held on the device. */
+//  T* A_device_;
+//
+//  /** Input vector x, held on the device. */
+//  T* x_device_;
+//
+//  /** Input vector y, held on the device. */
+//  T* y_device_;
+//
+//  /** The constant value Alpha. */
+//  const T alpha = ALPHA;
+//
+//  /** The constant value Beta. */
+//  const T beta = BETA;
+//};
+//}  // namespace gpu
+//#endif
\ No newline at end of file
diff --git a/include/.DS_Store b/include/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..869e02c3a673dee3916dd63df65263ee873d8adc
Binary files /dev/null and b/include/.DS_Store differ
diff --git a/include/doGemm.hh b/include/doGemm.hh
index e264273..a33ef7e 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -8,6 +8,7 @@
 
 #if defined CPU_ARMPL
 #include "../ArmPL/gemm.hh"
+#include "../ArmPL/sp_gemm.hh"
 #elif defined CPU_ONEMKL
 #include "../oneMKL/CPU/gemm.hh"
 #elif defined CPU_AOCL
@@ -62,7 +63,9 @@ class doGemm {
 
   /** Run all problem types and write data to CSV files. */
   void collectData() {
-    if (doDense_) {
+    // ToDo -- hard-coded to false because kernel selection was not working;
+    // needs to be fixed
+    if (false) {
       // Square Problem Sizes...
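      // (With the guard above hard-coded to false, none of the dense sweeps
      // in this branch run; the sparse-square path further down still does.)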
// Re-initialise offload threshold structures cpuGpu_always_ = cpuGpu_offloadThreshold(); @@ -299,7 +302,7 @@ class doGemm { #endif } - if (doSparse_) { // Square sparse matrix - sparse matrix multiplication + if (true) { // Square sparse matrix - sparse matrix multiplication cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); @@ -307,7 +310,7 @@ class doGemm { getKernelName() + "_sparse_square.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.9999); + callSparseKernels(csvFile, dim, 0.99); } } // Close file @@ -524,8 +527,12 @@ class doGemm { #if CPU_ENABLED if (doCPU_) { +// std::cout << "about to initialise matrices with size = " << N << +// std::endl; spGemmCpu_.initialise(N, sparsity); +// std::cout << "about to run spGEMM" << std::endl; time_checksum_gflop cpuResult = spGemmCpu_.compute(); +// std::cout << "about to calculate flops" << std::endl; cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); @@ -536,31 +543,38 @@ class doGemm { // - UNIFIED : data passed from host to device (and device to host) as // needed if (doGPU_) { - spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); - time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); - gpuResult_unified.gflops = - calcGflops(flops, iterations_, gpuResult_unified.runtime); + std::cout << "Starting with matrix of size " << N << std::endl; + std::cout << "\t\tUnified"; + spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); + std::cout << "\tInitialised" << std::endl; + time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); // - ALWAYS: Offload to/from GPU every iteration - spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); - time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); - gpuResult_always.gflops = + std::cout << "\t\tAlways"; + spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); + std::cout << "\tInitialised" << std::endl; + time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); + gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); // - ONCE : Offload to/from GPU once before all iterations and once // after - spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); - time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); - gpuResult_once.gflops = + std::cout << "\t\tOnce"; + spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); + std::cout << "\tInitialised" << std::endl; + time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); + gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // ToDo -- non-default GPU operations // Write lines to CSV file - writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); - writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); - writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, + writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, iterations_, 
                     gpuResult_unified.runtime, gpuResult_unified.gflops);
diff --git a/include/kernels/.DS_Store b/include/kernels/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..9cc84b2a4ce0fb9e6849637c24a43195d7749e28
Binary files /dev/null and b/include/kernels/.DS_Store differ
diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
index 72fd5dc..dfab687 100644
--- a/include/kernels/CPU/sp_gemm.hh
+++ b/include/kernels/CPU/sp_gemm.hh
@@ -4,6 +4,7 @@
 
 #include <random>
 #include <memory>
+#include <iostream>
 
 namespace cpu {
 
@@ -11,10 +12,11 @@ namespace cpu {
 template <typename T>
 class sp_gemm : public ::gemm<T> {
  public:
-  using ::gemm<T>::gemm;
+  using ::gemm<T>::gemm;
   using ::gemm<T>::initInputMatricesSparse;
-  using ::gemm<T>::toCSR;
-  using ::gemm<T>::m_;
+  using ::gemm<T>::toCSR_int;
+  using ::gemm<T>::iterations_;
+  using ::gemm<T>::m_;
   using ::gemm<T>::n_;
   using ::gemm<T>::k_;
   using ::gemm<T>::A_;
@@ -30,7 +32,8 @@
     // Note that the below should be the same as the edges calculation
     // used in the initInputMatricesSparse function. If changed here,
     // change there
-    nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity_));
+    nnz_ = 1 + (int) ((double)n_ * (double)n_ * (1.0 - sparsity_));
+//    std::cout << "nnz_ = " << nnz_ << std::endl;
 
     A_ = (T*)malloc(sizeof(T) * n_ * n_);
     B_ = (T*)malloc(sizeof(T) * n_ * n_);
@@ -38,10 +41,12 @@
 
     initInputMatricesSparse(sparsity_);
 
-    toCSR();
+    toCSR_int();
   }
 
- private:
+  int nnz_;
+
+ private:
   /** Do any necessary cleanup (free pointers, close library handles, etc.)
    * after Kernel has been called. */
   void postCallKernelCleanup() {
@@ -50,7 +55,7 @@
     free(C_);
   }
 
-  void toCSR() {
+  void toCSR_int() {
     // Move A to CSR
     A_row_ptr_ = new int[n_ + 1];
     A_col_index_ = new int[nnz_];
@@ -86,8 +91,6 @@
 
   double sparsity_;
 
-  int nnz_;
-
   int* A_row_ptr_;
   int* A_col_index_;
   int* B_row_ptr_;
@@ -96,7 +99,7 @@
   int* C_col_index_;
   T* A_vals_;
   T* B_vals_;
-  T* C_vals;
+  T* C_vals_;
 };
 }  // namespace cpu
diff --git a/include/kernels/CPU/sp_gemv.hh b/include/kernels/CPU/sp_gemv.hh
new file mode 100644
index 0000000..0c84cb0
--- /dev/null
+++ b/include/kernels/CPU/sp_gemv.hh
@@ -0,0 +1,47 @@
+#pragma once
+
+#include "../gemv.hh"
+
+#include <random>
+#include <memory>
+
+namespace cpu {
+
+/** An abstract class for GEMV BLAS kernels. */
+ template <typename T>
+ class sp_gemv : public ::gemv<T> {
+  public:
+   using ::gemv<T>::gemv;
+   using ::gemv<T>::initInputMatrixVectorSparse;
+   using ::gemv<T>::m_;
+   using ::gemv<T>::n_;
+   using ::gemv<T>::A_;
+   using ::gemv<T>::x_;
+   using ::gemv<T>::y_;
+   using ::gemv<T>::sparsity_;
+
+  public:
+   /** Initialise the required data structures. */
+   void initialise(int n, double sparsity) {
+     m_ = n;
+     n_ = n;
+     sparsity_ = sparsity;
+
+     A_ = (T*)malloc(sizeof(T) * m_ * n_);
+     x_ = (T*)malloc(sizeof(T) * n_);
+     y_ = (T*)malloc(sizeof(T) * m_);
+
+     // Initialise the matrix and vectors
+     initInputMatrixVectorSparse();
+   }
+
+  private:
+   /** Do any necessary cleanup (free pointers, close library handles, etc.)
+ * after Kernel has been called. */ + void postCallKernelCleanup() override { + free(A_); + free(x_); + free(y_); + } + }; +} // namespace cpu \ No newline at end of file diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh index dbfba87..52a5494 100644 --- a/include/kernels/GPU/sp_gemm.hh +++ b/include/kernels/GPU/sp_gemm.hh @@ -17,7 +17,8 @@ namespace gpu { * - Always: Move data from host to device and device to host each iteration * - Unified: Initialise data as unified memory; no data movement semantics * required */ - virtual void initialise(gpuOffloadType offload, int n, float sparsity) = 0; + virtual void initialise(gpuOffloadType offload, int n, float sparsity) + = 0; protected: /** Whether data should be offloaded to/from the GPU each iteration, or just diff --git a/include/kernels/GPU/sp_gemv.hh b/include/kernels/GPU/sp_gemv.hh new file mode 100644 index 0000000..75fd126 --- /dev/null +++ b/include/kernels/GPU/sp_gemv.hh @@ -0,0 +1,28 @@ +#pragma once + +#include "../gemv.hh" + +namespace gpu { + +/** An abstract class for GEMV BLAS kernels. */ + template <typename T> + class sp_gemv : public ::gemv<T> { + public: + using ::gemv<T>::gemv; + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int n, float sparsity) + = 0; + + protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. */ + gpuOffloadType offload_ = gpuOffloadType::always; + }; +} // namespace gpu \ No newline at end of file diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index d357734..6d75554 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -9,6 +9,7 @@ #include <cmath> #include <limits> #include <random> +#include <iostream> #include "../utilities.hh" @@ -27,10 +28,13 @@ class gemm { std::chrono::high_resolution_clock::now(); // Perform all GEMM calls +// std::cout << "about to do pre-loop requirements" << std::endl; preLoopRequirements(); for (int i = 0; i < iterations_; i++) { +// std::cout << "entering loop " << i << std::endl; callGemm(); } +// std::cout << "about to do post-loop requirements" << std::endl; postLoopRequirements(); // Stop Timer diff --git a/include/kernels/gemv.hh b/include/kernels/gemv.hh index ba12d02..665fe59 100644 --- a/include/kernels/gemv.hh +++ b/include/kernels/gemv.hh @@ -4,6 +4,7 @@ #include <chrono> #include <cmath> #include <limits> +#include <random> #include "../utilities.hh" @@ -82,6 +83,82 @@ class gemv { } } + void initInputMatrixVectorSparse() { + // Initialise sparse matrix + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution<double> dist(0.0, 1.0); + + int edges = 1 + (int) (n_ * n_ * (1 - sparsity_)); + + // Using a=0.45 and b=c=0.22 as default probabilities + for (int i = 0; i < edges; i++) { + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, + false)) {} + } + + // Initialise the input and output vectors + for (int y = 0; y < n_; y++) { + x_[y] = 
(T)((double)(rand() % 100) / 3.0);
+    }
+    for (int y = 0; y < m_; y++) {
+      y_[y] = (T)0.0;
+    }
+  }
+
+  /** Recursive function to populate sparse matrices */
+  bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b,
+            float c, std::default_random_engine* gen,
+            std::uniform_real_distribution<double> dist, bool bin) {
+    // If a 1x1 submatrix, then add an edge and return out
+    if (x1 >= x2 && y1 >= y2) {
+      // Needed to avoid overflow segfaults with large problem sizes
+      uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1);
+      if (abs(M[index]) > 0.1) {
+        return false;
+      } else {
+        // Add 1.0 if this is a binary graph, and a random real number otherwise
+        M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0);
+        return true;
+      }
+    } else {
+      // Divide up the matrix
+      int xMidPoint = x1 + floor((x2 - x1) / 2);
+      int yMidPoint = y1 + floor((y2 - y1) / 2);
+
+      // ToDo -- add some noise to these values between iterations
+      float newA = a;
+      float newB = b;
+      float newC = c;
+
+      // Work out which quarter to recurse into
+      // There are some ugly ternary operators here to avoid going out of
+      // bounds in the edge case where we are already at 1 width or 1 height
+      float randomNum = dist(*gen);
+      if (randomNum < a) {
+        return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
+                    newA, newB, newC, gen, dist, bin);
+      } else if (randomNum < (a + b)) {
+        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
+                    y1, yMidPoint, newA, newB, newC, gen, dist, bin);
+      } else if (randomNum < (a + b + c)) {
+        return rMat(M, n, x1, xMidPoint,
+                    ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
+                    newA, newB, newC, gen, dist, bin);
+      } else {
+        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
+                    ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
+                    newA, newB, newC, gen, dist, bin);
+      }
+    }
+    return true;
+  }
+
   /** Call the extern consume() function. */
   void callConsume() { consume((void*)A_, (void*)x_, (void*)y_); }
 
@@ -105,4 +182,6 @@ class gemv {
   /** The distance between two vector elements.
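   * These benchmarks always use contiguously stored vectors, hence the
   * fixed increment of 1 below.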
*/ const int vecIncrement_ = 1; + + double sparsity_ = 0.0; }; From a8e5c4690238832761286e2cde7ab7f2170acf26 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:53:08 +0100 Subject: [PATCH 25/38] Adding AOCL files --- .idea/workspace.xml | 6 +- ArmPL/sp_gemm.hh | 266 +++++++-------------------------- createGflopsGraphs.py | 2 +- cuBLAS/common.hh | 2 +- include/doGemm.hh | 11 -- include/kernels/CPU/sp_gemm.hh | 10 +- include/kernels/gemm.hh | 3 - src/main.cc | 24 +-- 8 files changed, 80 insertions(+), 244 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index b954508..e9a4d65 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -125,9 +125,9 @@ </method> </configuration> <list> - <item itemvalue="C/C++ File.main.cc" /> <item itemvalue="Native Application.all" /> <item itemvalue="Native Application.gpu-blob" /> + <item itemvalue="C/C++ File.main.cc" /> </list> <recent_temporary> <list> @@ -171,7 +171,9 @@ <workItem from="1723101242209" duration="21225000" /> <workItem from="1724244974273" duration="40294000" /> <workItem from="1726568120590" duration="8508000" /> - <workItem from="1726828018604" duration="38592000" /> + <workItem from="1726828018604" duration="52619000" /> + <workItem from="1727941759103" duration="43000" /> + <workItem from="1727941814674" duration="165000" /> </task> <task id="LOCAL-00001" summary="trivial changes"> <option name="closed" value="true" /> diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index 47b0bf9..cb6b443 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -25,6 +25,9 @@ class sp_gemm_cpu : public sp_gemm<T> { using sp_gemm<T>::B_; using sp_gemm<T>::C_; using sp_gemm<T>::nnz_; + using sp_gemm<T>::A_vals_; + using sp_gemm<T>::B_vals_; + using sp_gemm<T>::C_vals_; private: /** Make call to the GEMM kernel. */ @@ -57,19 +60,18 @@ class sp_gemm_cpu : public sp_gemm<T> { status_ = armpl_spmm_exec_s(transA_, transB_, alpha, - *A_armpl_, - *B_armpl_, + A_armpl_, + B_armpl_, beta, - *B_armpl_); + B_armpl_); } else if constexpr (std::is_same_v<T, double>) { - std::cout << "About to execute dgemm" << std::endl; status_ = armpl_spmm_exec_d(transA_, transB_, alpha, - *A_armpl_, - *B_armpl_, + A_armpl_, + B_armpl_, beta, - *B_armpl_); + B_armpl_); } else { // Un-specialised class will not do any work - print error and exit. std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." @@ -89,20 +91,18 @@ class sp_gemm_cpu : public sp_gemm<T> { * be timed. */ void preLoopRequirements() override { // Need to put A_ and B_ into A_armpl_ and B_armpl_ - // ToDo -- Error catching toCSR_armpl(); -// std::cout << "toCSR_armpl() wrapped up without a problem" << std::endl; } /** Perform any required steps after calling the GEMM kernel that should * be timed. 
*/ void postLoopRequirements() override { - status_ = armpl_spmat_destroy(*A_armpl_); + status_ = armpl_spmat_destroy(A_armpl_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); } - status_ = armpl_spmat_destroy(*B_armpl_); + status_ = armpl_spmat_destroy(B_armpl_); if (status_ != ARMPL_STATUS_SUCCESS) { std::cout << "ERROR " << status_ << std::endl; exit(1); @@ -113,12 +113,12 @@ class sp_gemm_cpu : public sp_gemm<T> { // exit(1); // } -// delete [] A_armpl_row_ptr_; -// delete [] A_armpl_col_index_; -// delete [] A_vals_; -// delete [] B_armpl_row_ptr_; -// delete [] B_armpl_col_index_; -// delete [] B_vals_; + delete [] A_armpl_row_ptr_; + delete [] A_armpl_col_index_; + delete [] A_vals_; + delete [] B_armpl_row_ptr_; + delete [] B_armpl_col_index_; + delete [] B_vals_; // delete [] C_armpl_row_ptr_; // delete [] C_armpl_col_index_; // delete [] C_vals_; @@ -131,10 +131,6 @@ class sp_gemm_cpu : public sp_gemm<T> { /** The constant value Beta. */ const T beta = BETA; - armpl_status_t status_; - - armpl_spmat_t armpl_A, armpl_B, armpl_C; - void toCSR_armpl() { n_armpl_ = n_; // ToDo -- check whether flags_ is correct! @@ -145,50 +141,19 @@ class sp_gemm_cpu : public sp_gemm<T> { A_armpl_col_index_ = new armpl_int_t[nnz_]; A_vals_ = new T[nnz_]; A_armpl_row_ptr_[0] = 0; - int nnz_encountered = 0; -// std::cout << "About to load A into csr" << std::endl; + for (int row = 0; row < n_; row++) { -// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl; A_armpl_row_ptr_[row + 1] = nnz_encountered; for (int col = 0; col < n_; col++) { if (A_[(row * n_) + col] != 0.0) { -// std::cout << "\t\tCol " << col << " = " << A_[(row * n_) + col] << -// std::endl; A_armpl_col_index_[nnz_encountered] = col; A_vals_[nnz_encountered] = static_cast<T>(A_[(row * n_) + col]); nnz_encountered++; -// std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl; } } } -// std::cout << "___A =" << std::endl << "\t\t["; -// for (int i = 0; i < (n_ + 1); i++) { -// if (i != 0) { -// std::cout << ", "; -// } -// std::cout << A_armpl_row_ptr_[i]; -// } -// std::cout << "]" << std::endl << "\t\t["; -// for (int i = 0; i < nnz_; i++) { -// if (i != 0) { -// std::cout << ", "; -// } -// std::cout << A_armpl_col_index_[i]; -// } -// std::cout << "]" << std::endl << "\t\t["; -// for (int i = 0; i < nnz_; i++) { -// if (i != 0) { -// std::cout << ", "; -// } -// std::cout << A_vals_[i]; -// } -// std::cout << "]" << std::endl; - - -// std::cout << "About to load B into csr" << std::endl; - // Move B to CSR B_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; B_armpl_col_index_ = new armpl_int_t[nnz_]; @@ -197,113 +162,20 @@ class sp_gemm_cpu : public sp_gemm<T> { nnz_encountered = 0; for (int row = 0; row < n_; row++) { -// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << -// std::endl; B_armpl_row_ptr_[row + 1] = nnz_encountered; for (int col = 0; col < n_; col++) { if (B_[(row * n_) + col] != 0.0) { -// std::cout << "\t\tCol " << col << " = " << B_[(row * n_) + col] << std::endl; B_armpl_col_index_[nnz_encountered] = col; B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]); nnz_encountered++; -// std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl; } } } -// std::cout << "___B =" << std::endl << "\t\t["; -// for (int i = 0; i < (n_ + 1); i++) { -// if (i != 0) { -// std::cout << ", "; -// } -// std::cout << B_armpl_row_ptr_[i]; -// } -// std::cout << "]" << std::endl << "\t\t["; -// for (int i = 0; i < nnz_; i++) 
{ -// if (i != 0) { -// std::cout << ", "; -// } -// std::cout << B_armpl_col_index_[i]; -// } -// std::cout << "]" << std::endl << "\t\t["; -// for (int i = 0; i < nnz_; i++) { -// if (i != 0) { -// std::cout << ", "; -// } -// std::cout << B_vals_[i]; -// } -// std::cout << "]" << std::endl; - - -// // Move B to CSR -// C_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; -// C_armpl_col_index_ = new armpl_int_t[nnz_]; -// C_vals_ = new T[nnz_]; -// C_armpl_row_ptr_[0] = 0; -// -// nnz_encountered = 0; -//// std::cout << "About to load C into csr" << std::endl; -// for (int row = 0; row < n_; row++) { -//// std::cout << "\tRow " << (row + 1) << " = " << nnz_encountered << std::endl; -// C_armpl_row_ptr_[row + 1] = nnz_encountered; -// for (int col = 0; col < n_; col++) { -// if (A_[(row * n_) + col] != 0.0) { -// C_armpl_col_index_[nnz_encountered] = col; -// C_vals_[nnz_encountered] = A_[(row * n_) + col]; -// nnz_encountered++; -//// std::cout << "\t\tCol " << col << " = " << C_vals_[nnz_encountered] << -//// std::endl; -//// std::cout << "\tnnz_encountered = " << nnz_encountered << std::endl; -// } -// } -// } - -// std::cout << "___C =" << std::endl << "\t\t["; -// for (int i = 0; i < (n_ + 1); i++) { -// if (i != 0) { -// std::cout << ", "; -// } -// std::cout << C_armpl_row_ptr_[i]; -// } -// std::cout << "]" << std::endl << "\t\t["; -// for (int i = 0; i < nnz_; i++) { -// if (i != 0) { -// std::cout << ", "; -// } -// std::cout << C_armpl_col_index_[i]; -// } -// std::cout << "]" << std::endl << "\t\t["; -// for (int i = 0; i < nnz_; i++) { -// if (i != 0) { -// std::cout << ", "; -// } -// std::cout << C_vals_[i]; -// } -// std::cout << "]" << std::endl; - - -// std::cout << "Loading csr A into armpl storage formats" << std::endl; if constexpr (std::is_same_v<T, float>) { - std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl; - std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof - (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0]; - for (int i = 1; i < (n_ + 1); i++) { - std::cout << ", " << A_armpl_row_ptr_[i]; - } - std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " << - sizeof(A_armpl_col_index_[0]) << ") = [" << - A_armpl_col_index_[0]; - for (int i = 1; i < nnz_; i++) { - std::cout << ", " << A_armpl_col_index_[i]; - } - std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof - (A_vals_[0]) << ") = [" << A_vals_[0]; - for (int i = 1; i < nnz_; i++) { - std::cout << ", " << A_vals_[i]; - } - std::cout << "]" << std::endl << "flags: " << flags_ << std::endl; - - status_ = armpl_spmat_create_csr_s(A_armpl_, +// printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&A_armpl_, n_armpl_, n_armpl_, A_armpl_row_ptr_, @@ -315,21 +187,9 @@ class sp_gemm_cpu : public sp_gemm<T> { exit(1); } -// std::cout << "Loading csr C into armpl storage formats" << std::endl; -// status_ = armpl_spmat_create_csr_s(C_armpl_, -// n_armpl_, -// n_armpl_, -// C_armpl_row_ptr_, -// C_armpl_col_index_, -// C_vals_, -// flags_); -// if (status_ != ARMPL_STATUS_SUCCESS) { -// std::cout << "ERROR " << status_ << std::endl; -// exit(1); -// } - -// std::cout << "Loading csr B into armpl storage formats" << std::endl; - status_ = armpl_spmat_create_csr_s(B_armpl_, +// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&B_armpl_, n_armpl_, n_armpl_, B_armpl_row_ptr_, @@ -341,28 +201,9 @@ class sp_gemm_cpu : public sp_gemm<T> { exit(1); } } else if 
constexpr (std::is_same_v<T, double>) { - std::cout << "\tn_armpl_ = " << n_armpl_ << std::endl; - std::cout << "\tA_armpl_row_ptr_ (size = " << sizeof - (A_armpl_row_ptr_[0]) << ") = [" << A_armpl_row_ptr_[0]; - for (int i = 1; i < (n_ + 1); i++) { - std::cout << ", " << A_armpl_row_ptr_[i]; - } - std::cout << "]" << std::endl << "\tA_armpl_col_index_ (size = " << - sizeof(A_armpl_col_index_[0]) << ") = [" << - A_armpl_col_index_[0]; - for (int i = 1; i < nnz_; i++) { - std::cout << ", " << A_armpl_col_index_[i]; - } - std::cout << "]" << std::endl << "\tA_vals_ (size = " << sizeof - (A_vals_[0]) << ") = [" << A_vals_[0]; - for (int i = 1; i < nnz_; i++) { - std::cout << ", " << A_vals_[i]; - } - std::cout << "]" << std::endl << "flags: " << flags_ << std::endl; - - - std::cout << "About to create CSR A (double)" << std::endl; - status_ = armpl_spmat_create_csr_d(A_armpl_, +// printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, +// nnz_, flags_ + status_ = armpl_spmat_create_csr_d(&A_armpl_, n_armpl_, n_armpl_, A_armpl_row_ptr_, @@ -374,22 +215,9 @@ class sp_gemm_cpu : public sp_gemm<T> { exit(1); } -// std::cout << "Loading csr C into armpl storage formats" << std::endl; -// status_ = armpl_spmat_create_csr_d(C_armpl_, -// n_armpl_, -// n_armpl_, -// C_armpl_row_ptr_, -// C_armpl_col_index_, -// C_vals_, -// flags_); -// if (status_ != ARMPL_STATUS_SUCCESS) { -// std::cout << "ERROR " << status_ << std::endl; -// exit(1); -// } - -// std::cout << "Loading csr B into armpl storage formats" << std::endl; - std::cout << "About to create CSR B (double)" << std::endl; - status_ = armpl_spmat_create_csr_d(B_armpl_, +// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_d(&B_armpl_, n_armpl_, n_armpl_, B_armpl_row_ptr_, @@ -400,11 +228,33 @@ class sp_gemm_cpu : public sp_gemm<T> { std::cout << "ERROR " << status_ << std::endl; exit(1); } +// std::cout << "Okay, all matrices made!!" << std::endl; } -// std::cout << "Okay, all matrices made!!" 
<< std::endl; } + void printCSR(armpl_int_t n, armpl_int_t* rp, armpl_int_t* ci, T* v, + armpl_int_t nz, armpl_int_t f) { + std::cout << "\tn = " << n << std::endl; + std::cout << "\trow ptr (size = " << sizeof(rp[0]) << ") = [" << rp[0]; + for (int i = 1; i < (n + 1); i++) { + std::cout << ", " << rp[i]; + } + std::cout << "]" << std::endl << "\tcol ind (size = " << sizeof(ci[0]) << + ") = [" << ci[0]; + for (int i = 1; i < nz; i++) { + std::cout << ", " << ci[i]; + } + std::cout << "]" << std::endl << "\tvals (size = " << sizeof(v[0]) << + ") = [" << v[0]; + for (int i = 1; i < nz; i++) { + std::cout << ", " << v[i]; + } + std::cout << "]" << std::endl << "\tflags = " << f << std::endl; + } + + armpl_status_t status_; + armpl_int_t flags_; armpl_int_t n_armpl_; @@ -416,13 +266,9 @@ class sp_gemm_cpu : public sp_gemm<T> { armpl_int_t* C_armpl_row_ptr_; armpl_int_t* C_armpl_col_index_; - T* A_vals_; - T* B_vals_; - T* C_vals_; - - armpl_spmat_t* A_armpl_; - armpl_spmat_t* B_armpl_; - armpl_spmat_t* C_armpl_; + armpl_spmat_t A_armpl_; + armpl_spmat_t B_armpl_; + armpl_spmat_t C_armpl_; armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS; armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS; diff --git a/createGflopsGraphs.py b/createGflopsGraphs.py index 07ac243..ee1a389 100644 --- a/createGflopsGraphs.py +++ b/createGflopsGraphs.py @@ -372,7 +372,7 @@ plt.margins(x=0.01, y=0.01) leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) - for obj in leg.legendHandles: + for obj in leg.legend_handles: obj.set_linewidth(3.0) obj.set_markersize(15.0) obj.set_markeredgewidth(3.0) diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh index c8086db..f3ff6ef 100644 --- a/cuBLAS/common.hh +++ b/cuBLAS/common.hh @@ -2,7 +2,7 @@ #if defined GPU_CUBLAS -#include "cusparse.h" +#include <cusparse_v2.h> /** Macro function to check if error occurred when calling cuBLAS. */ /** Macro function to check if error occurred when calling CUDA. 
*/ diff --git a/include/doGemm.hh b/include/doGemm.hh index a33ef7e..c71684f 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -527,12 +527,8 @@ class doGemm { #if CPU_ENABLED if (doCPU_) { -// std::cout << "about to initialise matrices with size = " << N << -// std::endl; spGemmCpu_.initialise(N, sparsity); -// std::cout << "about to run spGEMM" << std::endl; time_checksum_gflop cpuResult = spGemmCpu_.compute(); -// std::cout << "about to calculate flops" << std::endl; cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); @@ -543,26 +539,19 @@ class doGemm { // - UNIFIED : data passed from host to device (and device to host) as // needed if (doGPU_) { - std::cout << "Starting with matrix of size " << N << std::endl; - std::cout << "\t\tUnified"; spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); - std::cout << "\tInitialised" << std::endl; time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); // - ALWAYS: Offload to/from GPU every iteration - std::cout << "\t\tAlways"; spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); - std::cout << "\tInitialised" << std::endl; time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); // - ONCE : Offload to/from GPU once before all iterations and once // after - std::cout << "\t\tOnce"; spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); - std::cout << "\tInitialised" << std::endl; time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index dfab687..a11dcd0 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -33,7 +33,6 @@ namespace cpu { // used in the initInputMatricesSparse function. If changed here, // change there nnz_ = 1 + (int) ((double)n_ * (double)n_ * (1.0 - sparsity_)); -// std::cout << "nnz_ = " << nnz_ << std::endl; A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); @@ -46,6 +45,12 @@ namespace cpu { int nnz_; + protected: + + T* A_vals_; + T* B_vals_; + T* C_vals_; + private: /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. 
*/ @@ -97,9 +102,6 @@ namespace cpu { int* B_col_index_; int* C_row_ptr_; int* C_col_index_; - T* A_vals_; - T* B_vals_; - T* C_vals_; }; } // namespace cpu diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 6d75554..bbd17cb 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -28,13 +28,10 @@ class gemm { std::chrono::high_resolution_clock::now(); // Perform all GEMM calls -// std::cout << "about to do pre-loop requirements" << std::endl; preLoopRequirements(); for (int i = 0; i < iterations_; i++) { -// std::cout << "entering loop " << i << std::endl; callGemm(); } -// std::cout << "about to do post-loop requirements" << std::endl; postLoopRequirements(); // Stop Timer diff --git a/src/main.cc b/src/main.cc index 51d1cf1..e508b5b 100644 --- a/src/main.cc +++ b/src/main.cc @@ -50,18 +50,18 @@ int main(int argc, char** argv) { // -------- GEMV -------- // SGEMV Comparison - std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl; - doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); - sgemv.collectData(); - std::cout << "Finished!" << std::endl; - - // DGEMV Comparison - std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; - doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); - dgemv.collectData(); - std::cout << "Finished!" << std::endl; +// std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl; +// doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, +// doGpu); +// sgemv.collectData(); +// std::cout << "Finished!" << std::endl; +// +// // DGEMV Comparison +// std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; +// doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, +// doGpu); +// dgemv.collectData(); +// std::cout << "Finished!" 
<< std::endl; free(absPath); return 0; From 9eb464668e481ef1148dd4a160ccea3fe5e7563f Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Thu, 3 Oct 2024 10:51:18 +0100 Subject: [PATCH 26/38] No longer overwriting B_ --- .idea/workspace.xml | 22 +++++++++++---- ArmPL/sp_gemm.hh | 69 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 73 insertions(+), 18 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index e9a4d65..cb692bc 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,7 +15,10 @@ </configurations> </component> <component name="ChangeListManager"> - <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding AOCL files" /> + <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="working changes"> + <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" afterDir="false" /> + </list> <option name="SHOW_DIALOG" value="false" /> <option name="HIGHLIGHT_CONFLICTS" value="true" /> <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> @@ -125,9 +128,9 @@ </method> </configuration> <list> + <item itemvalue="C/C++ File.main.cc" /> <item itemvalue="Native Application.all" /> <item itemvalue="Native Application.gpu-blob" /> - <item itemvalue="C/C++ File.main.cc" /> </list> <recent_temporary> <list> @@ -174,6 +177,7 @@ <workItem from="1726828018604" duration="52619000" /> <workItem from="1727941759103" duration="43000" /> <workItem from="1727941814674" duration="165000" /> + <workItem from="1727941995420" duration="3199000" /> </task> <task id="LOCAL-00001" summary="trivial changes"> <option name="closed" value="true" /> @@ -495,7 +499,15 @@ <option name="project" value="LOCAL" /> <updated>1724234752813</updated> </task> - <option name="localTasksCounter" value="41" /> + <task id="LOCAL-00041" summary="working changes"> + <option name="closed" value="true" /> + <created>1727942003511</created> + <option name="number" value="00041" /> + <option name="presentableId" value="LOCAL-00041" /> + <option name="project" value="LOCAL" /> + <updated>1727942003511</updated> + </task> + <option name="localTasksCounter" value="42" /> <servers /> </component> <component name="TypeScriptGeneratedFilesManager"> @@ -513,7 +525,6 @@ </option> </component> <component name="VcsManagerConfiguration"> - <MESSAGE value="Adding sparse kernel to doGemm" /> <MESSAGE value="Adding matrix type enum class" /> <MESSAGE value="changes" /> <MESSAGE value="adding command line kernel selection" /> @@ -538,6 +549,7 @@ <MESSAGE value="Finalising" /> <MESSAGE value="Rebasing" /> <MESSAGE value="Adding AOCL files" /> - <option name="LAST_COMMIT_MESSAGE" value="Adding AOCL files" /> + <MESSAGE value="working changes" /> + <option name="LAST_COMMIT_MESSAGE" value="working changes" /> </component> </project> \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index cb6b443..28a2ca3 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -53,9 +53,6 @@ class sp_gemm_cpu : public sp_gemm<T> { // Todo -- See if using armpl_spmat_hint can improve performance here. 
// If so, follow with optimisation functions - - - if constexpr (std::is_same_v<T, float>) { status_ = armpl_spmm_exec_s(transA_, transB_, @@ -63,7 +60,7 @@ class sp_gemm_cpu : public sp_gemm<T> { A_armpl_, B_armpl_, beta, - B_armpl_); + C_armpl_); } else if constexpr (std::is_same_v<T, double>) { status_ = armpl_spmm_exec_d(transA_, transB_, @@ -71,7 +68,7 @@ class sp_gemm_cpu : public sp_gemm<T> { A_armpl_, B_armpl_, beta, - B_armpl_); + C_armpl_); } else { // Un-specialised class will not do any work - print error and exit. std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." @@ -107,11 +104,11 @@ class sp_gemm_cpu : public sp_gemm<T> { std::cout << "ERROR " << status_ << std::endl; exit(1); } -// status_ = armpl_spmat_destroy(*C_armpl_); -// if (status_ != ARMPL_STATUS_SUCCESS) { -// std::cout << "ERROR " << status_ << std::endl; -// exit(1); -// } + status_ = armpl_spmat_destroy(C_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } delete [] A_armpl_row_ptr_; delete [] A_armpl_col_index_; @@ -119,9 +116,9 @@ class sp_gemm_cpu : public sp_gemm<T> { delete [] B_armpl_row_ptr_; delete [] B_armpl_col_index_; delete [] B_vals_; -// delete [] C_armpl_row_ptr_; -// delete [] C_armpl_col_index_; -// delete [] C_vals_; + delete [] C_armpl_row_ptr_; + delete [] C_armpl_col_index_; + delete [] C_vals_; } @@ -172,6 +169,24 @@ class sp_gemm_cpu : public sp_gemm<T> { } } + // Move C to CSR + C_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; + C_armpl_col_index_ = new armpl_int_t[nnz_]; + C_vals_ = new T[nnz_]; + C_armpl_row_ptr_[0] = 0; + + nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + C_armpl_row_ptr_[row + 1] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (B_[(row * n_) + col] != 0.0) { + C_armpl_col_index_[nnz_encountered] = col; + C_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]); + nnz_encountered++; + } + } + } + if constexpr (std::is_same_v<T, float>) { // printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, // nnz_, flags_); @@ -200,6 +215,20 @@ class sp_gemm_cpu : public sp_gemm<T> { std::cout << "ERROR " << status_ << std::endl; exit(1); } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&C_armpl_, + n_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } } else if constexpr (std::is_same_v<T, double>) { // printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, // nnz_, flags_ @@ -228,6 +257,20 @@ class sp_gemm_cpu : public sp_gemm<T> { std::cout << "ERROR " << status_ << std::endl; exit(1); } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_d(&C_armpl_, + n_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } // std::cout << "Okay, all matrices made!!" 
<< std::endl; } From 7f82b7d52f0ab2420774159d9099fb40aef00ce2 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:56:42 +0100 Subject: [PATCH 27/38] Adding AOCL files --- .idea/workspace.xml | 25 +++++++++---- include/doGemm.hh | 66 +++++++++++++++++++++++++++++----- include/doGemv.hh | 57 ++++++++++++++++------------- include/kernels/CPU/sp_gemm.hh | 7 ++-- include/kernels/gemm.hh | 7 ++-- include/kernels/gemv.hh | 5 +-- src/main.cc | 62 +++++++++++++++++++++----------- 7 files changed, 160 insertions(+), 69 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index cb692bc..a5afad2 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,9 +15,14 @@ </configurations> </component> <component name="ChangeListManager"> - <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="working changes"> + <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="No longer overwriting B_"> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/src/main.cc" beforeDir="false" afterPath="$PROJECT_DIR$/src/main.cc" afterDir="false" /> </list> <option name="SHOW_DIALOG" value="false" /> <option name="HIGHLIGHT_CONFLICTS" value="true" /> @@ -177,7 +182,7 @@ <workItem from="1726828018604" duration="52619000" /> <workItem from="1727941759103" duration="43000" /> <workItem from="1727941814674" duration="165000" /> - <workItem from="1727941995420" duration="3199000" /> + <workItem from="1727941995420" duration="22747000" /> </task> <task id="LOCAL-00001" summary="trivial changes"> <option name="closed" value="true" /> @@ -507,7 +512,15 @@ <option name="project" value="LOCAL" /> <updated>1727942003511</updated> </task> - <option name="localTasksCounter" value="42" /> + <task id="LOCAL-00042" summary="No longer overwriting B_"> + <option name="closed" value="true" /> + <created>1727949079616</created> + <option name="number" value="00042" /> + <option name="presentableId" value="LOCAL-00042" /> + <option name="project" value="LOCAL" /> + <updated>1727949079616</updated> + </task> + <option name="localTasksCounter" value="43" /> <servers /> </component> <component name="TypeScriptGeneratedFilesManager"> @@ -525,7 +538,6 @@ </option> </component> <component name="VcsManagerConfiguration"> - <MESSAGE value="Adding matrix type enum class" /> <MESSAGE value="changes" /> <MESSAGE value="adding command line kernel selection" /> <MESSAGE value="Adding basic sparse multiplication kernel for default CPU and 
GPU" /> @@ -550,6 +562,7 @@ <MESSAGE value="Rebasing" /> <MESSAGE value="Adding AOCL files" /> <MESSAGE value="working changes" /> - <option name="LAST_COMMIT_MESSAGE" value="working changes" /> + <MESSAGE value="No longer overwriting B_" /> + <option name="LAST_COMMIT_MESSAGE" value="No longer overwriting B_" /> </component> </project> \ No newline at end of file diff --git a/include/doGemm.hh b/include/doGemm.hh index c71684f..a3e5e77 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -65,7 +65,7 @@ class doGemm { void collectData() { // ToDo -- I've hard coded false here as kernel selection was not working // . Needs to be fixed - if (false) { + if (doDense_) { // Square Problem Sizes... // Re-initialise offload threshold structures cpuGpu_always_ = cpuGpu_offloadThreshold(); @@ -301,13 +301,12 @@ class doGemm { } #endif } - - if (true) { // Square sparse matrix - sparse matrix multiplication + if (doSparse_) { // Square sparse matrix - sparse matrix multiplication cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" + - getKernelName() + "_sparse_square.csv"); + getKernelName() + "_sparse_square_99.csv"); if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { callSparseKernels(csvFile, dim, 0.99); @@ -316,10 +315,59 @@ class doGemm { // Close file csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Sparse Square"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.99"); + } +#endif + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + "_sparse_square_999.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.999); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.999"); + } +#endif + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + "_sparse_square_9999.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.9999); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.9999"); + } +#endif + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + csvFile = initCSVFile(std::string(CSV_DIR) + "/" + + getKernelName() + + "_sparse_square_99999.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.99999); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Sparse Square 0.99999"); + } #endif } } @@ -530,7 +578,7 @@ class doGemm { spGemmCpu_.initialise(N, sparsity); time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - 
writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, cpuResult.runtime, cpuResult.gflops); } #endif diff --git a/include/doGemv.hh b/include/doGemv.hh index b86aad6..12cd097 100644 --- a/include/doGemv.hh +++ b/include/doGemv.hh @@ -33,13 +33,16 @@ class doGemv { public: doGemv(const std::string csvDir, const int iters, const int startDim, const int upperLimit, const bool cpuEnabled = true, - const bool gpuEnabled = true) + const bool gpuEnabled = true, const bool doDense = true, const bool + doSparse = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), doCPU_(cpuEnabled), - doGPU_(gpuEnabled) + doGPU_(gpuEnabled), + doDense_(doDense), + doSparse_(doSparse) #if CPU_ENABLED , gemvCpu_(iterations_) @@ -56,28 +59,29 @@ class doGemv { /** Run all problem types and write data to CSV files. */ void collectData() { - // Square Problem Sizes... - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - std::ofstream csvFile = - initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = dim; - callKernels(csvFile, dim, dim); - } - // Close file - csvFile.close(); -#if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Vector (M=N)"); - } -#endif + if (doDense_) { + // Square Problem Sizes... + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = + initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = dim; + callKernels(csvFile, dim, dim); + } + // Close file + csvFile.close(); + #if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Vector (M=N)"); + } + #endif // Rectangular Problem Sizes: // Tall and thin x Vector @@ -182,6 +186,7 @@ class doGemv { } #endif } + } private: /** Call the appropriate CPU and GPU GEMV kernels. */ @@ -494,6 +499,10 @@ class doGemv { /** Whether the GPU kernels should be run. */ const bool doGPU_ = true; + /** Whether sparse and or dense kernels should be run. */ + const bool doSparse_; + const bool doDense_; + #if CPU_ENABLED /** The GEMV CPU kernel. */ cpu::gemv_cpu<T> gemvCpu_; diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh index a11dcd0..c431d4d 100644 --- a/include/kernels/CPU/sp_gemm.hh +++ b/include/kernels/CPU/sp_gemm.hh @@ -32,18 +32,19 @@ namespace cpu { // Note that the below should be the same as the edges calculation // used in the initInputMatricesSparse function. 
If changed here, // change there - nnz_ = 1 + (int) ((double)n_ * (double)n_ * (1.0 - sparsity_)); + nnz_ = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity_)); +// std::cout << "\t____About to malloc()____" << std::endl; A_ = (T*)malloc(sizeof(T) * n_ * n_); B_ = (T*)malloc(sizeof(T) * n_ * n_); C_ = (T*)malloc(sizeof(T) * n_ * n_); - initInputMatricesSparse(sparsity_); + initInputMatricesSparse(sparsity); toCSR_int(); } - int nnz_; + uint64_t nnz_; protected: diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index bbd17cb..6e1328e 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -107,14 +107,14 @@ class gemm { .time_since_epoch().count()); std::uniform_real_distribution<double> dist(0.0, 1.0); - int edges = 1 + (int) (n_ * n_ * (1 - sparsity)); + int edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity)); // Using a=0.45 and b=c=0.22 as default probabilities for (int i = 0; i < edges; i++) { while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, - false)) {} + false)) {} while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, - false)) {} + false)) {} } } @@ -165,7 +165,6 @@ class gemm { gen, dist, bin); } } - return true; } void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index, diff --git a/include/kernels/gemv.hh b/include/kernels/gemv.hh index 665fe59..a64b19c 100644 --- a/include/kernels/gemv.hh +++ b/include/kernels/gemv.hh @@ -95,10 +95,11 @@ class gemv { .time_since_epoch().count()); std::uniform_real_distribution<double> dist(0.0, 1.0); - int edges = 1 + (int) (n_ * n_ * (1 - sparsity_)); + uint64_t edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - + sparsity_)); // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < edges; i++) { + for (uint64_t i = 0; i < edges; i++) { while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, false)) {} } diff --git a/src/main.cc b/src/main.cc index e508b5b..bdc1db2 100644 --- a/src/main.cc +++ b/src/main.cc @@ -7,6 +7,10 @@ bool doSgemm = true; bool doDgemm = true; bool doSp_sgemm = true; bool doSp_dgemm = true; +bool doSgemv = true; +bool doDgemv = true; +bool doSp_sgemv = true; +bool doSp_dgemv = true; bool doCpu = CPU_ENABLED; bool doGpu = GPU_ENABLED; @@ -50,18 +54,18 @@ int main(int argc, char** argv) { // -------- GEMV -------- // SGEMV Comparison -// std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl; -// doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, -// doGpu); -// sgemv.collectData(); -// std::cout << "Finished!" << std::endl; -// -// // DGEMV Comparison -// std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; -// doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, -// doGpu); -// dgemv.collectData(); -// std::cout << "Finished!" << std::endl; + std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl; + doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, + doGpu, doSgemv, doSp_sgemv); + sgemv.collectData(); + std::cout << "Finished!" << std::endl; + + // DGEMV Comparison + std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; + doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, + doGpu, doDgemv, doSp_dgemv); + dgemv.collectData(); + std::cout << "Finished!" 
<< std::endl; free(absPath); return 0; @@ -146,7 +150,8 @@ void getParameters(int argc, char** argv) { } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { - doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = false; + doSgemm = doDgemm = doSp_sgemm = doSp_dgemm = + doSgemv = doDgemv = doSp_sgemv = doSp_dgemv = false; std::string kernelList = argv[++i]; if (kernelList.find("sp-sgemm") != std::string::npos) { doSp_sgemm = true; @@ -167,13 +172,28 @@ void getParameters(int argc, char** argv) { doDgemm = true; } - if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm) { - std::cout << "ERROR - no implemented kernels in list" << std::endl; - exit(1); - } - } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) { - if (++i >= argc) { - std::cout << "ERROR - Invalid output directory" << std::endl; + + if (kernelList.find("sp-sgemv") != std::string::npos) { + doSp_sgemv = true; + if (kernelList.find("sgemv") != std::string::npos && + kernelList.find("sgemv") != kernelList.find("sp-sgemv") + 3) { + doSgemv = true; + } + } else if (kernelList.find("sgemv") != std::string::npos) { + doSgemv = true; + } + if (kernelList.find("sp-dgemv") != std::string::npos) { + doSp_dgemv = true; + if (kernelList.find("dgemv") != std::string::npos && + kernelList.find("dgemv") != kernelList.find("sp-dgemv") + 3) { + doDgemv = true; + } + } else if (kernelList.find("dgemv") != std::string::npos) { + doDgemv = true; + } + if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm && + !doSgemv && !doDgemv && !doSp_sgemv && !doSp_dgemv) { + std::cout << "ERROR - no implemented kernels in list" << std::endl; exit(1); } else { CSV_DIR = argv[i]; @@ -212,4 +232,4 @@ void getParameters(int argc, char** argv) { exit(1); } } -} \ No newline at end of file +} From 0130b81655b1fa04b433c4d22f9288df723cefd2 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:58:16 +0100 Subject: [PATCH 28/38] Adding AOCL files --- .idea/workspace.xml | 23 ++++++++----- ArmPL/sp_gemm.hh | 84 +++++++++++++++++++++++++++++++++++++++++++++ Makefile | 2 +- include/doGemm.hh | 26 +++++++------- include/doGemv.hh | 12 +++---- include/helpers.hh | 12 ++++--- 6 files changed, 127 insertions(+), 32 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index a5afad2..2bb35d8 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,14 +15,13 @@ </configurations> </component> <component name="ChangeListManager"> - <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="No longer overwriting B_"> + <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding kernel selection for gemv"> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/Makefile" beforeDir="false" afterPath="$PROJECT_DIR$/Makefile" afterDir="false" /> <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" beforeDir="false" 
afterPath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/src/main.cc" beforeDir="false" afterPath="$PROJECT_DIR$/src/main.cc" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/helpers.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/helpers.hh" afterDir="false" /> </list> <option name="SHOW_DIALOG" value="false" /> <option name="HIGHLIGHT_CONFLICTS" value="true" /> @@ -520,7 +519,15 @@ <option name="project" value="LOCAL" /> <updated>1727949079616</updated> </task> - <option name="localTasksCounter" value="43" /> + <task id="LOCAL-00043" summary="Adding kernel selection for gemv"> + <option name="closed" value="true" /> + <created>1728650780575</created> + <option name="number" value="00043" /> + <option name="presentableId" value="LOCAL-00043" /> + <option name="project" value="LOCAL" /> + <updated>1728650780575</updated> + </task> + <option name="localTasksCounter" value="44" /> <servers /> </component> <component name="TypeScriptGeneratedFilesManager"> @@ -538,7 +545,6 @@ </option> </component> <component name="VcsManagerConfiguration"> - <MESSAGE value="changes" /> <MESSAGE value="adding command line kernel selection" /> <MESSAGE value="Adding basic sparse multiplication kernel for default CPU and GPU" /> <MESSAGE value="Implementing cuSPARSE kernel" /> @@ -563,6 +569,7 @@ <MESSAGE value="Adding AOCL files" /> <MESSAGE value="working changes" /> <MESSAGE value="No longer overwriting B_" /> - <option name="LAST_COMMIT_MESSAGE" value="No longer overwriting B_" /> + <MESSAGE value="Adding kernel selection for gemv" /> + <option name="LAST_COMMIT_MESSAGE" value="Adding kernel selection for gemv" /> </component> </project> \ No newline at end of file diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh index 28a2ca3..612f4f1 100644 --- a/ArmPL/sp_gemm.hh +++ b/ArmPL/sp_gemm.hh @@ -89,6 +89,90 @@ class sp_gemm_cpu : public sp_gemm<T> { void preLoopRequirements() override { // Need to put A_ and B_ into A_armpl_ and B_armpl_ toCSR_armpl(); + + /** providing hints to ARMPL and optimizing the matrix datastructures */ + // TODO -- is noallocs best here? + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + // TODO -- will this be FEW? 
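+    // (Best guess: ARMPL also defines ARMPL_SPARSE_INVOCATIONS_SINGLE and
+    // ARMPL_SPARSE_INVOCATIONS_FEW, but each matrix here is reused for every
+    // spmm call in the timed loop, so MANY is assumed to be the better fit.)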
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
+                               ARMPL_SPARSE_INVOCATIONS_MANY);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
+                               ARMPL_SPARSE_INVOCATIONS_MANY);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION,
+                               ARMPL_SPARSE_OPERATION_NOTRANS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION,
+                               ARMPL_SPARSE_OPERATION_NOTRANS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+    // TODO -- investigate which is better here
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+                               ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+                               ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+// TODO -- this is throwing an error -- couldn't immediately fix, so come
+// back to it
+
+//    /** provide hints for the optimisation of the spmm execution */
+//    status_ = armpl_spmm_optimize(ARMPL_SPARSE_OPERATION_NOTRANS,
+//                                  ARMPL_SPARSE_OPERATION_NOTRANS,
+//                                  ARMPL_SPARSE_SCALAR_ONE,
+//                                  A_armpl_, B_armpl_,
+//                                  ARMPL_SPARSE_SCALAR_ZERO,
+//                                  C_armpl_);
+//    if (status_ != ARMPL_STATUS_SUCCESS) {
+//      std::cout << "ERROR " << status_ << std::endl;
+//      exit(1);
+//    }
   }
 
   /** Perform any required steps after calling the GEMM kernel that should
diff --git a/Makefile b/Makefile
index e5091e0..22d080c 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ CXX = $(CXX_$(COMPILER))
 
 CXXFLAGS_ARM = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
 CXXFLAGS_CLANG = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
-CXXFLAGS_GNU = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
+CXXFLAGS_GNU = -std=c++17 -Wall -Wno-deprecated-declarations -Ofast -$(ARCHFLAG)=native
 CXXFLAGS_INTEL = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -Wno-tautological-constant-compare
 CXXFLAGS_NVIDIA = -std=c++17 -Wall -O3 -fast -$(ARCHFLAG)=native
 CXXFLAGS_HIP = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
diff --git a/include/doGemm.hh b/include/doGemm.hh
index a3e5e77..93cc058 100644
--- a/include/doGemm.hh
+++ b/include/doGemm.hh
@@ -392,8 +392,8 @@ class doGemm {
       cpuResult = gemmCpu_.compute();
       cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
       // Write result to CSV file
-      writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, iterations_,
-                     cpuResult.runtime, cpuResult.gflops);
+      writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize,
+                     0.0, iterations_, cpuResult.runtime, cpuResult.gflops);
     }
 #endif
 
@@ -422,13 +422,13 @@ class doGemm {
 
       // Write results to CSV file
       writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, K, probSize,
-                     iterations_, gpuResult_once.runtime,
+                     0.0, iterations_, gpuResult_once.runtime,
                      gpuResult_once.gflops);
       writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, K,
-                     probSize, iterations_, gpuResult_always.runtime,
+                     probSize, 0.0, iterations_, gpuResult_always.runtime,
                      gpuResult_always.gflops);
       writeLineToCsv(csvFile, "gpu_unified", kernelName,
M, N, K, probSize, - iterations_, gpuResult_unified.runtime, + 0.0, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); } #endif @@ -578,8 +578,9 @@ class doGemm { spGemmCpu_.initialise(N, sparsity); time_checksum_gflop cpuResult = spGemmCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, + sparsity, iterations_, cpuResult.runtime, + cpuResult.gflops); } #endif #if GPU_ENABLED @@ -607,13 +608,14 @@ class doGemm { // Write lines to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, - iterations_, gpuResult_once.runtime, gpuResult_once.gflops); + sparsity, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, - iterations_, gpuResult_always.runtime, - gpuResult_always.gflops); + sparsity, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, - iterations_, gpuResult_unified.runtime, - gpuResult_unified.gflops); + sparsity, iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); } #endif diff --git a/include/doGemv.hh b/include/doGemv.hh index 12cd097..2ab5fb1 100644 --- a/include/doGemv.hh +++ b/include/doGemv.hh @@ -207,8 +207,8 @@ class doGemv { cpuResult = gemvCpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, 0.0, + iterations_, cpuResult.runtime, cpuResult.gflops); } #endif @@ -237,13 +237,13 @@ class doGemv { // Write results to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, 0, probSize, - iterations_, gpuResult_once.runtime, + 0.0, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, 0, - probSize, iterations_, gpuResult_always.runtime, + probSize, 0.0, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, 0, probSize, - iterations_, gpuResult_unified.runtime, + 0.0, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); } #endif @@ -500,8 +500,8 @@ class doGemv { const bool doGPU_ = true; /** Whether sparse and or dense kernels should be run. */ - const bool doSparse_; const bool doDense_; + const bool doSparse_; #if CPU_ENABLED /** The GEMV CPU kernel. */ diff --git a/include/helpers.hh b/include/helpers.hh index 5618557..d760cd7 100644 --- a/include/helpers.hh +++ b/include/helpers.hh @@ -17,8 +17,8 @@ std::ofstream initCSVFile(const std::string filename) { std::ofstream newFile(filename); - newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total " - "Seconds,GFLOP/s" + newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),sparsity,Iterations," + "Total Seconds,GFLOP/s" << std::endl; return newFile; @@ -28,15 +28,17 @@ std::ofstream initCSVFile(const std::string filename) { * Function does not close the file. 
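 * Dense kernels pass a sparsity of 0.0, so that every row still carries a
 * value in the new sparsity column.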
*/ void writeLineToCsv(std::ofstream& file, const std::string device, const std::string kernel, const int M, const int N, - const int K, const double totalProbSize, const int iters, - const double totalTime, const double gflops) { + const int K, const double totalProbSize, const float + sparsity, const int iters, const double totalTime, + const double gflops) { if (!file.is_open()) { std::cout << "ERROR - Attempted to write line to a closed CSV file." << std::endl; exit(1); } file << device << "," << kernel << "," << M << "," << N << "," << K << "," - << std::fixed << std::setprecision(3) << totalProbSize << "," << iters + << std::fixed << std::setprecision(3) << totalProbSize << "," + << std::fixed << std::setprecision(8) << sparsity << "," << iters << "," << std::fixed << std::setprecision(5) << totalTime << "," << std::fixed << std::setprecision(3) << gflops << std::endl; } From 4581637b57e14c92b4b4ca40c200565aae9e3d91 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:12:42 +0100 Subject: [PATCH 29/38] Providing armpl with hints --- .idea/workspace.xml | 21 ++++++++++++--------- ArmPL/sp_gemm.hh | 1 + 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 2bb35d8..d791fa3 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,13 +15,8 @@ </configurations> </component> <component name="ChangeListManager"> - <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding kernel selection for gemv"> - <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> + <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Providing armpl with hints"> <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/Makefile" beforeDir="false" afterPath="$PROJECT_DIR$/Makefile" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/helpers.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/helpers.hh" afterDir="false" /> </list> <option name="SHOW_DIALOG" value="false" /> <option name="HIGHLIGHT_CONFLICTS" value="true" /> @@ -527,7 +522,15 @@ <option name="project" value="LOCAL" /> <updated>1728650780575</updated> </task> - <option name="localTasksCounter" value="44" /> + <task id="LOCAL-00044" summary="Providing armpl with hints"> + <option name="closed" value="true" /> + <created>1728655865948</created> + <option name="number" value="00044" /> + <option name="presentableId" value="LOCAL-00044" /> + <option name="project" value="LOCAL" /> + <updated>1728655865948</updated> + </task> + <option name="localTasksCounter" value="45" /> <servers /> </component> <component name="TypeScriptGeneratedFilesManager"> @@ -545,7 +548,6 @@ </option> </component> <component name="VcsManagerConfiguration"> - <MESSAGE value="adding command line kernel selection" /> <MESSAGE value="Adding basic sparse multiplication kernel for default CPU and GPU" /> <MESSAGE value="Implementing cuSPARSE kernel" /> <MESSAGE value="Trying to work out CSR malloc bug" /> @@ -570,6 
+572,7 @@
       <MESSAGE value="working changes" />
       <MESSAGE value="No longer overwriting B_" />
       <MESSAGE value="Adding kernel selection for gemv" />
-    <option name="LAST_COMMIT_MESSAGE" value="Adding kernel selection for gemv" />
+    <MESSAGE value="Providing armpl with hints" />
+    <option name="LAST_COMMIT_MESSAGE" value="Providing armpl with hints" />
   </component>
 </project>
\ No newline at end of file
diff --git a/ArmPL/sp_gemm.hh b/ArmPL/sp_gemm.hh
index 612f4f1..e8e28a5 100644
--- a/ArmPL/sp_gemm.hh
+++ b/ArmPL/sp_gemm.hh
@@ -355,6 +355,7 @@ class sp_gemm_cpu : public sp_gemm<T> {
       std::cout << "ERROR " << status_ << std::endl;
       exit(1);
     }
+
     // std::cout << "Okay, all matrices made!!" << std::endl;
   }
 
From 477b7a0a050caeeb86ff4776ab75cbe4982cf883 Mon Sep 17 00:00:00 2001
From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com>
Date: Mon, 21 Oct 2024 15:14:42 +0100
Subject: [PATCH 30/38] Updating createGflopsGraphs.py to show sparsity

---
 .idea/workspace.xml   | 6 ++++--
 createGflopsGraphs.py | 7 +++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index d791fa3..d27d844 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -15,8 +15,9 @@
     </configurations>
   </component>
   <component name="ChangeListManager">
-    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Providing armpl with hints">
-      <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" afterDir="false" />
+    <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding AOCL files">
+      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/createGflopsGraphs.py" beforeDir="false" afterPath="$PROJECT_DIR$/createGflopsGraphs.py" afterDir="false" />
     </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -177,6 +178,7 @@
       <workItem from="1727941759103" duration="43000" />
       <workItem from="1727941814674" duration="165000" />
       <workItem from="1727941995420" duration="22747000" />
+      <workItem from="1729503392250" duration="1773000" />
     </task>
     <task id="LOCAL-00001" summary="trivial changes">
       <option name="closed" value="true" />
diff --git a/createGflopsGraphs.py b/createGflopsGraphs.py
index ee1a389..7739eeb 100644
--- a/createGflopsGraphs.py
+++ b/createGflopsGraphs.py
@@ -54,7 +54,8 @@
     # Get number of iterations performed and kernel name
     line1 = lines[0].split(',')
-    iters = int(line1[6])
+    sparsity = float(line1[6])
+    iters = int(line1[7])
     kernel = line1[1]
 
     # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types
@@ -143,7 +144,9 @@
     elif kernel == "dgemm":
         fp = "FP64"
     y_name = "{} GFLOP/s".format(fp)
-    title = "{}GEMM Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters)
+    title = ("{}GEMM Performance for {} Problems (sparsity = {}) - {} "
+             "iterations per problem size").format(kernel[0].upper(),
+             inputTypeStr, sparsity, iters)
 
     # Make Graph
     fig1 = plt.figure(figsize=(28,16))
From 407c008a75384457002c105c71311461af48854e Mon Sep 17 00:00:00 2001
From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com>
Date: Mon, 21 Oct 2024 15:50:36 +0100
Subject: [PATCH 31/38] Beginning gemv ARMPL

---
 ArmPL/sp_gemv.hh | 406 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 406 insertions(+)
 create mode 100644
ArmPL/sp_gemv.hh
diff --git a/ArmPL/sp_gemv.hh b/ArmPL/sp_gemv.hh
new file mode 100644
index 0000000..818c95e
--- /dev/null
+++ b/ArmPL/sp_gemv.hh
@@ -0,0 +1,406 @@
+#pragma once
+
+#ifdef CPU_ARMPL
+#include <stdio.h>
+#include <stdlib.h>
+#include <armpl.h>
+#include <omp.h>
+
+#include <algorithm>
+
+#include "../include/kernels/CPU/sp_gemv.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+/** A class for sparse GEMV CPU BLAS kernels. */
+template <typename T>
+class sp_gemv_cpu : public sp_gemv<T> {
+ public:
+  using sp_gemv<T>::sp_gemv;
+  using sp_gemv<T>::callConsume;
+  using sp_gemv<T>::m_;
+  using sp_gemv<T>::n_;
+  using sp_gemv<T>::k_;
+  using sp_gemv<T>::A_;
+  using sp_gemv<T>::B_;
+  using sp_gemv<T>::C_;
+  using sp_gemv<T>::nnz_;
+  using sp_gemv<T>::A_vals_;
+  using sp_gemv<T>::B_vals_;
+  using sp_gemv<T>::C_vals_;
+
+ private:
+  /** Make a call to the GEMV kernel. */
+  void callGemv() override {
+
+    /**
+     * Flow of ARMPL Sparse LA:
+     *
+     * 1. Create sparse matrix objects: armpl_spmat_create_csr[sdcz]()
+     *
+     * 2. Supply hints on usage: armpl_spmat_hint()
+     *
+     * 3. Optimise for SpMV: armpl_spmv_optimize()
+     *
+     * 4. Solve SpMV case: armpl_spmv_exec_[sdcz]()
+     *
+     * 5. Destroy sparse matrix object: armpl_spmat_destroy()
+     *
+     * In addition, users can choose to update a set of non-zero values using
+     * armpl_spmat_update_[sdcz]()
+     */
+
+    // Todo -- See if using armpl_spmat_hint can improve performance here.
+    //  If so, follow with optimisation functions
+
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = armpl_spmm_exec_s(transA_,
+                                  transB_,
+                                  alpha,
+                                  A_armpl_,
+                                  B_armpl_,
+                                  beta,
+                                  C_armpl_);
+    } else if constexpr (std::is_same_v<T, double>) {
+      status_ = armpl_spmm_exec_d(transA_,
+                                  transB_,
+                                  alpha,
+                                  A_armpl_,
+                                  B_armpl_,
+                                  beta,
+                                  C_armpl_);
+    } else {
+      // Un-specialised class will not do any work - print error and exit.
+      std::cout << "ERROR - Datatype for ArmPL CPU GEMV kernel not supported."
+                << std::endl;
+      exit(1);
+    }
+
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    // Ensure compiler doesn't optimise away the work being done
+    callConsume();
+  }
+
+  /** Perform any required steps before calling the GEMV kernel that should
+   * be timed. */
+  void preLoopRequirements() override {
+    // Need to put A_ and B_ into A_armpl_ and B_armpl_
+    toCSR_armpl();
+
+    /** providing hints to ARMPL and optimizing the matrix data structures */
+    // TODO -- is noallocs best here?
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_MEMORY,
+                               ARMPL_SPARSE_MEMORY_NOALLOCS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_MEMORY,
+                               ARMPL_SPARSE_MEMORY_NOALLOCS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_STRUCTURE,
+                               ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_STRUCTURE,
+                               ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+    // TODO -- will this be FEW?
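
The five-step flow spelled out in the docstring above condenses into a short, self-contained double-precision example. This is a minimal sketch, assuming the ArmPL sparse entry points named in that docstring (armpl_spmat_create_csr_d, armpl_spmat_hint, armpl_spmv_optimize, armpl_spmv_exec_d, armpl_spmat_destroy); the helper name runArmplSpmv and the SpMV-specific hint enums are illustrative assumptions, not part of the patch:

#include <armpl.h>
#include <cstdlib>

// One full create -> hint -> optimise -> execute -> destroy cycle.
void runArmplSpmv(armpl_int_t n, const armpl_int_t* row_ptr,
                  const armpl_int_t* col_idx, const double* vals,
                  const double* x, double* y) {
  armpl_spmat_t A;
  // 1. Create the sparse matrix object from existing CSR arrays.
  if (armpl_spmat_create_csr_d(&A, n, n, row_ptr, col_idx, vals, 0) !=
      ARMPL_STATUS_SUCCESS) exit(1);
  // 2. Hint the usage pattern so the optimiser can specialise.
  armpl_spmat_hint(A, ARMPL_SPARSE_HINT_SPMV_OPERATION,
                   ARMPL_SPARSE_OPERATION_NOTRANS);
  armpl_spmat_hint(A, ARMPL_SPARSE_HINT_SPMV_INVOCATIONS,
                   ARMPL_SPARSE_INVOCATIONS_MANY);
  // 3. Optimise the internal representation for repeated SpMV calls.
  if (armpl_spmv_optimize(A) != ARMPL_STATUS_SUCCESS) exit(1);
  // 4. Execute y = 1.0 * A * x + 0.0 * y.
  if (armpl_spmv_exec_d(ARMPL_SPARSE_OPERATION_NOTRANS, 1.0, A, x, 0.0, y) !=
      ARMPL_STATUS_SUCCESS) exit(1);
  // 5. Destroy the sparse matrix object.
  armpl_spmat_destroy(A);
}

Creating, hinting and optimising happen once per matrix; only the exec call belongs inside the timed loop.
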
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
+                               ARMPL_SPARSE_INVOCATIONS_MANY);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
+                               ARMPL_SPARSE_INVOCATIONS_MANY);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION,
+                               ARMPL_SPARSE_OPERATION_NOTRANS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION,
+                               ARMPL_SPARSE_OPERATION_NOTRANS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+    // TODO -- investigate which is better here
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+                               ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+                               ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+// TODO -- this is throwing an error -- couldn't immediately fix, so come
+// back to it
+
+//    /** provide hints for the optimisation of the spmm execution */
+//    status_ = armpl_spmm_optimize(ARMPL_SPARSE_OPERATION_NOTRANS,
+//                                  ARMPL_SPARSE_OPERATION_NOTRANS,
+//                                  ARMPL_SPARSE_SCALAR_ONE,
+//                                  A_armpl_, B_armpl_,
+//                                  ARMPL_SPARSE_SCALAR_ZERO,
+//                                  C_armpl_);
+//    if (status_ != ARMPL_STATUS_SUCCESS) {
+//      std::cout << "ERROR " << status_ << std::endl;
+//      exit(1);
+//    }
+  }
+
+  /** Perform any required steps after calling the GEMV kernel that should
+   * be timed. */
+  void postLoopRequirements() override {
+    status_ = armpl_spmat_destroy(A_armpl_);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_destroy(B_armpl_);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_destroy(C_armpl_);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+    delete [] A_armpl_row_ptr_;
+    delete [] A_armpl_col_index_;
+    delete [] A_vals_;
+    delete [] B_armpl_row_ptr_;
+    delete [] B_armpl_col_index_;
+    delete [] B_vals_;
+    delete [] C_armpl_row_ptr_;
+    delete [] C_armpl_col_index_;
+    delete [] C_vals_;
+
+  }
+
+  /** The constant value Alpha. */
+  const T alpha = ALPHA;
+
+  /** The constant value Beta. */
+  const T beta = BETA;
+
+  void toCSR_armpl() {
+    n_armpl_ = n_;
+    // ToDo -- check whether flags_ is correct!
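
For comparison with toCSR_armpl() below: a dense-to-CSR conversion must write row_ptr[row + 1] *after* scanning row `row`, so that the final entry row_ptr[n] carries the total non-zero count. The conversion below assigns it before the inner loop, which shifts every offset by one row and never fills the last entry -- a plausible source of the segfaults chased in later commits. A minimal reference sketch (denseToCsr is an illustrative name, not part of the patch):

template <typename T>
void denseToCsr(const T* dense, int n, armpl_int_t* row_ptr,
                armpl_int_t* col_idx, T* vals) {
  int nnz = 0;
  row_ptr[0] = 0;
  for (int row = 0; row < n; row++) {
    for (int col = 0; col < n; col++) {
      if (dense[(row * n) + col] != 0.0) {
        col_idx[nnz] = col;
        vals[nnz] = dense[(row * n) + col];
        nnz++;
      }
    }
    row_ptr[row + 1] = nnz;  // cumulative count *including* this row
  }
}
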
+ flags_ = 0; + + // Move A to CSR + A_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; + A_armpl_col_index_ = new armpl_int_t[nnz_]; + A_vals_ = new T[nnz_]; + A_armpl_row_ptr_[0] = 0; + int nnz_encountered = 0; + + for (int row = 0; row < n_; row++) { + A_armpl_row_ptr_[row + 1] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (A_[(row * n_) + col] != 0.0) { + A_armpl_col_index_[nnz_encountered] = col; + A_vals_[nnz_encountered] = static_cast<T>(A_[(row * n_) + col]); + nnz_encountered++; + } + } + } + + // Move B to CSR + B_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; + B_armpl_col_index_ = new armpl_int_t[nnz_]; + B_vals_ = new T[nnz_]; + B_armpl_row_ptr_[0] = 0; + + nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + B_armpl_row_ptr_[row + 1] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (B_[(row * n_) + col] != 0.0) { + B_armpl_col_index_[nnz_encountered] = col; + B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]); + nnz_encountered++; + } + } + } + + // Move C to CSR + C_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; + C_armpl_col_index_ = new armpl_int_t[nnz_]; + C_vals_ = new T[nnz_]; + C_armpl_row_ptr_[0] = 0; + + nnz_encountered = 0; + for (int row = 0; row < n_; row++) { + C_armpl_row_ptr_[row + 1] = nnz_encountered; + for (int col = 0; col < n_; col++) { + if (B_[(row * n_) + col] != 0.0) { + C_armpl_col_index_[nnz_encountered] = col; + C_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]); + nnz_encountered++; + } + } + } + + if constexpr (std::is_same_v<T, float>) { +// printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&A_armpl_, + n_armpl_, + n_armpl_, + A_armpl_row_ptr_, + A_armpl_col_index_, + A_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&B_armpl_, + n_armpl_, + n_armpl_, + B_armpl_row_ptr_, + B_armpl_col_index_, + B_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&C_armpl_, + n_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v<T, double>) { +// printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, +// nnz_, flags_ + status_ = armpl_spmat_create_csr_d(&A_armpl_, + n_armpl_, + n_armpl_, + A_armpl_row_ptr_, + A_armpl_col_index_, + A_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_d(&B_armpl_, + n_armpl_, + n_armpl_, + B_armpl_row_ptr_, + B_armpl_col_index_, + B_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_d(&C_armpl_, + n_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " 
<< status_ << std::endl; + exit(1); + } + +// std::cout << "Okay, all matrices made!!" << std::endl; + } + + } + + void printCSR(armpl_int_t n, armpl_int_t* rp, armpl_int_t* ci, T* v, + armpl_int_t nz, armpl_int_t f) { + std::cout << "\tn = " << n << std::endl; + std::cout << "\trow ptr (size = " << sizeof(rp[0]) << ") = [" << rp[0]; + for (int i = 1; i < (n + 1); i++) { + std::cout << ", " << rp[i]; + } + std::cout << "]" << std::endl << "\tcol ind (size = " << sizeof(ci[0]) << + ") = [" << ci[0]; + for (int i = 1; i < nz; i++) { + std::cout << ", " << ci[i]; + } + std::cout << "]" << std::endl << "\tvals (size = " << sizeof(v[0]) << + ") = [" << v[0]; + for (int i = 1; i < nz; i++) { + std::cout << ", " << v[i]; + } + std::cout << "]" << std::endl << "\tflags = " << f << std::endl; + } + + armpl_status_t status_; + + armpl_int_t flags_; + + armpl_int_t n_armpl_; + + armpl_int_t* A_armpl_row_ptr_; + armpl_int_t* A_armpl_col_index_; + armpl_int_t* B_armpl_row_ptr_; + armpl_int_t* B_armpl_col_index_; + armpl_int_t* C_armpl_row_ptr_; + armpl_int_t* C_armpl_col_index_; + + armpl_spmat_t A_armpl_; + armpl_spmat_t B_armpl_; + armpl_spmat_t C_armpl_; + + armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS; + armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS; + +}; +} // namespace cpu +#endif \ No newline at end of file From 893458824dc6d343e34a66207a7ebbfc9d67f9b3 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Mon, 21 Oct 2024 15:50:43 +0100 Subject: [PATCH 32/38] Beginning gemv ARMPL --- .idea/workspace.xml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index d27d844..5a61e8c 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -16,8 +16,7 @@ </component> <component name="ChangeListManager"> <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding AOCL files"> - <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/createGflopsGraphs.py" beforeDir="false" afterPath="$PROJECT_DIR$/createGflopsGraphs.py" afterDir="false" /> + <change afterPath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" afterDir="false" /> </list> <option name="SHOW_DIALOG" value="false" /> <option name="HIGHLIGHT_CONFLICTS" value="true" /> @@ -550,7 +549,6 @@ </option> </component> <component name="VcsManagerConfiguration"> - <MESSAGE value="Adding basic sparse multiplication kernel for default CPU and GPU" /> <MESSAGE value="Implementing cuSPARSE kernel" /> <MESSAGE value="Trying to work out CSR malloc bug" /> <MESSAGE value="cuSPARSE unified memory implementation" /> @@ -575,6 +573,7 @@ <MESSAGE value="No longer overwriting B_" /> <MESSAGE value="Adding kernel selection for gemv" /> <MESSAGE value="Providing armpl with hints" /> - <option name="LAST_COMMIT_MESSAGE" value="Providing armpl with hints" /> + <MESSAGE value="Updating createGflopsGraphs.py to show sparsity" /> + <option name="LAST_COMMIT_MESSAGE" value="Updating createGflopsGraphs.py to show sparsity" /> </component> </project> \ No newline at end of file From 2e61261a2ea804360db9bd4adbbb031198552f7d Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Thu, 2 Jan 2025 12:03:18 +0000 Subject: [PATCH 33/38] still trying to figure out segfault... 
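
When chasing a fault like this, a cheap CSR sanity check run just before the arrays are handed to the library usually localises it quickly. A minimal assert-based sketch, assuming a square matrix with zero-based 32-bit column indices; validateCsr is an illustrative name, not part of the patch:

#include <cassert>
#include <cstdint>

void validateCsr(int n, int64_t nnz, const int* row_ptr, const int* col_idx) {
  assert(row_ptr[0] == 0);        // offsets must start at zero
  assert(row_ptr[n] == nnz);      // final offset must equal total nnz
  for (int r = 0; r < n; r++) {
    assert(row_ptr[r] <= row_ptr[r + 1]);  // offsets are non-decreasing
  }
  for (int64_t i = 0; i < nnz; i++) {
    assert(col_idx[i] >= 0 && col_idx[i] < n);  // columns in range
  }
}
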
--- .idea/workspace.xml | 32 +- ArmPL/sp_gemv.hh | 175 +------ cuBLAS/sp_gemv.hh | 885 +++++++++++++++++++++++---------- include/doGemm.hh | 28 +- include/doGemv.hh | 279 +++++++---- include/kernels/CPU/sp_gemv.hh | 9 + 6 files changed, 864 insertions(+), 544 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 5a61e8c..9592790 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,8 +15,13 @@ </configurations> </component> <component name="ChangeListManager"> - <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Adding AOCL files"> - <change afterPath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" afterDir="false" /> + <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Beginning gemv ARMPL"> + <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" afterDir="false" /> </list> <option name="SHOW_DIALOG" value="false" /> <option name="HIGHLIGHT_CONFLICTS" value="true" /> @@ -178,6 +183,7 @@ <workItem from="1727941814674" duration="165000" /> <workItem from="1727941995420" duration="22747000" /> <workItem from="1729503392250" duration="1773000" /> + <workItem from="1730878516596" duration="9915000" /> </task> <task id="LOCAL-00001" summary="trivial changes"> <option name="closed" value="true" /> @@ -531,7 +537,23 @@ <option name="project" value="LOCAL" /> <updated>1728655865948</updated> </task> - <option name="localTasksCounter" value="45" /> + <task id="LOCAL-00045" summary="Beginning gemv ARMPL"> + <option name="closed" value="true" /> + <created>1729522236773</created> + <option name="number" value="00045" /> + <option name="presentableId" value="LOCAL-00045" /> + <option name="project" value="LOCAL" /> + <updated>1729522236773</updated> + </task> + <task id="LOCAL-00046" summary="Beginning gemv ARMPL"> + <option name="closed" value="true" /> + <created>1729522244950</created> + <option name="number" value="00046" /> + <option name="presentableId" value="LOCAL-00046" /> + <option name="project" value="LOCAL" /> + <updated>1729522244950</updated> + </task> + <option name="localTasksCounter" value="47" /> <servers /> </component> <component name="TypeScriptGeneratedFilesManager"> @@ -549,7 +571,6 @@ </option> </component> <component name="VcsManagerConfiguration"> - <MESSAGE value="Implementing cuSPARSE kernel" /> <MESSAGE value="Trying to work out CSR malloc bug" /> <MESSAGE value="cuSPARSE unified memory implementation" /> <MESSAGE value="Now compiles" /> @@ -574,6 +595,7 @@ <MESSAGE value="Adding kernel selection for gemv" /> <MESSAGE value="Providing armpl with hints" /> <MESSAGE value="Updating createGflopsGraphs.py to show sparsity" /> - <option name="LAST_COMMIT_MESSAGE" value="Updating createGflopsGraphs.py to show sparsity" /> + 
<MESSAGE value="Beginning gemv ARMPL" /> + <option name="LAST_COMMIT_MESSAGE" value="Beginning gemv ARMPL" /> </component> </project> \ No newline at end of file diff --git a/ArmPL/sp_gemv.hh b/ArmPL/sp_gemv.hh index 818c95e..f39a764 100644 --- a/ArmPL/sp_gemv.hh +++ b/ArmPL/sp_gemv.hh @@ -20,14 +20,10 @@ class sp_gemv_cpu : public sp_gemv<T> { using sp_gemv<T>::callConsume; using sp_gemv<T>::m_; using sp_gemv<T>::n_; - using sp_gemv<T>::k_; using sp_gemv<T>::A_; - using sp_gemv<T>::B_; - using sp_gemv<T>::C_; + using sp_gemv<T>::x_; + using sp_gemv<T>::y_; using sp_gemv<T>::nnz_; - using sp_gemv<T>::A_vals_; - using sp_gemv<T>::B_vals_; - using sp_gemv<T>::C_vals_; private: /** Make call to the GEMM kernel. */ @@ -50,25 +46,20 @@ class sp_gemv_cpu : public sp_gemv<T> { * armpl_spmat_update_[sdcz]() */ - // Todo -- See if using armpl_spmat_hint can improve performance here. - // If so, follow with optimisation functions - if constexpr (std::is_same_v<T, float>) { - status_ = armpl_spmm_exec_s(transA_, - transB_, + status_ = armpl_spmv_exec_s(trans_, alpha, A_armpl_, - B_armpl_, + x_, beta, - C_armpl_); + y_); } else if constexpr (std::is_same_v<T, double>) { - status_ = armpl_spmm_exec_d(transA_, - transB_, + status_ = armpl_spmv_exec_d(trans_, alpha, A_armpl_, - B_armpl_, + x_, beta, - C_armpl_); + y_); } else { // Un-specialised class will not do any work - print error and exit. std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." @@ -98,12 +89,6 @@ class sp_gemv_cpu : public sp_gemv<T> { std::cout << "ERROR " << status_ << std::endl; exit(1); } - status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_MEMORY, - ARMPL_SPARSE_MEMORY_NOALLOCS); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_STRUCTURE, ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); @@ -111,12 +96,6 @@ class sp_gemv_cpu : public sp_gemv<T> { std::cout << "ERROR " << status_ << std::endl; exit(1); } - status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_STRUCTURE, - ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } // TODO -- will this be FEW? 
status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, @@ -125,12 +104,6 @@ class sp_gemv_cpu : public sp_gemv<T> { std::cout << "ERROR " << status_ << std::endl; exit(1); } - status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, - ARMPL_SPARSE_INVOCATIONS_MANY); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION, ARMPL_SPARSE_OPERATION_NOTRANS); @@ -138,12 +111,6 @@ class sp_gemv_cpu : public sp_gemv<T> { std::cout << "ERROR " << status_ << std::endl; exit(1); } - status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION, - ARMPL_SPARSE_OPERATION_NOTRANS); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } // TODO -- investigate whch is better here status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY, @@ -152,12 +119,6 @@ class sp_gemv_cpu : public sp_gemv<T> { std::cout << "ERROR " << status_ << std::endl; exit(1); } - status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY, - ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } // TODO -- this is thorwing an error -- couldn't immediately fix so come // back to @@ -183,27 +144,10 @@ class sp_gemv_cpu : public sp_gemv<T> { std::cout << "ERROR " << status_ << std::endl; exit(1); } - status_ = armpl_spmat_destroy(B_armpl_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - status_ = armpl_spmat_destroy(C_armpl_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } delete [] A_armpl_row_ptr_; delete [] A_armpl_col_index_; delete [] A_vals_; - delete [] B_armpl_row_ptr_; - delete [] B_armpl_col_index_; - delete [] B_vals_; - delete [] C_armpl_row_ptr_; - delete [] C_armpl_col_index_; - delete [] C_vals_; - } /** The constant value Alpha. 
*/ @@ -235,42 +179,6 @@ class sp_gemv_cpu : public sp_gemv<T> { } } - // Move B to CSR - B_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; - B_armpl_col_index_ = new armpl_int_t[nnz_]; - B_vals_ = new T[nnz_]; - B_armpl_row_ptr_[0] = 0; - - nnz_encountered = 0; - for (int row = 0; row < n_; row++) { - B_armpl_row_ptr_[row + 1] = nnz_encountered; - for (int col = 0; col < n_; col++) { - if (B_[(row * n_) + col] != 0.0) { - B_armpl_col_index_[nnz_encountered] = col; - B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]); - nnz_encountered++; - } - } - } - - // Move C to CSR - C_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; - C_armpl_col_index_ = new armpl_int_t[nnz_]; - C_vals_ = new T[nnz_]; - C_armpl_row_ptr_[0] = 0; - - nnz_encountered = 0; - for (int row = 0; row < n_; row++) { - C_armpl_row_ptr_[row + 1] = nnz_encountered; - for (int col = 0; col < n_; col++) { - if (B_[(row * n_) + col] != 0.0) { - C_armpl_col_index_[nnz_encountered] = col; - C_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]); - nnz_encountered++; - } - } - } - if constexpr (std::is_same_v<T, float>) { // printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, // nnz_, flags_); @@ -285,34 +193,6 @@ class sp_gemv_cpu : public sp_gemv<T> { std::cout << "ERROR " << status_ << std::endl; exit(1); } - -// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, -// nnz_, flags_); - status_ = armpl_spmat_create_csr_s(&B_armpl_, - n_armpl_, - n_armpl_, - B_armpl_row_ptr_, - B_armpl_col_index_, - B_vals_, - flags_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - -// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, -// nnz_, flags_); - status_ = armpl_spmat_create_csr_s(&C_armpl_, - n_armpl_, - n_armpl_, - C_armpl_row_ptr_, - C_armpl_col_index_, - C_vals_, - flags_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } } else if constexpr (std::is_same_v<T, double>) { // printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, // nnz_, flags_ @@ -328,34 +208,6 @@ class sp_gemv_cpu : public sp_gemv<T> { exit(1); } -// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, -// nnz_, flags_); - status_ = armpl_spmat_create_csr_d(&B_armpl_, - n_armpl_, - n_armpl_, - B_armpl_row_ptr_, - B_armpl_col_index_, - B_vals_, - flags_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - -// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, -// nnz_, flags_); - status_ = armpl_spmat_create_csr_d(&C_armpl_, - n_armpl_, - n_armpl_, - C_armpl_row_ptr_, - C_armpl_col_index_, - C_vals_, - flags_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - // std::cout << "Okay, all matrices made!!" 
<< std::endl; } @@ -381,25 +233,20 @@ class sp_gemv_cpu : public sp_gemv<T> { std::cout << "]" << std::endl << "\tflags = " << f << std::endl; } + armpl_status_t status_; armpl_int_t flags_; armpl_int_t n_armpl_; + T* A_vals_; armpl_int_t* A_armpl_row_ptr_; armpl_int_t* A_armpl_col_index_; - armpl_int_t* B_armpl_row_ptr_; - armpl_int_t* B_armpl_col_index_; - armpl_int_t* C_armpl_row_ptr_; - armpl_int_t* C_armpl_col_index_; armpl_spmat_t A_armpl_; - armpl_spmat_t B_armpl_; - armpl_spmat_t C_armpl_; - armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS; - armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS; + armpl_sparse_hint_value trans_ = ARMPL_SPARSE_OPERATION_NOTRANS; }; } // namespace cpu diff --git a/cuBLAS/sp_gemv.hh b/cuBLAS/sp_gemv.hh index 8027746..f35a63a 100644 --- a/cuBLAS/sp_gemv.hh +++ b/cuBLAS/sp_gemv.hh @@ -1,261 +1,624 @@ -//#pragma once -// -//#ifdef GPU_CUBLAS -//#include <cusparse_v2.h> -//#include <cuda.h> -//#include <cublas_v2.h> -//#include <cuda_runtime.h> -//#include <type_traits> -//#include <random> -//#include <iostream> -// -//#include "../include/kernels/GPU/sp_gemv.hh" -//#include "../include/utilities.hh" -//#include "common.hh" -// -//namespace gpu { -///** A class for sparse GEMV GPU BLAS kernels. */ -//template <typename T> -//class gemv_gpu : public gemv<T> { -// public: -// using gemv<T>::gemv; -// using gemv<T>::initInputMatrixVector; -// using gemv<T>::m_; -// using gemv<T>::n_; -// using gemv<T>::A_; -// using gemv<T>::x_; -// using gemv<T>::y_; -// using gemv<T>::offload_; -// using gemv<T>::vecIncrement_; -// -// ~gemv_gpu() { -// if (alreadyInitialised_) { -// // Destroy the handle -// cublasCheckError(cublasDestroy(handle_)); -// -// // Destroy streams after use -// cudaCheckError(cudaStreamDestroy(s1_)); -// cudaCheckError(cudaStreamDestroy(s2_)); -// cudaCheckError(cudaStreamDestroy(s3_)); -// } -// } -// -// /** Initialise the required data structures. -// * `offload` refers to the data offload type: -// * - Once: Move data from host to device before all iterations & move from -// * device to host after all iterations -// * - Always: Move data from host to device and device to host each iteration -// * - Unified: Initialise data as unified memory; no data movement semantics -// * required */ -// void initialise(gpuOffloadType offload, int m, int n) override { -// if (!alreadyInitialised_) { -// alreadyInitialised_ = true; -// // Perform set-up which doesn't need to happen every problem size change. 
-// // Create a handle for CUBLAS -// cublasCheckError(cublasCreate(&handle_)); -// -// // Get device identifier -// cudaCheckError(cudaGetDevice(&gpuDevice_)); -// -// // Initialise 3 streams to asynchronously move data between host and -// // device -// cudaCheckError(cudaStreamCreate(&s1_)); -// cudaCheckError(cudaStreamCreate(&s2_)); -// cudaCheckError(cudaStreamCreate(&s3_)); -// } -// -// offload_ = offload; -// m_ = m; -// n_ = n; -// -// if (offload_ == gpuOffloadType::unified) { -// cudaCheckError(cudaMallocManaged(&A_, sizeof(T) * m_ * n_)); -// cudaCheckError(cudaMallocManaged(&x_, sizeof(T) * n_)); -// cudaCheckError(cudaMallocManaged(&y_, sizeof(T) * m_)); -// } else { -// // Allocate matrices on host -// cudaCheckError(cudaMallocHost((void**)&A_, sizeof(T) * m_ * n_)); -// cudaCheckError(cudaMallocHost((void**)&x_, sizeof(T) * n_)); -// cudaCheckError(cudaMallocHost((void**)&y_, sizeof(T) * m_)); -// // Allocate matrices on device -// cudaCheckError(cudaMalloc((void**)&A_device_, sizeof(T) * m_ * n_)); -// cudaCheckError(cudaMalloc((void**)&x_device_, sizeof(T) * n_)); -// cudaCheckError(cudaMalloc((void**)&y_device_, sizeof(T) * m_)); -// } -// -// // Initialise the host data structures -// initInputMatrixVector(); -// } -// -// private: -// /** Perform any required steps before calling the GEMV kernel that should -// * be timed. */ -// void preLoopRequirements() override { -// switch (offload_) { -// case gpuOffloadType::always: { -// // Offload data each iteration - no requirements -// break; -// } -// case gpuOffloadType::once: { -// // Offload input data from host to the device. -// cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, -// cudaMemcpyHostToDevice, s1_)); -// cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_, -// cudaMemcpyHostToDevice, s2_)); -// cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_, -// cudaMemcpyHostToDevice, s3_)); -// break; -// } -// case gpuOffloadType::unified: { -// // Prefetch input data to device -// cudaCheckError( -// cudaMemPrefetchAsync(A_, sizeof(T) * m_ * n_, gpuDevice_, s1_)); -// cudaCheckError( -// cudaMemPrefetchAsync(x_, sizeof(T) * n_, gpuDevice_, s2_)); -// cudaCheckError( -// cudaMemPrefetchAsync(y_, sizeof(T) * m_, gpuDevice_, s3_)); -// break; -// } -// } -// } -// -// /** Make a call to the BLAS Library Kernel. */ -// void callGemv() override { -// switch (offload_) { -// case gpuOffloadType::always: { -// // Offload input data from host to the device. -// cudaCheckError(cudaMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, -// cudaMemcpyHostToDevice, s1_)); -// cudaCheckError(cudaMemcpyAsync(x_device_, x_, sizeof(T) * n_, -// cudaMemcpyHostToDevice, s2_)); -// cudaCheckError(cudaMemcpyAsync(y_device_, y_, sizeof(T) * m_, -// cudaMemcpyHostToDevice, s3_)); -// // Call cuBLAS GEMV kernel -// if constexpr (std::is_same_v<T, float>) { -// cublasCheckError(cublasSgemv( -// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), -// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); -// } else if constexpr (std::is_same_v<T, double>) { -// cublasCheckError(cublasDgemv( -// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), -// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); -// } -// // Offload output data from device to host -// cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_, -// cudaMemcpyDeviceToHost, s3_)); -// // Ensure device has finished all work. 
-// cudaCheckError(cudaDeviceSynchronize()); -// break; -// } -// case gpuOffloadType::once: { -// // Call cuBLAS GEMV kernel -// if constexpr (std::is_same_v<T, float>) { -// cublasCheckError(cublasSgemv( -// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), -// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); -// } else if constexpr (std::is_same_v<T, double>) { -// cublasCheckError(cublasDgemv( -// handle_, CUBLAS_OP_N, m_, n_, &alpha, A_device_, std::max(1, m_), -// x_device_, vecIncrement_, &beta, y_device_, vecIncrement_)); -// } -// break; -// } -// case gpuOffloadType::unified: { -// // Call cuBLAS GEMV kernel -// if constexpr (std::is_same_v<T, float>) { -// cublasCheckError(cublasSgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_, -// std::max(1, m_), x_, vecIncrement_, -// &beta, y_, vecIncrement_)); -// } else if constexpr (std::is_same_v<T, double>) { -// cublasCheckError(cublasDgemv(handle_, CUBLAS_OP_N, m_, n_, &alpha, A_, -// std::max(1, m_), x_, vecIncrement_, -// &beta, y_, vecIncrement_)); -// } -// break; -// } -// } -// } -// -// /** Perform any required steps after calling the GEMV kernel that should -// * be timed. */ -// void postLoopRequirements() override { -// switch (offload_) { -// case gpuOffloadType::always: { -// // Offload data each iteration - no requirements -// break; -// } -// case gpuOffloadType::once: { -// // Offload output data from device to host -// cudaCheckError(cudaMemcpyAsync(y_, y_device_, sizeof(T) * m_, -// cudaMemcpyDeviceToHost, s3_)); -// // Ensure device has finished all work. -// cudaCheckError(cudaDeviceSynchronize()); -// break; -// } -// case gpuOffloadType::unified: { -// // Ensure all output data resides on host once work has completed -// cudaCheckError( -// cudaMemPrefetchAsync(y_, sizeof(T) * m_, cudaCpuDeviceId, s3_)); -// // Ensure device has finished all work. -// cudaCheckError(cudaDeviceSynchronize()); -// break; -// } -// } -// } -// -// /** Do any necessary cleanup (free pointers, close library handles, etc.) -// * after Kernel has been called. */ -// void postCallKernelCleanup() override { -// if (offload_ == gpuOffloadType::unified) { -// cudaFree(A_); -// cudaFree(x_); -// cudaFree(y_); -// } else { -// // Free the memory held on host and device -// cudaFreeHost((void*)A_); -// cudaFreeHost((void*)x_); -// cudaFreeHost((void*)y_); -// cudaFree(A_device_); -// cudaFree(x_device_); -// cudaFree(y_device_); -// } -// } -// -// /** Whether the initialise function has been called before. */ -// bool alreadyInitialised_ = false; -// -// /** Handle used when calling cuBLAS. */ -// cublasHandle_t handle_; -// -// /** CUDA Stream 1 - used to asynchronously move data between host and device. -// */ -// cudaStream_t s1_; -// -// /** CUDA Stream 2 - used to asynchronously move data between host and device. -// */ -// cudaStream_t s2_; -// -// /** CUDA Stream 3 - used to asynchronously move data between host and device. -// */ -// cudaStream_t s3_; -// -// /** The ID of the target GPU Device. */ -// int gpuDevice_; -// -// /** Input matrix A, held on the device. */ -// T* A_device_; -// -// /** Input vector x, held on the device. */ -// T* x_device_; -// -// /** Input vector y, held on the device. */ -// T* y_device_; -// -// /** The constant value Alpha. */ -// const T alpha = ALPHA; -// -// /** The constant value Beta. 
*/ -// const T beta = BETA; -//}; -//} // namespace gpu -//#endif \ No newline at end of file +#pragma once + +#ifdef GPU_CUBLAS +#include <cusparse_v2.h> +#include <cuda_runtime_api.h> +#include <type_traits> +#include <random> +#include <iostream> + +#include "../include/kernels/GPU/sp_gemv.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +/** A class for sparse GEMM GPU BLAS kernels. */ +template <typename T> +class sp_gemv_gpu : public sp_gemv<T> { + public: + using sp_gemv<T>::sp_gemv; + using sp_gemv<T>::initInputMatrixVectorSparse; +// using sp_gemv<T>::toCSR_int; + using sp_gemv<T>::m_; + using sp_gemv<T>::n_; + using sp_gemv<T>::A_; + using sp_gemv<T>::x_; + using sp_gemv<T>::y_; + using sp_gemv<T>::offload_; + using sp_gemv<T>::sparsity_; + + ~sp_gemv_gpu() { + // ToDo -- destroy the handle + + // Destroy streams after use + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); + } + + // ToDo -- No checksum for sparse yet. Need to do + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + void initialise(gpuOffloadType offload, int n, float sparsity) override { + std::cout << std::endl << "##############################" << std::endl + << "\tCUSPARSE GEMV\t\tInitialising n = " << n << "\tOffload" + << " type = " << + (((offload == gpuOffloadType::unified) ? "Unified" : (offload + == gpuOffloadType::always) ? "Always" : "Once")) + << std::endl + << "##############################" << std::endl; + offload_ = offload; + + sparsity_ = sparsity; + + + /** + * + * T* A_val_; + * int *A_col_, *A_row_; + * T* A_val_dev_; + * int *A_col_dev_, *A_row_dev_; + * uint64_t A_nnz_, vals_size_, cols_size_, rows_size_; + * + * + * T * x_host_, *y_host_; + * T *x_dev_, *y_dev_; + * uint64_t x_size_, y_size_; + * + */ + + // Create a handle for cuSPARSE + cusparseCheckError(cusparseCreate(&handle_)); + cudaCheckError(cudaGetDevice(&gpuDevice_)); + + if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F; + else if (std::is_same_v<T, double>) cudaDataType_ = CUDA_R_64F; + else { + std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; + exit(1); + } + n_ = n; + + // Initialise 3 streams to asynchronously move data between host and device + cudaCheckError(cudaStreamCreate(&s1_)); + cudaCheckError(cudaStreamCreate(&s2_)); + cudaCheckError(cudaStreamCreate(&s3_)); + + std::cout << "\tcuda streams created" << std::endl; + + + // Work out the sizes of all the vectors + A_nnz_ = 1 + (uint64_t)(n_ * n_ * (1 - sparsity)); + vals_size_ = sizeof(T) * A_nnz_; + cols_size_ = sizeof(int) * A_nnz_; + rows_size_ = sizeof(int) * (n_ + 1); + x_size_ = sizeof(T) * n_; + y_size_ = sizeof(T) * n_; + + if (offload_ == gpuOffloadType::unified) { + // Get device identifier + cudaCheckError(cudaMallocManaged(&A_val_, vals_size_)); + cudaCheckError(cudaMallocManaged(&A_col_, cols_size_)); + cudaCheckError(cudaMallocManaged(&A_row_, rows_size_)); + + cudaCheckError(cudaMallocManaged(&x_, x_size_)); + + cudaCheckError(cudaMallocManaged(&y_, y_size_)); + } else { + A_val_ = (T*)malloc(vals_size_); + A_col_ = (int*)malloc(cols_size_); + A_row_ = (int*)malloc(rows_size_); + + std::cout << 
"\tA_ local csr arrays made" << std::endl; + + x_ = (T*)malloc(x_size_); + y_ = (T*)malloc(y_size_); + + std::cout << "\tx_ and y_ local arrays made" << std::endl; + + cudaCheckError(cudaMalloc((void**)&A_val_dev_, vals_size_)); + cudaCheckError(cudaMalloc((void**)&A_col_dev_, cols_size_)); + cudaCheckError(cudaMalloc((void**)&A_row_dev_, rows_size_)); + + std::cout << "\tA_ dev csr arrays made" << std::endl; + + cudaCheckError(cudaMalloc((void**)&x_dev_, x_size_)); + + cudaCheckError(cudaMalloc((void**)&y_dev_, y_size_)); + + std::cout << "\tx_ and y_ dev arrays made" << std::endl; + } + + // Initialise the host matricies + // cusparseSpGEMM() works on CSR format only. This helpfully makes our + // sparse matrix format decision for us! + + // Initialise the matrices + // Set initial values to 0 + A_ = (T*)malloc(sizeof(T) * n_ * n_); + + std::cout << "\tA_ dense array made" << std::endl; + + initInputMatrixVectorSparse();git branc + + std::cout << "\tinputs made" << std::endl; + + toCSR_int(A_, n_, n_, A_val_, A_col_, A_row_); + + std::cout << "\tA_ moved to CSR" << std::endl; + +// std::cout << "_____Matrix A_____" << std::endl; +// printDenseMatrix(A_, n_, n_); +// std::cout << std::endl << std::endl; +// printCSR(A_val_, A_col_, A_row_, nnz_, n_, n_); + + std::cout << "\tInitialising done!" << std::endl; + } + + private: + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override { + std::cout << std::endl << "##############################" << std::endl + << "\tPreloop Requirements" << std::endl + << "##############################" << std::endl; + switch(offload_) { + case gpuOffloadType::always: { + // Make matrix descriptor + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + A_col_dev_, A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + std::cout << "\tA_ description made" << std::endl; + // Create vector descriptor + cusparseCheckError(cusparseCreateDnVec(&descrx_, n_, x_dev_, + cudaDataType_)); + std::cout << "\tx_ description made" << std::endl; + cusparseCheckError(cusparseCreateDnVec(&descry_, n_, NULL, + cudaDataType_)); + std::cout << "\ty_ description made" << std::endl; + break; + } + case gpuOffloadType::once: { + cudaCheckError(cudaMemcpy(A_val_dev_, A_val_, vals_size_, + cudaMemcpyHostToDevice)); + cudaCheckError(cudaMemcpy(A_col_dev_, A_col_, cols_size_, + cudaMemcpyHostToDevice)); + cudaCheckError(cudaMemcpy(A_row_dev_, A_row_, rows_size_, + cudaMemcpyHostToDevice)); + std::cout << "\tA_ csr dev arrays sunc" << std::endl; + + cudaCheckError(cudaMemcpy(x_dev_, x_, x_size_, + cudaMemcpyHostToDevice)); + std::cout << "\tx_ dev array sunc" << std::endl; + + cudaCheckError(cudaMemcpy(y_dev_, y_, y_size_, + cudaMemcpyHostToDevice)); + std::cout << "\ty_ dev array sunc" << std::endl; + + // Create matrix descriptor + cusparseCheckError( + cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + A_col_dev_, A_val_dev_, rType_, cType_, + indType_, cudaDataType_)); + std::cout << "\tA_ description made" << std::endl; + // Create vector descriptor + cusparseCheckError(cusparseCreateDnVec(&descrx_, n_, x_dev_, + cudaDataType_)); + std::cout << "\tx_ description made" << std::endl; + cusparseCheckError(cusparseCreateDnVec(&descry_, n_, NULL, + cudaDataType_)); + std::cout << "\ty_ description made" << std::endl; + break; + } + case gpuOffloadType::unified: { + // Prefetch memory to device + cudaCheckError(cudaMemPrefetchAsync(A_val_, vals_size_, gpuDevice_, + s1_)); + 
cudaCheckError(cudaMemPrefetchAsync(A_col_, cols_size_, gpuDevice_,
+                                            s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_row_, rows_size_, gpuDevice_,
+                                            s1_));
+        std::cout << "\tA_ csr dev arrays synced" << std::endl;
+
+        cudaCheckError(cudaMemPrefetchAsync(x_, x_size_, gpuDevice_, s2_));
+        std::cout << "\tx_ dev array synced" << std::endl;
+
+        cudaCheckError(cudaMemPrefetchAsync(y_, y_size_, gpuDevice_, s3_));
+        std::cout << "\ty_ dev array synced" << std::endl;
+        cudaCheckError(cudaDeviceSynchronize());
+        break;
+      }
+    }
+  }
+
+  /** Make a call to the BLAS Library Kernel. */
+  void callGemv() override {
+    std::cout << std::endl << "##############################" << std::endl
+              << "\tCalling GEMV" << std::endl
+              << "##############################" << std::endl;
+    switch(offload_) {
+      case gpuOffloadType::always: {
+        cudaCheckError(cudaMemcpy(A_val_dev_, A_val_, vals_size_,
+                                  cudaMemcpyHostToDevice));
+        cudaCheckError(cudaMemcpy(A_col_dev_, A_col_, cols_size_,
+                                  cudaMemcpyHostToDevice));
+        cudaCheckError(cudaMemcpy(A_row_dev_, A_row_, rows_size_,
+                                  cudaMemcpyHostToDevice));
+        std::cout << "\tA_ csr dev arrays synced" << std::endl;
+
+        cudaCheckError(cudaMemcpy(x_dev_, x_, x_size_, cudaMemcpyHostToDevice));
+        std::cout << "\tx_ dev array synced" << std::endl;
+
+        cudaCheckError(cudaMemcpy(y_dev_, y_, y_size_, cudaMemcpyHostToDevice));
+        std::cout << "\ty_ dev array synced" << std::endl;
+
+        /**
+         * Workflow is:
+         *    cusparseSpMV_bufferSize
+         *    cusparseSpMV_preprocess
+         *    cusparseSpMV
+         */
+        cusparseCheckError(cusparseSpMV_bufferSize(handle_,
+                                                   opA_,
+                                                   &alpha,
+                                                   descrA_,
+                                                   descrx_,
+                                                   &beta,
+                                                   descry_,
+                                                   cudaDataType_,
+                                                   alg_,
+                                                   &buffer_size_));
+
+        std::cout << "\tbufferSize run" << std::endl;
+        cudaCheckError(cudaMalloc((void**)&buffer_, buffer_size_));
+        std::cout << "\tbuffer allocated" << std::endl;
+
+        cusparseCheckError(cusparseSpMV_preprocess(handle_,
+                                                   opA_,
+                                                   &alpha,
+                                                   descrA_,
+                                                   descrx_,
+                                                   &beta,
+                                                   descry_,
+                                                   cudaDataType_,
+                                                   alg_,
+                                                   buffer_));
+        std::cout << "\tpreProcess run" << std::endl;
+        cusparseCheckError(cusparseSpMV(handle_,
+                                        opA_,
+                                        &alpha,
+                                        descrA_,
+                                        descrx_,
+                                        &beta,
+                                        descry_,
+                                        cudaDataType_,
+                                        alg_,
+                                        buffer_));
+        std::cout << "\tSpMV run" << std::endl;
+
+        cudaCheckError(cudaMemcpy(A_val_, A_val_dev_, vals_size_,
+                                  cudaMemcpyDeviceToHost));
+        cudaCheckError(cudaMemcpy(A_col_, A_col_dev_, cols_size_,
+                                  cudaMemcpyDeviceToHost));
+        cudaCheckError(cudaMemcpy(A_row_, A_row_dev_, rows_size_,
+                                  cudaMemcpyDeviceToHost));
+
+        std::cout << "\tA_ csr host arrays synced" << std::endl;
+
+        cudaCheckError(cudaMemcpy(x_, x_dev_, x_size_, cudaMemcpyDeviceToHost));
+        std::cout << "\tx_ host array synced" << std::endl;
+
+        cudaCheckError(cudaMemcpy(y_, y_dev_, y_size_, cudaMemcpyDeviceToHost));
+        std::cout << "\ty_ host array synced" << std::endl;
+
+
+        // Freeing memory
+        cudaCheckError(cudaFree(buffer_));
+        std::cout << "\tBuffer 1 freed" << std::endl;
+        buffer_size_ = 0;
+        break;
+      }
+      case gpuOffloadType::once: {
+        cusparseCheckError(
+                cusparseSpMV_bufferSize(handle_,
+                                        opA_,
+                                        &alpha,
+                                        descrA_,
+                                        descrx_,
+                                        &beta,
+                                        descry_,
+                                        cudaDataType_,
+                                        alg_,
+                                        &buffer_size_));
+        std::cout << "\tbufferSize run" << std::endl;
+
+        cudaCheckError(cudaMalloc(&buffer_, buffer_size_));
+        std::cout << "\tbuffer allocated" << std::endl;
+
+        // ToDo -- only preprocess once?
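
On the ToDo above: NVIDIA's documentation presents cusparseSpMV_preprocess() as a one-off setup step for a given matrix/vector/algorithm combination, so in this offload mode the bufferSize/malloc/preprocess triple could be hoisted out of the timed region, leaving only cusparseSpMV() in the loop. A minimal sketch under that assumption (the helper name setupSpMV and the double-precision types are illustrative). Note too that descry_ is created earlier with a NULL values pointer; unless it is bound to y_dev_ (for example with cusparseDnVecSetValues()) before the first SpMV call, that call will fault -- another candidate for the segfault being debugged here.

#include <cusparse_v2.h>
#include <cuda_runtime_api.h>

// Run the one-off SpMV setup and return the work buffer for reuse.
void* setupSpMV(cusparseHandle_t handle, cusparseSpMatDescr_t matA,
                cusparseDnVecDescr_t vecX, cusparseDnVecDescr_t vecY,
                const double* alpha, const double* beta) {
  size_t bufferSize = 0;
  void* buffer = nullptr;
  // Query the workspace size for this matrix/vector/algorithm combination.
  cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha,
                          matA, vecX, beta, vecY, CUDA_R_64F,
                          CUSPARSE_SPMV_CSR_ALG2, &bufferSize);
  cudaMalloc(&buffer, bufferSize);
  // Preprocess once; every later cusparseSpMV() call reuses this buffer.
  cusparseSpMV_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha,
                          matA, vecX, beta, vecY, CUDA_R_64F,
                          CUSPARSE_SPMV_CSR_ALG2, buffer);
  return buffer;
}
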
+        cusparseCheckError(
+                cusparseSpMV_preprocess(handle_,
+                                        opA_,
+                                        &alpha,
+                                        descrA_,
+                                        descrx_,
+                                        &beta,
+                                        descry_,
+                                        cudaDataType_,
+                                        alg_,
+                                        buffer_));
+        std::cout << "\tpreProcess run" << std::endl;
+        cusparseCheckError(
+                cusparseSpMV(handle_,
+                             opA_,
+                             &alpha,
+                             descrA_,
+                             descrx_,
+                             &beta,
+                             descry_,
+                             cudaDataType_,
+                             alg_,
+                             buffer_));
+        std::cout << "\tSpMV run" << std::endl;
+
+        // Freeing memory
+        cudaCheckError(cudaFree(buffer_));
+        std::cout << "\tBuffer 1 freed" << std::endl;
+        break;
+      }
+      case gpuOffloadType::unified: {
+        cusparseCheckError(cusparseSpMV_bufferSize(handle_,
+                                                   opA_,
+                                                   &alpha,
+                                                   descrA_,
+                                                   descrx_,
+                                                   &beta,
+                                                   descry_,
+                                                   cudaDataType_,
+                                                   alg_,
+                                                   &buffer_size_));
+        std::cout << "\tbufferSize run" << std::endl;
+
+        cudaCheckError(cudaMallocManaged((void**)&buffer_, buffer_size_));
+        std::cout << "\tbuffer allocated" << std::endl;
+
+        cusparseCheckError(cusparseSpMV_preprocess(handle_,
+                                                   opA_,
+                                                   &alpha,
+                                                   descrA_,
+                                                   descrx_,
+                                                   &beta,
+                                                   descry_,
+                                                   cudaDataType_,
+                                                   alg_,
+                                                   buffer_));
+        std::cout << "\tpreProcess run" << std::endl;
+
+        cusparseCheckError(cusparseSpMV(handle_,
+                                        opA_,
+                                        &alpha,
+                                        descrA_,
+                                        descrx_,
+                                        &beta,
+                                        descry_,
+                                        cudaDataType_,
+                                        alg_,
+                                        buffer_));
+        std::cout << "\tSpMV run" << std::endl;
+
+        // Freeing memory
+        cudaCheckError(cudaFree(buffer_));
+        buffer_size_ = 0;
+        break;
+      }
+    }
+  }
+
+  /** Perform any required steps after calling the GEMV kernel that should
+   * be timed. */
+  void postLoopRequirements() override {
+    std::cout << std::endl << "##############################" << std::endl
+              << "\tPostloop Requirements" << std::endl
+              << "##############################" << std::endl;
+    switch(offload_) {
+      case gpuOffloadType::always: {
+        break;
+      }
+      case gpuOffloadType::once: {
+        cudaCheckError(cudaMemcpy(A_val_, A_val_dev_, vals_size_,
+                                  cudaMemcpyDeviceToHost));
+        cudaCheckError(cudaMemcpy(A_col_, A_col_dev_, cols_size_,
+                                  cudaMemcpyDeviceToHost));
+        cudaCheckError(cudaMemcpy(A_row_, A_row_dev_, rows_size_,
+                                  cudaMemcpyDeviceToHost));
+        std::cout << "\tA_ csr host arrays synced" << std::endl;
+
+        cudaCheckError(cudaMemcpy(x_, x_dev_, x_size_, cudaMemcpyDeviceToHost));
+        std::cout << "\tx_ host array synced" << std::endl;
+
+        cudaCheckError(cudaMemcpy(y_, y_dev_, y_size_,
+                                  cudaMemcpyDeviceToHost));
+        std::cout << "\ty_ host array synced" << std::endl;
+
+        cusparseCheckError(cusparseDestroySpMat(descrA_));
+        cusparseCheckError(cusparseDestroyDnVec(descrx_));
+        cusparseCheckError(cusparseDestroyDnVec(descry_));
+        break;
+      }
+      case gpuOffloadType::unified: {
+        // Ensure all data resides on host once work has completed
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, vals_size_,
+                                            cudaCpuDeviceId, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, cols_size_,
+                                            cudaCpuDeviceId, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_row_, rows_size_,
+                                            cudaCpuDeviceId, s1_));
+        std::cout << "\tA_ csr arrays synced" << std::endl;
+
+        cudaCheckError(cudaMemPrefetchAsync(x_, x_size_, cudaCpuDeviceId, s2_));
+        std::cout << "\tx_ array synced" << std::endl;
+
+        cudaCheckError(cudaMemPrefetchAsync(y_, y_size_, cudaCpuDeviceId, s3_));
+        std::cout << "\ty_ array synced" << std::endl;
+
+
+        // Ensure device has finished all work.
+        cudaCheckError(cudaDeviceSynchronize());
+        std::cout << "\tdevice and host synced" << std::endl;
+
+        cusparseCheckError(cusparseDestroySpMat(descrA_));
+        cusparseCheckError(cusparseDestroyDnVec(descrx_));
+        cusparseCheckError(cusparseDestroyDnVec(descry_));
+        break;
+      }
+    }
+  }
+
+  /** Do any necessary cleanup (free pointers, close library handles, etc.)
+   * after Kernel has been called. */
+  void postCallKernelCleanup() override {
+
+    free(A_);
+    if (offload_ == gpuOffloadType::unified) {
+      cudaCheckError(cudaFree(A_val_));
+      cudaCheckError(cudaFree(A_col_));
+      cudaCheckError(cudaFree(A_row_));
+    } else {
+      free(A_val_);
+      free(A_col_);
+      free(A_row_);
+      cudaCheckError(cudaFree(A_val_dev_));
+      cudaCheckError(cudaFree(A_col_dev_));
+      cudaCheckError(cudaFree(A_row_dev_));
+    }
+
+    // Destroy the handle
+    cusparseCheckError(cusparseDestroy(handle_));
+
+    // Destroy streams after use
+    cudaCheckError(cudaStreamDestroy(s1_));
+    cudaCheckError(cudaStreamDestroy(s2_));
+    cudaCheckError(cudaStreamDestroy(s3_));
+  }
+
+
+  void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index,
+                 int* row_ptr) {
+    int nnz_encountered = 0;
+    for (int row = 0; row < n_row; row++) {
+      row_ptr[row] = nnz_encountered;
+      for (int col = 0; col < n_col; col++) {
+        if (dense[(row * n_) + col] != 0.0) {
+          col_index[nnz_encountered] = col;
+          vals[nnz_encountered] = dense[(row * n_) + col];
+          nnz_encountered++;
+        }
+      }
+    }
+    // The final row pointer must hold the total nnz count; without it the
+    // library reads an uninitialised offset.
+    row_ptr[n_row] = nnz_encountered;
+  };
+
+  // ToDo -- the two following functions are useful for debugging. I'm
+  // keeping them in for that purpose, though they are not used by the
+  // benchmark itself
+  void printDenseMatrix(T* M, int rows, int cols) {
+    for (int row = 0; row < rows; row++) {
+      std::cout << "| ";
+      for (int col = 0; col < cols; col++) {
+        std::cout << M[(row * cols) + col] << " | ";
+      }
+      std::cout << std::endl;
+    }
+  }
+
+  void printCSR(T* values, int* col_indices, int* row_pointers, int nnz,
+                int rows, int cols) {
+    std::cout << "\tRow pointers__" << std::endl;
+    for (int p = 0; p < (rows + 1); p++) {
+      std::cout << row_pointers[p] << ", ";
+    }
+    std::cout << std::endl << "\tColumn Indices__" << std::endl;
+    for (int i = 0; i < nnz; i++) {
+      std::cout << col_indices[i] << ", ";
+    }
+    std::cout << std::endl << "\tValues__" << std::endl;
+    for (int v = 0; v < nnz; v++) {
+      std::cout << values[v] << ", ";
+    }
+    std::cout << std::endl;
+  }
+
+  /**
+   * ################################
+   *          CUSPARSE STUFF
+   * ################################
+   */
+  /** Handle used when calling cuSPARSE. */
+  cusparseHandle_t handle_;
+
+  /** CUDA Streams - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s1_;
+  cudaStream_t s2_;
+  cudaStream_t s3_;
+
+  /** The ID of the target GPU Device. */
+  int gpuDevice_;
+
+  // Descriptors for matrix A and the dense vectors x and y
+  cusparseSpMatDescr_t descrA_;
+  cusparseDnVecDescr_t descrx_, descry_;
+
+  // Data type depends on kernel being run
+  cudaDataType_t cudaDataType_;
+
+  size_t buffer_size_ = 0;
+  void* buffer_ = NULL;
+
+  cusparseOperation_t opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cusparseOperation_t opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cusparseSpMVAlg_t alg_ = CUSPARSE_SPMV_CSR_ALG2;
+  cusparseIndexType_t rType_ = CUSPARSE_INDEX_32I;
+  cusparseIndexType_t cType_ = CUSPARSE_INDEX_32I;
+  cusparseIndexBase_t indType_ = CUSPARSE_INDEX_BASE_ZERO;
+
+  /** The constant value Alpha. */
+  const T alpha = ALPHA;
+
+  /** The constant value Beta.
*/ + const T beta = BETA; + + /** + * ################################ + * Matrix A parameters + * ################################ + */ + /** CSR format vectors on the host (also used for USM) */ + T* A_val_; + int *A_col_, *A_row_; + /** CSR format vectors on the device. */ + T* A_val_dev_; + int *A_col_dev_, *A_row_dev_; + /** Metadata */ + uint64_t A_nnz_, vals_size_, cols_size_, rows_size_; + + /** + * ################################ + * Vectors x and y parameters + * ################################ + */ + /** Vectors on the host (also used for USM) */ + T * x_host_, *y_host_; + /** Vectors on the device */ + T *x_dev_, *y_dev_; + /** Metadata */ + uint64_t x_size_, y_size_; +}; +} // namespace gpu +#endif \ No newline at end of file diff --git a/include/doGemm.hh b/include/doGemm.hh index 93cc058..23caa6f 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -292,14 +292,14 @@ class doGemm { callDenseKernels(csvFile, 32, dim, 32); } } - // Close file - csvFile.close(); #if CPU_ENABLED && GPU_ENABLED if (doCPU_ && doGPU_) { // Print offload results to stdout printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); } #endif + // Close file + csvFile.close(); } if (doSparse_) { // Square sparse matrix - sparse matrix multiplication cpuGpu_always_ = cpuGpu_offloadThreshold(); @@ -307,10 +307,8 @@ class doGemm { cpuGpu_unified_ = cpuGpu_offloadThreshold(); std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + "_sparse_square_99.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.99); - } + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.99); } // Close file csvFile.close(); @@ -325,10 +323,8 @@ class doGemm { cpuGpu_unified_ = cpuGpu_offloadThreshold(); csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + "_sparse_square_999.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.999); - } + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.999); } #if CPU_ENABLED && GPU_ENABLED if (doCPU_ && doGPU_) { @@ -341,10 +337,8 @@ class doGemm { cpuGpu_unified_ = cpuGpu_offloadThreshold(); csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + "_sparse_square_9999.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.9999); - } + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.9999); } #if CPU_ENABLED && GPU_ENABLED if (doCPU_ && doGPU_) { @@ -358,10 +352,8 @@ class doGemm { csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() + "_sparse_square_99999.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.99999); - } + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + callSparseKernels(csvFile, dim, 0.99999); } #if CPU_ENABLED && GPU_ENABLED if (doCPU_ && doGPU_) { diff --git a/include/doGemv.hh b/include/doGemv.hh index 2ab5fb1..0ecd814 100644 --- a/include/doGemv.hh +++ b/include/doGemv.hh @@ -8,6 +8,7 @@ #if defined CPU_ARMPL #include "../ArmPL/gemv.hh" +#include "../ArmPL/sp_gemv.hh" #elif defined CPU_ONEMKL #include "../oneMKL/CPU/gemv.hh" #elif defined CPU_AOCL @@ -20,6 +21,7 @@ #if defined GPU_CUBLAS #include "../cuBLAS/gemv.hh" +#include 
"../cuBLAS/sp_gemv.hh" #elif defined GPU_ONEMKL #include "../oneMKL/GPU/gemv.hh" #elif defined GPU_ROCBLAS @@ -45,11 +47,13 @@ class doGemv { doSparse_(doSparse) #if CPU_ENABLED , - gemvCpu_(iterations_) + gemvCpu_(iterations_), + spGemvCpu_(iterations_) #endif #if GPU_ENABLED , - gemvGpu_(iterations_) + gemvGpu_(iterations_), + spGemvGpu_(iterations_) #endif { static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) && @@ -72,125 +76,148 @@ class doGemv { initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = dim; - callKernels(csvFile, dim, dim); + callDenseKernels(csvFile, dim, dim); } // Close file csvFile.close(); - #if CPU_ENABLED && GPU_ENABLED +#if CPU_ENABLED && GPU_ENABLED if (doCPU_ && doGPU_) { // Print offload results to stdout printOffloadThreshold("Square x Vector (M=N)"); } - #endif - - // Rectangular Problem Sizes: - // Tall and thin x Vector - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_vector_M=16N.csv"); - int N = startDimention_; - int M = 16 * N; - while (M <= upperLimit_) { - callKernels(csvFile, M, N); - M += 16; - N++; - } - // Close file - csvFile.close(); +#endif + + // Rectangular Problem Sizes: + // Tall and thin x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_vector_M=16N.csv"); + int N = startDimention_; + int M = 16 * N; + while (M <= upperLimit_) { + callDenseKernels(csvFile, M, N); + M += 16; + N++; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Vector (M=16N)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Vector (M=16N)"); + } #endif - // Tall and thin x Vector - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_vector_M_N=32.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = 32; - callKernels(csvFile, dim, 32); + // Tall and thin x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile 
= initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_vector_M_N=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = 32; + callDenseKernels(csvFile, dim, 32); + } } - } - // Close file - csvFile.close(); + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Vector (M, N=32)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Vector (M, N=32)"); + } #endif - // Short and wide x Vector - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_short-wide_vector_N=16M.csv"); - M = startDimention_; - N = 16 * M; - while (N <= upperLimit_) { - callKernels(csvFile, M, N); - M++; - N += 16; - } - // Close file - csvFile.close(); + // Short and wide x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_vector_N=16M.csv"); + M = startDimention_; + N = 16 * M; + while (N <= upperLimit_) { + callDenseKernels(csvFile, M, N); + M++; + N += 16; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Short-and-Wide x Vector (N=16M)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Vector (N=16M)"); + } #endif - // Short and wide x Vector - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_short-wide_vector_M=32_N.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = 32, N = dim; - callKernels(csvFile, 32, dim); + // Short and wide x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_vector_M=32_N.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = 32, N = dim; + callDenseKernels(csvFile, 32, dim); + } } - } - // Close file - csvFile.close(); + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload 
results to stdout
-      printOffloadThreshold("Short-and-Wide x Vector (M=32, N)");
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Short-and-Wide x Vector (M=32, N)");
+      }
+#endif
     }
+    if (doSparse_) {
+      // Sparse square matrix
+      cpuGpu_always_ = cpuGpu_offloadThreshold();
+      cpuGpu_once_ = cpuGpu_offloadThreshold();
+      cpuGpu_unified_ = cpuGpu_offloadThreshold();
+      prev_gpuResult_always = time_checksum_gflop();
+      prev_gpuResult_once = time_checksum_gflop();
+      prev_gpuResult_unified = time_checksum_gflop();
+      std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                                          "_sparse_square_9999.csv");
+      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+        callSparseKernels(csvFile, dim, 0.9999);
+      }
+      // Close file
+      csvFile.close();
+#if CPU_ENABLED && GPU_ENABLED
+      if (doCPU_ && doGPU_) {
+        // Print offload results to stdout
+        printOffloadThreshold("Sparse Square (sparsity = 0.9999)");
+      }
 #endif
-  }
+    }
   }

 private:
  /** Call the appropriate CPU and GPU GEMV kernels. */
-  void callKernels(std::ofstream& csvFile, const int M, const int N) {
+  void callDenseKernels(std::ofstream& csvFile, const int M, const int N) {
    const double probSize = calcKib(M, N);
    const uint64_t flops = calcFlops(M, N);
    std::string kernelName = getKernelName();
@@ -275,6 +302,64 @@ class doGemv {
 #endif
  }

+  void callSparseKernels(std::ofstream& csvFile, const int N,
+                         const float sparsity) {
+    const double probSize = calcKib(N, N);
+    const uint64_t flops = calcFlops(N, N);
+    std::string kernelName = getKernelName();
+
+    time_checksum_gflop cpuResult;
+    time_checksum_gflop gpuResult_once;
+    time_checksum_gflop gpuResult_always;
+    time_checksum_gflop gpuResult_unified;
+
+#if CPU_ENABLED
+    if (doCPU_) {
+      spGemvCpu_.initialise(N, sparsity);
+      cpuResult = spGemvCpu_.compute();
+      cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
+      // Write result to CSV file
+      writeLineToCsv(csvFile, "cpu", kernelName, N, N, 0, probSize, sparsity,
+                     iterations_, cpuResult.runtime, cpuResult.gflops);
+    }
+#endif
+#if GPU_ENABLED
+    if (doGPU_) {
+      // - ONCE : Offload to/from GPU once before all iterations and once
+      // after
+      spGemvGpu_.initialise(gpuOffloadType::once, N, sparsity);
+      gpuResult_once = spGemvGpu_.compute();
+      gpuResult_once.gflops =
+          calcGflops(flops, iterations_, gpuResult_once.runtime);
+
+      // - ALWAYS: Offload to/from GPU every iteration
+      spGemvGpu_.initialise(gpuOffloadType::always, N, sparsity);
+      gpuResult_always = spGemvGpu_.compute();
+      gpuResult_always.gflops =
+          calcGflops(flops, iterations_, gpuResult_always.runtime);
+
+      // - UNIFIED : data passed from host to device (and device to host) as
+      // needed
+      spGemvGpu_.initialise(gpuOffloadType::unified, N, sparsity);
+      gpuResult_unified = spGemvGpu_.compute();
+      gpuResult_unified.gflops =
+          calcGflops(flops, iterations_, gpuResult_unified.runtime);
+
+      // Write results to CSV file
+      writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, 0, probSize,
+                     sparsity, iterations_, gpuResult_once.runtime,
+                     gpuResult_once.gflops);
+      writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, 0,
+                     probSize, sparsity, iterations_, gpuResult_always.runtime,
+                     gpuResult_always.gflops);
+      writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, 0, probSize,
+                     sparsity, iterations_, gpuResult_unified.runtime,
+                     gpuResult_unified.gflops);
+    }
+#endif
+  }
+
  /** Ensure all CPU and GPU checksums are within the permitted limit of
   * each other.
*/ void checkChecksums(time_checksum_gflop cpuResult, @@ -506,11 +591,13 @@ class doGemv { #if CPU_ENABLED /** The GEMV CPU kernel. */ cpu::gemv_cpu<T> gemvCpu_; + cpu::sp_gemv_cpu<T> spGemvCpu_; #endif #if GPU_ENABLED /** The GEMV GPU kernel. */ gpu::gemv_gpu<T> gemvGpu_; + gpu::sp_gemv_gpu<T> spGemvGpu_; #endif /** The point at which offloading to GPU (offload once) becomes worthwhile. */ diff --git a/include/kernels/CPU/sp_gemv.hh b/include/kernels/CPU/sp_gemv.hh index 0c84cb0..28b0caf 100644 --- a/include/kernels/CPU/sp_gemv.hh +++ b/include/kernels/CPU/sp_gemv.hh @@ -27,6 +27,11 @@ namespace cpu { n_ = n; sparsity_ = sparsity; + // Note that the below should be the same as the edges calculation + // used in the initInputMatricesSparse function. If changed here, + // change there + nnz_ = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity_)); + A_ = (T*)malloc(sizeof(T) * m_ * n_); x_ = (T*)malloc(sizeof(T) * n_); y_ = (T*)malloc(sizeof(T) * m_); @@ -35,6 +40,9 @@ namespace cpu { initInputMatrixVectorSparse(); } + protected: + uint64_t nnz_; + private: /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ @@ -43,5 +51,6 @@ namespace cpu { free(x_); free(y_); } + }; } // namespace cpu \ No newline at end of file From bc70814a714608e7f492d5e331150f8a68263ced Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Thu, 2 Jan 2025 13:11:51 +0000 Subject: [PATCH 34/38] Getting rid of old oneMKL sparse file --- oneMKL/CPU/sp_gemm.hh | 239 ------------------------------------------ 1 file changed, 239 deletions(-) delete mode 100644 oneMKL/CPU/sp_gemm.hh diff --git a/oneMKL/CPU/sp_gemm.hh b/oneMKL/CPU/sp_gemm.hh deleted file mode 100644 index 0b4e32b..0000000 --- a/oneMKL/CPU/sp_gemm.hh +++ /dev/null @@ -1,239 +0,0 @@ -#pragma once - -#ifdef CPU_ONEMKL -#include <mkl.h> - -#include <algorithm> - -#include "../../include/kernels/CPU/sp_gemm.hh" -#include "../../include/utilities.hh" - -namespace cpu { -/** A class for GEMM CPU BLAS kernels. */ -template <typename T> -class sp_gemm_cpu : public sp_gemm<T> { - public: - using sp_gemm<T>::sp_gemm; - using sp_gemm<T>::initInputMatricesSparse; - using sp_gemm<T>::toCSR; - using sp_gemm<T>::callConsume; - using sp_gemm<T>::n_; - using sp_gemm<T>::A_; - using sp_gemm<T>::B_; - using sp_gemm<T>::C_; - - /** Initialise the required data structures. 
*/ - void initialise(int n, float sparsity) { - A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64); - B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64); - C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64); - - n_ = n * 100; - nnz_ = (1 + (int)(n_ * n_ * (1 - sparsity))); - - values_A_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN); - columns_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN); - rowIndex_A_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN); - - values_B_ = (T*)mkl_malloc(sizeof(T) * nnz_, ALIGN); - columns_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, ALIGN); - rowIndex_B_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (n_ + 1), ALIGN); - - x_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); - y_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); - rslt_mv_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); - rslt_mv_trans_ = (T*)mkl_malloc(sizeof(T) * n_, ALIGN); - - // Initialise the matricies - initInputMatricesSparse(sparsity); - - descr_type_gen.type = SPARSE_MATRIX_TYPE_GENERAL; - - // Transfer from dense to CSR format - toCSR_mkl(A_, n_, n_, values_A_, columns_A_, rowIndex_A_); - toCSR_mkl(B_, n_, n_, values_B_, columns_B_, rowIndex_B_); - - // ToDo -- Set values for x and y (which are vectors of length n_?) - - if constexpr (std::is_same_v<T, float>) { - CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrA_, - SPARSE_INDEX_BASE_ZERO, n_, - n_, rowIndex_A_, - rowIndex_A_+1, columns_A_, - values_A_), - "Error after MKL_SPARSE_D_CREATE_CSR for csrA\n"); - CALL_AND_CHECK_STATUS(mkl_sparse_s_create_csr(&csrB_, - SPARSE_INDEX_BASE_ZERO, n_, - n_, rowIndex_B_, - rowIndex_B_+1, columns_B_, - values_B_), - "Error after MKL_SPARSE_D_CREATE_CSR for csrB\n"); - } else if constexpr (std::is_same_v<T, double>) { - CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrA_, - SPARSE_INDEX_BASE_ZERO, n_, - n_, rowIndex_A_, - rowIndex_A_+1, columns_A_, - values_A_), - "Error after MKL_SPARSE_D_CREATE_CSR for csrA\n"); - CALL_AND_CHECK_STATUS(mkl_sparse_d_create_csr(&csrB_, - SPARSE_INDEX_BASE_ZERO, n_, - n_, rowIndex_B_, - rowIndex_B_+1, columns_B_, - values_B_), - "Error after MKL_SPARSE_D_CREATE_CSR for csrB\n"); - } else { - std::cout << "ERROR - Datatype for OneMKL CPU spGEMM kernel not " - "supported." << std::endl; - exit(1) - }; - - CALL_AND_CHECK_STATUS(mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE, - csrA_, csrB_, &csrC_), - "Error after MKL_SPARSE_SPMM\n"); - - // ToDo -- check that transpose is what I want here - CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrA_, - SPARSE_OPERATION_TRANSPOSE, - descr_type_gen_, 1), - "Error after MKL_SPARSE_SET_MV_HINT with csrA_\n"); - CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrB_, - SPARSE_OPERATION_NON_TRANSPOSE, - descr_type_gen_, 1), - "Error after MKL_SPARSE_SET_MV_HINT with csrB_\n"); - CALL_AND_CHECK_STATUS(mkl_sparse_set_mv_hint(csrC_, - SPARSE_OPERATION_NON_TRANSPOSE, - descr_type_gen_, 1), - "Error after MKL_SPARSE_SET_MV_HINT with csrC_\n"); - - CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrA_), - "Error after MKL_SPARSE_OPTIMIZE with csrA_\n"); - CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrB_), - "Error after MKL_SPARSE_OPTIMIZE with csrB_\n"); - CALL_AND_CHECK_STATUS(mkl_sparse_optimize(csrC_), - "Error after MKL_SPARSE_OPTIMIZE with csrC_\n"); - } - - private: - /** Make call to the GEMM kernel. 
*/ - void callGemm() override { - if constexpr (std::is_same_v<T, float>) { - CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_NON_TRASPOSE, 1 - .0, csrC_, descr_type_gen_, x_, 0.0, rslt_mv_), - "Error after MKL_SPARSE_S_MV for csrC_ * x_\n"); - left_ = cblas_sdot(n_, rstl_mv_, 1, y_, 1); - - CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1 - .0, csrB_, descr_type_gen_, x, 0.0, trslt_mv_), - "Error adter MKL_SPARSE_S_MV for csrB_ * x_\n"); - CALL_AND_CHECK_STATUS(mkl_sparse_s_mv(SPARSE_OPERATION_TRANSPOSE, 1.0, - csrA_, descr_type_gen_, y_, 0.0, - rslt_mv_trans_), - "Error adter MKL_SPARSE_S_MV for csrA_ * y_\n"); - right_ = cblas_sdot(n_, rslt_mv_, 1, rslt_mv_trans_, 1); - - residual = fabs(left - right)/(fabs(left) + 1); - - CALL_AND_CHECK_STATUS(mkl_sparse_s_export_csr(csrC_, &indexing_, - &rows_, &cols_, - &pointerB_C_, - &pointerE_C_, - &columns_C_, &values_C_), - "Error after MKL_SPARSE_S_EXPORT_CSR\n"); - } else if constexpr (std::is_same_v<T, double) { - CALL_AND_CHECK_STATUS(mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRASPOSE, 1 - .0, csrC_, descr_type_gen_, x_, 0.0, rslt_mv_), - "Error after MKL_SPARSE_D_MV for csrC_ * x_\n"); - left_ = cblas_ddot(n_, rstl_mv_, 1, y_, 1); - - CALL_AND_CHECK_STATUS(mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1 - .0, csrB_, descr_type_gen_, x, 0.0, trslt_mv_), - "Error adter MKL_SPARSE_D_MV for csrB_ * x_\n"); - CALL_AND_CHECK_STATUS(mkl_sparse_d_mv(SPARSE_OPERATION_TRANSPOSE, 1.0, - csrA_, descr_type_gen_, y_, 0.0, - rslt_mv_trans_), - "Error adter MKL_SPARSE_D_MV for csrA_ * y_\n"); - right_ = cblas_ddot(n_, rslt_mv_, 1, rslt_mv_trans_, 1); - - residual = fabs(left - right)/(fabs(left) + 1); - - CALL_AND_CHECK_STATUS(mkl_sparse_d_export_csr(csrC_, &indexing_, - &rows_, &cols_, - &pointerB_C_, - &pointerE_C_, - &columns_C_, &values_C_), - "Error after MKL_SPARSE_D_EXPORT_CSR\n"); - } - - // Ensure compiler doesn't optimise away the work being done - callConsume(); - } - - /** Perform any required steps before calling the GEMM kernel that should - * be timed. */ - void preLoopRequirements() override {} - - /** Perform any required steps after calling the GEMM kernel that should - * be timed. */ - void postLoopRequirements() override {} - - /** Do any necessary cleanup (free pointers, close library handles, etc.) - * after Kernel has been called. */ - void postCallKernelCleanup() override { - if (mkl_sparse_destroy(csrC_) != SPARSE_STATUS_SUCCESS) { - printf(" Error after MKL_SPARSE_DESTROY, csrC_\n"); - fflush(0); - status = 1; - } - - //Deallocate arrays for which we allocate memory ourselves. - mkl_free(rslt_mv_trans_); - mkl_free(rslt_mv-); - mkl_free(x_); - mkl_free(y_); - - //Release matrix handle and deallocate arrays for which we allocate memory ourselves. 
- if (mkl_sparse_destroy(csrA_) != SPARSE_STATUS_SUCCESS) { - printf("Error after MKL_SPARSE_DESTROY, csrA_\n"); - fflush(0); - status = 1; - } - - mkl_free(values_A_); - mkl_free(columns_A_); - mkl_free(rowIndex_A_); - - if (mkl_sparse_destroy(csrB_) != SPARSE_STATUS_SUCCESS) { - printf("Error after MKL_SPARSE_DESTROY, csrB_\n"); - fflush(0); - status = 1; - } - - mkl_free(values_B_); - mkl_free(columns_B_); - mkl_free(rowIndex_B_); - } - - int nnz_; - - MKL_INT* columns_A_; - MKL_INT* columns_B_; - MKL_INT* columns_C_; - MKL_INT* rowIndex_A_; - MKL_INT* rowIndex_B_; - MKL_INT* pointerB_C_; - MKL_INT* pointerE_C_; - - T* rslt_mv_; - T* rslt_mv_trans_; - T* x_; - T* y_; - - T left_, right_, residual_; - MKL_INT rows_, cols_, i_, j_, ii_, status_; - - sparse_index_base_t indexing_; - struct matrix_descr descr_type_gen_; - sparse_matrix_t csrA_, csrB_, csrC_; -}; -} // namespace cpu -#endif \ No newline at end of file From 52d5e913fe3715a5da5cb1f32f1b5740fc55ce1b Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 7 Jan 2025 16:52:51 +0000 Subject: [PATCH 35/38] Refactoring to make individual files relate to a single kernel --- .idea/workspace.xml | 39 +- AOCL/sp_gemm.hh | 88 --- ArmPL/{sp_gemv.hh => spgemv.hh} | 0 ArmPL/{sp_gemm.hh => spmm.hh} | 327 ++++++----- cuBLAS/{sp_gemv.hh => spgemv.hh} | 0 cuBLAS/{sp_gemm.hh => spmm.hh} | 264 +++++---- include/doGemm.hh | 550 +++++++----------- include/doGemv.hh | 340 ++++------- include/doSpgemm.hh | 8 + include/doSpgemv.hh | 8 + include/doSpmm.hh | 445 ++++++++++++++ include/kernels/CPU/sp_gemm.hh | 108 ---- include/kernels/CPU/{sp_gemv.hh => spgemv.hh} | 0 include/kernels/CPU/spgmm.hh | 8 + include/kernels/CPU/spmm.hh | 60 ++ include/kernels/GPU/sp_gemm.hh | 28 - include/kernels/GPU/spgemm.hh | 8 + include/kernels/GPU/{sp_gemv.hh => spgemv.hh} | 0 include/kernels/GPU/spmm.hh | 28 + include/kernels/gemm.hh | 128 ---- include/kernels/spgemm.hh | 8 + include/kernels/spgemv.hh | 8 + include/kernels/spmm.hh | 168 ++++++ include/main.hh | 3 + src/main.cc | 192 +++--- 25 files changed, 1545 insertions(+), 1271 deletions(-) delete mode 100644 AOCL/sp_gemm.hh rename ArmPL/{sp_gemv.hh => spgemv.hh} (100%) rename ArmPL/{sp_gemm.hh => spmm.hh} (87%) rename cuBLAS/{sp_gemv.hh => spgemv.hh} (100%) rename cuBLAS/{sp_gemm.hh => spmm.hh} (80%) create mode 100644 include/doSpgemm.hh create mode 100644 include/doSpgemv.hh create mode 100644 include/doSpmm.hh delete mode 100644 include/kernels/CPU/sp_gemm.hh rename include/kernels/CPU/{sp_gemv.hh => spgemv.hh} (100%) create mode 100644 include/kernels/CPU/spgmm.hh create mode 100644 include/kernels/CPU/spmm.hh delete mode 100644 include/kernels/GPU/sp_gemm.hh create mode 100644 include/kernels/GPU/spgemm.hh rename include/kernels/GPU/{sp_gemv.hh => spgemv.hh} (100%) create mode 100644 include/kernels/GPU/spmm.hh create mode 100644 include/kernels/spgemm.hh create mode 100644 include/kernels/spgemv.hh create mode 100644 include/kernels/spmm.hh diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 9592790..84d08df 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,13 +15,30 @@ </configurations> </component> <component name="ChangeListManager"> - <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Beginning gemv ARMPL"> + <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Getting rid of old oneMKL sparse file"> + <change afterPath="$PROJECT_DIR$/include/doSpgemm.hh" 
afterDir="false" /> + <change afterPath="$PROJECT_DIR$/include/doSpgemv.hh" afterDir="false" /> + <change afterPath="$PROJECT_DIR$/include/doSpmm.hh" afterDir="false" /> + <change afterPath="$PROJECT_DIR$/include/kernels/CPU/spgmm.hh" afterDir="false" /> + <change afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" afterDir="false" /> + <change afterPath="$PROJECT_DIR$/include/kernels/spgemm.hh" afterDir="false" /> + <change afterPath="$PROJECT_DIR$/include/kernels/spgemv.hh" afterDir="false" /> + <change afterPath="$PROJECT_DIR$/include/kernels/spmm.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/AOCL/sp_gemm.hh" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spmm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spmm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spgemv.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/GPU/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spmm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/GPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/main.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/main.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/src/main.cc" beforeDir="false" afterPath="$PROJECT_DIR$/src/main.cc" afterDir="false" /> </list> <option name="SHOW_DIALOG" value="false" /> <option name="HIGHLIGHT_CONFLICTS" value="true" /> @@ -553,7 +570,15 @@ <option name="project" value="LOCAL" /> <updated>1729522244950</updated> </task> - <option name="localTasksCounter" value="47" /> + <task id="LOCAL-00047" summary="Getting rid of old oneMKL sparse file"> + <option name="closed" value="true" /> + <created>1735823512058</created> + <option name="number" value="00047" /> + <option 
name="presentableId" value="LOCAL-00047" /> + <option name="project" value="LOCAL" /> + <updated>1735823512058</updated> + </task> + <option name="localTasksCounter" value="48" /> <servers /> </component> <component name="TypeScriptGeneratedFilesManager"> @@ -571,7 +596,6 @@ </option> </component> <component name="VcsManagerConfiguration"> - <MESSAGE value="Trying to work out CSR malloc bug" /> <MESSAGE value="cuSPARSE unified memory implementation" /> <MESSAGE value="Now compiles" /> <MESSAGE value="Now compiles with fewer runtime errors" /> @@ -596,6 +620,7 @@ <MESSAGE value="Providing armpl with hints" /> <MESSAGE value="Updating createGflopsGraphs.py to show sparsity" /> <MESSAGE value="Beginning gemv ARMPL" /> - <option name="LAST_COMMIT_MESSAGE" value="Beginning gemv ARMPL" /> + <MESSAGE value="Getting rid of old oneMKL sparse file" /> + <option name="LAST_COMMIT_MESSAGE" value="Getting rid of old oneMKL sparse file" /> </component> </project> \ No newline at end of file diff --git a/AOCL/sp_gemm.hh b/AOCL/sp_gemm.hh deleted file mode 100644 index 4fc178b..0000000 --- a/AOCL/sp_gemm.hh +++ /dev/null @@ -1,88 +0,0 @@ -#pragma once - -#ifdef CPU_AOCL -#include <blis.h> - -#include "../include/kernels/CPU/gemm.hh" -#include "../include/utilities.hh" - -namespace cpu { -/** A class for GEMM CPU BLAS kernels. */ -template <typename T> -class gemm_cpu : public gemm<T> { - public: - using gemm<T>::gemm; - using gemm<T>::callConsume; - using gemm<T>::m_; - using gemm<T>::n_; - using gemm<T>::k_; - using gemm<T>::A_; - using gemm<T>::B_; - using gemm<T>::C_; - - private: - /** Make call to the GEMM kernel. */ - void callGemm() override { - if constexpr (std::is_same_v<T, float>) { - bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, - rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), - &beta, C_, rowStride, std::max(1, m_)); - } else if constexpr (std::is_same_v<T, double>) { - // Todo -- base? - aoclsparse_create_dscr(&A_csr_, base, n_, n_, nnz_, cst_row_ptr_A_.data - (), csr_col_ind_A_.data(), csr_val_A_.data()); - aoclsparse_create_dscr(&B_csr_, base, n_, n_, nnz_, cst_row_ptr_B_.data - (), csr_col_ind_B_.data(), csr_val_B_.data()); - - aoclsparse_spmm(aoclsparse_operation_none, A_csr_, B_csr_, &C_csr_); - aoclsparse_export_dcsr(C_csr_, &base, &C_M_, &C_N_, &nnz_C_, - &csr_row_ptr_C_, &csr_col_ind_C_, (void**) - &csr_val_C_); - } else { - // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for AOCL CPU GEMM kernel not supported." - << std::endl; - exit(1); - } - // Ensure compiler doesn't optimise away the work being done - callConsume(); - } - - /** Perform any required steps before calling the GEMM kernel that should - * be timed. */ - void preLoopRequirements() override {} - - /** Perform any required steps after calling the GEMM kernel that should - * be timed. */ - void postLoopRequirements() override {} - - /** The constant value Alpha. */ - T alpha = ALPHA; - - /** The constant value Beta. */ - T beta = BETA; - - /** The distance in elements to the next column. 
 */
-  const int rowStride = 1;
-
-  aoclsparse_matrix A_csr_;
-  aoclsparse_int* csr_row_ptr_A_;
-  aoclsparse_int* csr_col_ind_A_;
-  T* csr_val_A_;
-
-  aoclsparse_matrix B_csr_;
-  aoclsparse_int* csr_row_ptr_B_;
-  aoclsparse_int* csr_col_ind_B_;
-  T* csr_val_B_;
-
-  aoclsparse_matrix C_csr_;
-  aoclsparse_int* csr_row_ptr_C_;
-  aoclsparse_int* csr_col_ind_C_;
-  T* csr_val_C_;
-  aoclsparse_int C_M_;
-  aoclsparse_int C_N_;
-
-  aoclsparse_status status;
-};
-} // namespace cpu
-#endif
\ No newline at end of file
diff --git a/ArmPL/sp_gemv.hh b/ArmPL/spgemv.hh
similarity index 100%
rename from ArmPL/sp_gemv.hh
rename to ArmPL/spgemv.hh
diff --git a/ArmPL/sp_gemm.hh b/ArmPL/spmm.hh
similarity index 87%
rename from ArmPL/sp_gemm.hh
rename to ArmPL/spmm.hh
index e8e28a5..93ed4b5 100644
--- a/ArmPL/sp_gemm.hh
+++ b/ArmPL/spmm.hh
@@ -8,26 +8,177 @@

 #include <algorithm>

-#include "../include/kernels/CPU/sp_gemm.hh"
+#include "../include/kernels/CPU/spmm.hh"
 #include "../include/utilities.hh"

 namespace cpu {
 /** A class for GEMM CPU BLAS kernels. */
 template <typename T>
-class sp_gemm_cpu : public sp_gemm<T> {
+class spmm_cpu : public spmm<T> {
  public:
-  using sp_gemm<T>::sp_gemm;
-  using sp_gemm<T>::callConsume;
-  using sp_gemm<T>::m_;
-  using sp_gemm<T>::n_;
-  using sp_gemm<T>::k_;
-  using sp_gemm<T>::A_;
-  using sp_gemm<T>::B_;
-  using sp_gemm<T>::C_;
-  using sp_gemm<T>::nnz_;
-  using sp_gemm<T>::A_vals_;
-  using sp_gemm<T>::B_vals_;
-  using sp_gemm<T>::C_vals_;
+  using spmm<T>::spmm;
+  using spmm<T>::callConsume;
+  using spmm<T>::m_;
+  using spmm<T>::n_;
+  using spmm<T>::k_;
+  using spmm<T>::A_;
+  using spmm<T>::B_;
+  using spmm<T>::C_;
+  using spmm<T>::nnzA_;
+  using spmm<T>::nnzB_;
+
+ protected:
+  void toSparseFormat() override {
+
+    m_armpl_ = m_;
+    n_armpl_ = n_;
+    k_armpl_ = k_;
+    // ToDo -- check whether flags_ is correct!
+    flags_ = 0;
+
+    // Move A to CSR
+    A_armpl_row_ptr_ = new armpl_int_t[m_ + 1];
+    A_armpl_col_index_ = new armpl_int_t[nnzA_];
+    A_vals_ = new T[nnzA_];
+    A_armpl_row_ptr_[0] = 0;
+    int nnz_encountered = 0;
+
+    for (int row = 0; row < m_; row++) {
+      for (int col = 0; col < k_; col++) {
+        if (A_[(row * k_) + col] != 0.0) {
+          A_armpl_col_index_[nnz_encountered] = col;
+          A_vals_[nnz_encountered] = static_cast<T>(A_[(row * k_) + col]);
+          nnz_encountered++;
+        }
+      }
+      // The row pointer for row + 1 is only known once this row's
+      // non-zeros have been counted
+      A_armpl_row_ptr_[row + 1] = nnz_encountered;
+    }
+
+    // Move B to CSR
+    B_armpl_row_ptr_ = new armpl_int_t[k_ + 1];
+    B_armpl_col_index_ = new armpl_int_t[nnzB_];
+    B_vals_ = new T[nnzB_];
+    B_armpl_row_ptr_[0] = 0;
+
+    nnz_encountered = 0;
+    for (int row = 0; row < k_; row++) {
+      for (int col = 0; col < n_; col++) {
+        if (B_[(row * n_) + col] != 0.0) {
+          B_armpl_col_index_[nnz_encountered] = col;
+          B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]);
+          nnz_encountered++;
+        }
+      }
+      B_armpl_row_ptr_[row + 1] = nnz_encountered;
+    }
+
+    // Create an empty C; its row-pointer array is zero-initialised so that
+    // the descriptor below describes a valid all-zero m x n matrix
+    C_armpl_row_ptr_ = new armpl_int_t[m_ + 1]();
+    C_armpl_col_index_ = new armpl_int_t[0];
+    C_vals_ = new T[0];
+    // ToDo -- the below is commented out, but it may yet be needed:
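+    // Assumption (not verified against the Arm PL documentation): the
+    // armpl_spmat_create_csr_* call below accepts this empty C and the
+    // library manages the result's storage itself. If that turns out to be
+    // wrong, the dense-to-CSR walk for C commented out below would need to
+    // be restored.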
+// C_armpl_row_ptr_[0] = 0; +// +// nnz_encountered = 0; +// for (int row = 0; row < n_; row++) { +// C_armpl_row_ptr_[row + 1] = nnz_encountered; +// for (int col = 0; col < n_; col++) { +// if (B_[(row * n_) + col] != 0.0) { +// C_armpl_col_index_[nnz_encountered] = col; +// C_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]); +// nnz_encountered++; +// } +// } +// } + + if constexpr (std::is_same_v<T, float>) { +// printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&A_armpl_, + m_armpl_, + k_armpl_, + A_armpl_row_ptr_, + A_armpl_col_index_, + A_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&B_armpl_, + k_armpl_, + n_armpl_, + B_armpl_row_ptr_, + B_armpl_col_index_, + B_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&C_armpl_, + m_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v<T, double>) { +// printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, +// nnz_, flags_ + status_ = armpl_spmat_create_csr_d(&A_armpl_, + m_armpl_, + k_armpl_, + A_armpl_row_ptr_, + A_armpl_col_index_, + A_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_d(&B_armpl_, + k_armpl_, + n_armpl_, + B_armpl_row_ptr_, + B_armpl_col_index_, + B_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_d(&C_armpl_, + m_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// std::cout << "Okay, all matrices made!!" << std::endl; + } + } private: /** Make call to the GEMM kernel. */ @@ -213,152 +364,6 @@ class sp_gemm_cpu : public sp_gemm<T> { const T beta = BETA; void toCSR_armpl() { - n_armpl_ = n_; - // ToDo -- check whether flags_ is correct! 
- flags_ = 0; - - // Move A to CSR - A_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; - A_armpl_col_index_ = new armpl_int_t[nnz_]; - A_vals_ = new T[nnz_]; - A_armpl_row_ptr_[0] = 0; - int nnz_encountered = 0; - - for (int row = 0; row < n_; row++) { - A_armpl_row_ptr_[row + 1] = nnz_encountered; - for (int col = 0; col < n_; col++) { - if (A_[(row * n_) + col] != 0.0) { - A_armpl_col_index_[nnz_encountered] = col; - A_vals_[nnz_encountered] = static_cast<T>(A_[(row * n_) + col]); - nnz_encountered++; - } - } - } - - // Move B to CSR - B_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; - B_armpl_col_index_ = new armpl_int_t[nnz_]; - B_vals_ = new T[nnz_]; - B_armpl_row_ptr_[0] = 0; - - nnz_encountered = 0; - for (int row = 0; row < n_; row++) { - B_armpl_row_ptr_[row + 1] = nnz_encountered; - for (int col = 0; col < n_; col++) { - if (B_[(row * n_) + col] != 0.0) { - B_armpl_col_index_[nnz_encountered] = col; - B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]); - nnz_encountered++; - } - } - } - - // Move C to CSR - C_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; - C_armpl_col_index_ = new armpl_int_t[nnz_]; - C_vals_ = new T[nnz_]; - C_armpl_row_ptr_[0] = 0; - - nnz_encountered = 0; - for (int row = 0; row < n_; row++) { - C_armpl_row_ptr_[row + 1] = nnz_encountered; - for (int col = 0; col < n_; col++) { - if (B_[(row * n_) + col] != 0.0) { - C_armpl_col_index_[nnz_encountered] = col; - C_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]); - nnz_encountered++; - } - } - } - - if constexpr (std::is_same_v<T, float>) { -// printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, -// nnz_, flags_); - status_ = armpl_spmat_create_csr_s(&A_armpl_, - n_armpl_, - n_armpl_, - A_armpl_row_ptr_, - A_armpl_col_index_, - A_vals_, - flags_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - -// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, -// nnz_, flags_); - status_ = armpl_spmat_create_csr_s(&B_armpl_, - n_armpl_, - n_armpl_, - B_armpl_row_ptr_, - B_armpl_col_index_, - B_vals_, - flags_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - -// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, -// nnz_, flags_); - status_ = armpl_spmat_create_csr_s(&C_armpl_, - n_armpl_, - n_armpl_, - C_armpl_row_ptr_, - C_armpl_col_index_, - C_vals_, - flags_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - } else if constexpr (std::is_same_v<T, double>) { -// printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, -// nnz_, flags_ - status_ = armpl_spmat_create_csr_d(&A_armpl_, - n_armpl_, - n_armpl_, - A_armpl_row_ptr_, - A_armpl_col_index_, - A_vals_, - flags_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - -// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, -// nnz_, flags_); - status_ = armpl_spmat_create_csr_d(&B_armpl_, - n_armpl_, - n_armpl_, - B_armpl_row_ptr_, - B_armpl_col_index_, - B_vals_, - flags_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " << status_ << std::endl; - exit(1); - } - -// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, -// nnz_, flags_); - status_ = armpl_spmat_create_csr_d(&C_armpl_, - n_armpl_, - n_armpl_, - C_armpl_row_ptr_, - C_armpl_col_index_, - C_vals_, - flags_); - if (status_ != ARMPL_STATUS_SUCCESS) { - std::cout << "ERROR " 
<< status_ << std::endl;
-        exit(1);
-      }
-
-//      std::cout << "Okay, all matrices made!!" << std::endl;
-    }
-  }

  void printCSR(armpl_int_t n, armpl_int_t* rp, armpl_int_t* ci, T* v,
@@ -385,7 +390,9 @@ class sp_gemm_cpu : public sp_gemm<T> {

  armpl_int_t flags_;

+  armpl_int_t m_armpl_;
  armpl_int_t n_armpl_;
+  armpl_int_t k_armpl_;

  armpl_int_t* A_armpl_row_ptr_;
  armpl_int_t* A_armpl_col_index_;
diff --git a/cuBLAS/sp_gemv.hh b/cuBLAS/spgemv.hh
similarity index 100%
rename from cuBLAS/sp_gemv.hh
rename to cuBLAS/spgemv.hh
diff --git a/cuBLAS/sp_gemm.hh b/cuBLAS/spmm.hh
similarity index 80%
rename from cuBLAS/sp_gemm.hh
rename to cuBLAS/spmm.hh
index b5e8d93..071c8c1 100644
--- a/cuBLAS/sp_gemm.hh
+++ b/cuBLAS/spmm.hh
@@ -7,23 +7,24 @@
 #include <random>
 #include <iostream>

-#include "../include/kernels/GPU/sp_gemm.hh"
+#include "../include/kernels/GPU/spmm.hh"
 #include "../include/utilities.hh"
 #include "common.hh"

 namespace gpu {
 /** A class for sparse GEMM GPU BLAS kernels. */
 template <typename T>
-class sp_gemm_gpu : public sp_gemm<T> {
+class spmm_gpu : public spmm<T> {
  public:
-  using sp_gemm<T>::sp_gemm;
-  using sp_gemm<T>::initInputMatricesSparse;
-  using sp_gemm<T>::toCSR_int;
-  using sp_gemm<T>::n_;
-  using sp_gemm<T>::A_;
-  using sp_gemm<T>::B_;
-  using sp_gemm<T>::C_;
-  using sp_gemm<T>::offload_;
+  using spmm<T>::spmm;
+  using spmm<T>::initInputMatrices;
+  using spmm<T>::m_;
+  using spmm<T>::n_;
+  using spmm<T>::k_;
+  using spmm<T>::A_;
+  using spmm<T>::B_;
+  using spmm<T>::C_;
+  using spmm<T>::offload_;

  // ToDo -- No checksum for sparse yet. Need to do

@@ -34,7 +35,7 @@
   * - Always: Move data from host to device and device to host each iteration
   * - Unified: Initialise data as unified memory; no data movement semantics
   *   required */
-  void initialise(gpuOffloadType offload, int n, float sparsity) override {
+  void initialise(gpuOffloadType offload, int m, int n, int k,
+                  double sparsity) override {
    offload_ = offload;

    if (std::is_same_v<T, float>)
      cudaDataType_ = CUDA_R_32F;
    else if (std::is_same_v<T, double>)
      cudaDataType_ = CUDA_R_64F;
    else {
      std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
      exit(1);
    }
+    m_ = m;
    n_ = n;
+    k_ = k;
+
+    A_ = (T*)malloc(sizeof(T) * m_ * k_);
+    B_ = (T*)malloc(sizeof(T) * k_ * n_);
+    C_ = (T*)calloc(m_ * n_, sizeof(T));
+
+    /** Determine the number of nnz elements in A and B */
+    nnzA_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity));
+    nnzB_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity));
+
+    initInputMatrices(sparsity);

    // Get device identifier
    cudaCheckError(cudaGetDevice(&gpuDevice_));

    // Initialise 3 streams to asynchronously move data between host and
    // device
    cudaCheckError(cudaStreamCreate(&s1_));
    cudaCheckError(cudaStreamCreate(&s2_));
    cudaCheckError(cudaStreamCreate(&s3_));

-
-
-    // Work out number of edges needed to achieve target sparsity
-    A_nnz_ = B_nnz_ = 1 + (int) (n_ * n_ * (1 - sparsity));
-
    if (offload_ == gpuOffloadType::unified) {
-      cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * A_nnz_));
-      cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * A_nnz_));
-      cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (n_ + 1)));
+      cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * nnzA_));
+      cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * nnzA_));
+      cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (m_ + 1)));

-      cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * B_nnz_));
-      cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * B_nnz_));
-      cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (n_ + 1)));
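+      // Managed (unified) allocations are visible to both host and device;
+      // nnzA_/nnzB_ were derived above from the target sparsity, while C's
+      // values/columns stay unallocated until cuSPARSE reports the result's
+      // nnz count.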
+      cudaCheckError(cudaMallocManaged(&B_val_, sizeof(T) * nnzB_));
+      cudaCheckError(cudaMallocManaged(&B_col_, sizeof(int) * nnzB_));
+      cudaCheckError(cudaMallocManaged(&B_row_, sizeof(int) * (k_ + 1)));

      cudaCheckError(cudaMallocManaged(&C_row_, sizeof(int) * (n_ + 1)));
      C_val_ = NULL;
      C_col_ = NULL;
    } else {
-      A_val_ = (T*)malloc(sizeof(T) * A_nnz_);
-      A_col_ = (int*)malloc(sizeof(int) * A_nnz_);
-      A_row_ = (int*)malloc(sizeof(int) * (n_ + 1));
+      A_val_ = (T*)malloc(sizeof(T) * nnzA_);
+      A_col_ = (int*)malloc(sizeof(int) * nnzA_);
+      A_row_ = (int*)malloc(sizeof(int) * (m_ + 1));

-      B_val_ = (T*)malloc(sizeof(T) * B_nnz_);
-      B_col_ = (int*)malloc(sizeof(int) * B_nnz_);
-      B_row_ = (int*)malloc(sizeof(int) * (n_ + 1));
+      B_val_ = (T*)malloc(sizeof(T) * nnzB_);
+      B_col_ = (int*)malloc(sizeof(int) * nnzB_);
+      B_row_ = (int*)malloc(sizeof(int) * (k_ + 1));

      C_row_ = (int*)malloc(sizeof(int) * (n_ + 1));

-      cudaCheckError(cudaMalloc((void**)&A_val_dev_, sizeof(T) * A_nnz_));
-      cudaCheckError(cudaMalloc((void**)&A_col_dev_, sizeof(int) * A_nnz_));
-      cudaCheckError(cudaMalloc((void**)&A_row_dev_, sizeof(int) * (n_ + 1)));
+      cudaCheckError(cudaMalloc((void**)&A_val_dev_, sizeof(T) * nnzA_));
+      cudaCheckError(cudaMalloc((void**)&A_col_dev_, sizeof(int) * nnzA_));
+      cudaCheckError(cudaMalloc((void**)&A_row_dev_, sizeof(int) * (m_ + 1)));

-      cudaCheckError(cudaMalloc((void**)&B_val_dev_, sizeof(T) * B_nnz_));
-      cudaCheckError(cudaMalloc((void**)&B_col_dev_, sizeof(int) * B_nnz_));
-      cudaCheckError(cudaMalloc((void**)&B_row_dev_, sizeof(int) * (n_ + 1)));
+      cudaCheckError(cudaMalloc((void**)&B_val_dev_, sizeof(T) * nnzB_));
+      cudaCheckError(cudaMalloc((void**)&B_col_dev_, sizeof(int) * nnzB_));
+      cudaCheckError(cudaMalloc((void**)&B_row_dev_, sizeof(int) * (k_ + 1)));

      cudaCheckError(cudaMalloc((void**)&C_row_dev_, sizeof(int) * (n_ + 1)));
    }

@@ -97,22 +105,6 @@
    C_mem_allocated_always_ = false;
    C_mem_allocated_once_ = false;
    C_mem_allocated_unified_ = false;

-    // Initialise the host matricies
-    // cusparseSpGEMM() works on CSR format only. This helpfully makes our
-    // sparse matrix format decision for us!
-
-    // Initialise the matrices
-    // Set initial values to 0
-    A_ = (T*)malloc(sizeof(T) * n_ * n_);
-    B_ = (T*)malloc(sizeof(T) * n_ * n_);
-
-    initInputMatricesSparse(sparsity);
-
-    toCSR_int(A_, n_, n_, A_val_, A_col_, A_row_);
-
-    toCSR_int(B_, n_, n_, B_val_, B_col_, B_row_);
-
-
//    std::cout << "_____Matrix A_____" << std::endl;
//    printDenseMatrix(A_, n_, n_);
//    std::cout << std::endl << std::endl;
@@ -128,6 +120,41 @@
    cusparseCheckError(cusparseCreate(&handle_));
  }

+ protected:
+  void toSparseFormat() override {
+    // Load A into CSR
+    int nnz_encountered = 0;
+    for (int row = 0; row < m_; row++) {
+      A_row_[row] = nnz_encountered;
+      for (int col = 0; col < k_; col++) {
+        if (A_[(row * k_) + col] != 0.0) {
+          A_col_[nnz_encountered] = col;
+          A_val_[nnz_encountered] = A_[(row * k_) + col];
+          nnz_encountered++;
+        }
+      }
+    }
+    A_row_[m_] = nnz_encountered;
+
+    // Load B into CSR
+    nnz_encountered = 0;
+    for (int row = 0; row < k_; row++) {
+      B_row_[row] = nnz_encountered;
+      for (int col = 0; col < n_; col++) {
+        if (B_[(row * n_) + col] != 0.0) {
+          B_col_[nnz_encountered] = col;
+          B_val_[nnz_encountered] = B_[(row * n_) + col];
+          nnz_encountered++;
+        }
+      }
+    }
+    B_row_[k_] = nnz_encountered;
+  }
+
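+  // A minimal sanity check (hypothetical; not wired into the benchmark):
+  // CSR row pointers must be non-decreasing and terminate at the nnz count.
+  //   assert(A_row_[0] == 0 && A_row_[m_] == nnzA_);
+  //   for (int r = 0; r < m_; r++) assert(A_row_[r] <= A_row_[r + 1]);
+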
 private:
  /** Perform any required steps before calling the GEMM kernel that should
   * be timed. */
@@ -137,31 +164,31 @@ class sp_gemm_gpu : public sp_gemm<T> {
      case gpuOffloadType::always: {
        // Make matrix descriptors
        cusparseCheckError(
-            cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+            cusparseCreateCsr(&descrA_, m_, k_, nnzA_, A_row_dev_,
                              A_col_dev_, A_val_dev_, rType_, cType_,
                              indType_, cudaDataType_));
        cusparseCheckError(
-            cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
+            cusparseCreateCsr(&descrB_, k_, n_, nnzB_, B_row_dev_,
                              B_col_dev_, B_val_dev_, rType_, cType_,
                              indType_, cudaDataType_));
        cusparseCheckError(
-            cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
+            cusparseCreateCsr(&descrC_, n_, m_, 0, C_row_dev_, NULL, NULL,
                              rType_, cType_, indType_, cudaDataType_));
        break;
      }
      case gpuOffloadType::once: {
        cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) *
-                                       A_nnz_, cudaMemcpyHostToDevice, s1_));
+                                       nnzA_, cudaMemcpyHostToDevice, s1_));
        cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) *
-                                       A_nnz_, cudaMemcpyHostToDevice, s1_));
-        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_
+                                       nnzA_, cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (m_
                                       + 1), cudaMemcpyHostToDevice, s1_));

        cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) *
-                                       B_nnz_, cudaMemcpyHostToDevice, s2_));
+                                       nnzB_, cudaMemcpyHostToDevice, s2_));
        cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) *
-                                       B_nnz_, cudaMemcpyHostToDevice, s2_));
-        cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_
+                                       nnzB_, cudaMemcpyHostToDevice, s2_));
+        cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (k_
                                       + 1), cudaMemcpyHostToDevice, s2_));

        cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_
@@ -169,45 +196,45 @@
                                       + 1), cudaMemcpyHostToDevice, s3_));

        // Create matrix descriptors
        cusparseCheckError(
-            cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_,
+            cusparseCreateCsr(&descrA_, m_, k_, nnzA_, A_row_dev_,
                              A_col_dev_, A_val_dev_, rType_, cType_,
                              indType_, cudaDataType_));
        cusparseCheckError(
-            cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_,
+            cusparseCreateCsr(&descrB_, k_, n_, nnzB_, B_row_dev_,
                              B_col_dev_, B_val_dev_, rType_, cType_,
                              indType_, cudaDataType_));
        cusparseCheckError(
-            cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL,
+            cusparseCreateCsr(&descrC_, n_, m_, 0, C_row_dev_, NULL, NULL,
                              rType_, cType_, indType_, cudaDataType_));
        break;
      }
      case gpuOffloadType::unified: {
        // Prefetch memory to device
-        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * nnzA_,
                                            gpuDevice_, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * nnzA_,
                                            gpuDevice_, s1_));
-        cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1),
+        cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (m_ + 1),
                                            gpuDevice_, s1_));

-        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * nnzB_,
                                            gpuDevice_, s2_));
-        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_,
+        cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * nnzB_,
                                            gpuDevice_, s2_));
-        cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1),
+        cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (k_ + 1),
                                            gpuDevice_, s2_));

        // Make matrix descriptors
        cusparseCheckError(
-            cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_,
A_col_, + cusparseCreateCsr(&descrA_, m_, k_, nnzA_, A_row_, A_col_, A_val_, rType_, cType_, indType_, cudaDataType_)); cusparseCheckError( - cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_, B_col_, + cusparseCreateCsr(&descrB_, k_, n_, nnzB_, B_row_, B_col_, B_val_, rType_, cType_, indType_, cudaDataType_)); cusparseCheckError( - cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_, NULL, NULL, + cusparseCreateCsr(&descrC_, n_, m_, 0, C_row_, NULL, NULL, rType_, cType_, indType_, cudaDataType_)); break; } @@ -224,17 +251,17 @@ class sp_gemm_gpu : public sp_gemm<T> { cusparseCheckError(cusparseDestroySpMat(descrC_)); } cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, sizeof(T) * - A_nnz_, cudaMemcpyHostToDevice, s1_)); + nnzA_, cudaMemcpyHostToDevice, s1_)); cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_, sizeof(int) * - A_nnz_, cudaMemcpyHostToDevice, s1_)); - cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (n_ + nnzA_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_, sizeof(int) * (m_ + 1), cudaMemcpyHostToDevice, s1_)); cudaCheckError(cudaMemcpyAsync(B_val_dev_, B_val_, sizeof(T) * - B_nnz_, cudaMemcpyHostToDevice, s2_)); + nnzB_, cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(B_col_dev_, B_col_, sizeof(int) * - B_nnz_, cudaMemcpyHostToDevice, s2_)); - cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (n_ + nnzB_, cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(B_row_dev_, B_row_, sizeof(int) * (k_ + 1), cudaMemcpyHostToDevice, s2_)); cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_ @@ -243,15 +270,15 @@ class sp_gemm_gpu : public sp_gemm<T> { // Make matrix descriptors cusparseCheckError( - cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + cusparseCreateCsr(&descrA_, m_, k_, nnzA_, A_row_dev_, A_col_dev_, A_val_dev_, rType_, cType_, indType_, cudaDataType_)); cusparseCheckError( - cusparseCreateCsr(&descrB_, n_, n_, B_nnz_, B_row_dev_, + cusparseCreateCsr(&descrB_, k_, n_, nnzB_, B_row_dev_, B_col_dev_, B_val_dev_, rType_, cType_, indType_, cudaDataType_)); cusparseCheckError( - cusparseCreateCsr(&descrC_, n_, n_, 0, C_row_dev_, NULL, NULL, + cusparseCreateCsr(&descrC_, n_, m_, 0, C_row_dev_, NULL, NULL, rType_, cType_, indType_, cudaDataType_)); cusparseCheckError( @@ -282,14 +309,14 @@ class sp_gemm_gpu : public sp_gemm<T> { cusparseCheckError( cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, - &C_nnz_)); + &nnzC_)); if (C_mem_allocated_always_) { cudaCheckError(cudaFree(C_val_dev_)); cudaCheckError(cudaFree(C_col_dev_)); } - cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); - cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * nnzC_)); + cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * nnzC_)); cusparseCheckError( cusparseCsrSetPointers(descrC_, C_row_dev_, C_col_dev_, @@ -300,31 +327,31 @@ class sp_gemm_gpu : public sp_gemm<T> { alg_, spgemmDesc_)); cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) * - A_nnz_, cudaMemcpyDeviceToHost, s1_)); + nnzA_, cudaMemcpyDeviceToHost, s1_)); cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) * - A_nnz_, cudaMemcpyDeviceToHost, s1_)); + nnzA_, cudaMemcpyDeviceToHost, s1_)); cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) * - (n_ + 1), cudaMemcpyDeviceToHost, s1_)); + (m_ + 1), cudaMemcpyDeviceToHost, s1_)); cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) * - B_nnz_, 
cudaMemcpyDeviceToHost, s2_)); + nnzB_, cudaMemcpyDeviceToHost, s2_)); cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) * - B_nnz_, cudaMemcpyDeviceToHost, s2_)); + nnzB_, cudaMemcpyDeviceToHost, s2_)); cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * - (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + (k_ + 1), cudaMemcpyDeviceToHost, s2_)); if (C_mem_allocated_always_) { free(C_val_); free(C_col_); } - C_val_ = (T*)malloc(sizeof(T) * C_nnz_); - C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + C_val_ = (T*)malloc(sizeof(T) * nnzC_); + C_col_ = (int*)malloc(sizeof(int) * nnzC_); C_mem_allocated_always_ = true; cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * - C_nnz_, cudaMemcpyDeviceToHost, s3_)); + nnzC_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * - C_nnz_, cudaMemcpyDeviceToHost, s3_)); + nnzC_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaDeviceSynchronize()); @@ -364,14 +391,14 @@ class sp_gemm_gpu : public sp_gemm<T> { cusparseCheckError( cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, - &C_nnz_)); + &nnzC_)); if (C_mem_allocated_once_) { cudaCheckError(cudaFree(C_val_dev_)); cudaCheckError(cudaFree(C_col_dev_)); } - cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * C_nnz_)); - cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_val_dev_, sizeof(T) * nnzC_)); + cudaCheckError(cudaMalloc(&C_col_dev_, sizeof(int) * nnzC_)); C_mem_allocated_once_ = true; cusparseCheckError( @@ -417,15 +444,15 @@ class sp_gemm_gpu : public sp_gemm<T> { cusparseCheckError( cusparseSpMatGetSize(descrC_, &C_num_rows_, &C_num_cols_, - &C_nnz_)); + &nnzC_)); if (C_mem_allocated_unified_) { cudaCheckError(cudaFree(C_val_)); cudaCheckError(cudaFree(C_col_)); } - cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * C_nnz_)); - cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * C_nnz_)); + cudaCheckError(cudaMallocManaged(&C_val_, sizeof(T) * nnzC_)); + cudaCheckError(cudaMallocManaged(&C_col_, sizeof(int) * nnzC_)); C_mem_allocated_unified_ = true; cusparseCheckError( @@ -455,25 +482,25 @@ class sp_gemm_gpu : public sp_gemm<T> { } case gpuOffloadType::once: { cudaCheckError(cudaMemcpyAsync(A_val_, A_val_dev_, sizeof(T) * - A_nnz_, cudaMemcpyDeviceToHost, s1_)); + nnzA_, cudaMemcpyDeviceToHost, s1_)); cudaCheckError(cudaMemcpyAsync(A_col_, A_col_dev_, sizeof(int) * - A_nnz_, cudaMemcpyDeviceToHost, s1_)); + nnzA_, cudaMemcpyDeviceToHost, s1_)); cudaCheckError(cudaMemcpyAsync(A_row_, A_row_dev_, sizeof(int) * - (n_ + 1), cudaMemcpyDeviceToHost, s1_)); + (m_ + 1), cudaMemcpyDeviceToHost, s1_)); cudaCheckError(cudaMemcpyAsync(B_val_, B_val_dev_, sizeof(T) * - B_nnz_, cudaMemcpyDeviceToHost, s2_)); + nnzB_, cudaMemcpyDeviceToHost, s2_)); cudaCheckError(cudaMemcpyAsync(B_col_, B_col_dev_, sizeof(int) * - B_nnz_, cudaMemcpyDeviceToHost, s2_)); + nnzB_, cudaMemcpyDeviceToHost, s2_)); cudaCheckError(cudaMemcpyAsync(B_row_, B_row_dev_, sizeof(int) * - (n_ + 1), cudaMemcpyDeviceToHost, s2_)); + (k_ + 1), cudaMemcpyDeviceToHost, s2_)); - C_val_ = (T*)malloc(sizeof(T) * C_nnz_); - C_col_ = (int*)malloc(sizeof(int) * C_nnz_); + C_val_ = (T*)malloc(sizeof(T) * nnzC_); + C_col_ = (int*)malloc(sizeof(int) * nnzC_); cudaCheckError(cudaMemcpyAsync(C_val_, C_val_dev_, sizeof(T) * - C_nnz_, cudaMemcpyDeviceToHost, s3_)); + nnzC_, cudaMemcpyDeviceToHost, s3_)); 
cudaCheckError(cudaMemcpyAsync(C_col_, C_col_dev_, sizeof(int) * - C_nnz_, cudaMemcpyDeviceToHost, s3_)); + nnzC_, cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaMemcpyAsync(C_row_, C_row_dev_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost, s3_)); cudaCheckError(cudaDeviceSynchronize()); @@ -486,23 +513,23 @@ class sp_gemm_gpu : public sp_gemm<T> { } case gpuOffloadType::unified: { // Ensure all data resides on host once work has completed - cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * A_nnz_, + cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * nnzA_, cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * A_nnz_, + cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * nnzA_, cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (n_ + 1), + cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (m_ + 1), cudaCpuDeviceId, s1_)); - cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * B_nnz_, + cudaCheckError(cudaMemPrefetchAsync(B_val_, sizeof(T) * nnzB_, cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * B_nnz_, + cudaCheckError(cudaMemPrefetchAsync(B_col_, sizeof(int) * nnzB_, cudaCpuDeviceId, s2_)); - cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (n_ + 1), + cudaCheckError(cudaMemPrefetchAsync(B_row_, sizeof(int) * (k_ + 1), cudaCpuDeviceId, s2_)); -// cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * C_nnz_, +// cudaCheckError(cudaMemPrefetchAsync(C_val_, sizeof(T) * nnzC_, // cudaCpuDeviceId, s3_)); -// cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * C_nnz_, +// cudaCheckError(cudaMemPrefetchAsync(C_col_, sizeof(int) * nnzC_, // cudaCpuDeviceId, s3_)); cudaCheckError(cudaMemPrefetchAsync(C_row_, sizeof(int) * (n_ + 1), cudaCpuDeviceId, s3_)); @@ -618,21 +645,18 @@ class sp_gemm_gpu : public sp_gemm<T> { int* A_row_; int64_t A_num_rows_; int64_t A_num_cols_; - int64_t A_nnz_; T* B_val_; int* B_col_; int* B_row_; int64_t B_num_rows_; int64_t B_num_cols_; - int64_t B_nnz_; T* C_val_ = NULL; int* C_col_ = NULL; int* C_row_; int64_t C_num_rows_; int64_t C_num_cols_; - int64_t C_nnz_; /** CSR format vectors for matrices A, B and C on the device. 
*/ T* A_val_dev_; diff --git a/include/doGemm.hh b/include/doGemm.hh index 23caa6f..6a0de59 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -8,7 +8,6 @@ #if defined CPU_ARMPL #include "../ArmPL/gemm.hh" -#include "../ArmPL/sp_gemm.hh" #elif defined CPU_ONEMKL #include "../oneMKL/CPU/gemm.hh" #elif defined CPU_AOCL @@ -21,7 +20,6 @@ #if defined GPU_CUBLAS #include "../cuBLAS/gemm.hh" -#include "../cuBLAS/sp_gemm.hh" #elif defined GPU_ONEMKL #include "../oneMKL/GPU/gemm.hh" #elif defined GPU_ROCBLAS @@ -35,25 +33,20 @@ class doGemm { public: doGemm(const std::string csvDir, const int iters, const int startDim, const int upperLimit, const bool cpuEnabled = true, - const bool gpuEnabled = true, const bool doDense = true, - const bool doSparse = true) + const bool gpuEnabled = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), doCPU_(cpuEnabled), - doGPU_(gpuEnabled), - doDense_(doDense), - doSparse_(doSparse) + doGPU_(gpuEnabled) #if CPU_ENABLED , - gemmCpu_(iterations_), - spGemmCpu_(iterations_) + cpu_(iterations_) #endif #if GPU_ENABLED , - gemmGpu_(iterations_), - spGemmGpu_(iterations_) + gpu_(iterations_) #endif { static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) && @@ -65,309 +58,247 @@ class doGemm { void collectData() { // ToDo -- I've hard coded false here as kernel selection was not working // . Needs to be fixed - if (doDense_) { - // Square Problem Sizes... - // Re-initialise offload threshold structures - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_square_square_M=N=K.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = dim, K = dim; - callDenseKernels(csvFile, dim, dim, dim); - } - // Close file - csvFile.close(); -#if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Square (M=N=K)"); - } -#endif - // Rectangular Problem Sizes: - // Tall and thin x Short and wide - // Re-initialise offload threshold structures & previous results + // Square Problem Sizes... 
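// The shape families below fall into two patterns: sweeps where every
// dimension grows together (e.g. the square M=N=K case), and sweeps that
// hold a 16:1 aspect ratio by growing one dimension sixteen times faster
// than the others. As a sketch of the latter, assuming startDimention_ = 1
// and the default upperLimit_ = 128, the tall-thin (M=N, M=16K) family
// visits:
//
//   int K = 1, M = 16, N = 16;
//   while (M <= 128) {     // (M,N,K) = (16,16,1), (32,32,2), ..., (128,128,8)
//     callKernels(csvFile, M, N, K);
//     M += 16;
//     N += 16;
//     K++;
//   }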
+ // Re-initialise offload threshold structures cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); prev_gpuResult_always = time_checksum_gflop(); prev_gpuResult_once = time_checksum_gflop(); prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_short-wide_M=N_M=16K.csv"); - int K = startDimention_; - int M = 16 * K; - int N = 16 * K; - while (M <= upperLimit_) { - callDenseKernels(csvFile, M, N, K); - M += 16; - N += 16; - K++; + std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_square_M=N=K.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = dim, K = dim; + callKernels(csvFile, dim, dim, dim); } // Close file csvFile.close(); #if CPU_ENABLED && GPU_ENABLED if (doCPU_ && doGPU_) { // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)"); + printOffloadThreshold("Square x Square (M=N=K)"); } #endif - // Tall and thin x Short and wide - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_short-wide_M=N_K=32.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = dim, K = 32; - callDenseKernels(csvFile, dim, dim, 32); - } - } - // Close file - csvFile.close(); + // Rectangular Problem Sizes: + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_M=16K.csv"); + int K = startDimention_; + int M = 16 * K; + int N = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16; + N += 16; + K++; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)"); + } #endif - // Short and wide x Tall and thin - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_short-wide_tall-thin_M=N_K=16M.csv"); - M = startDimention_; - N = startDimention_; - K = 16 * M; - while (K <= upperLimit_) { - callDenseKernels(csvFile, M, N, K); - M++; - N++; - K += 16; + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = 
cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_K=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = dim, K = 32; + callKernels(csvFile, dim, dim, 32); } - // Close file - csvFile.close(); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)"); + } #endif - // Short and wide x Tall and thin - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_short-wide_tall-thin_M=N=32_K.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = 32, N = 32, K = dim; - callDenseKernels(csvFile, 32, 32, dim); - } - } - // Close file - csvFile.close(); + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N_K=16M.csv"); + M = startDimention_; + N = startDimention_; + K = 16 * M; + while (K <= upperLimit_) { + callKernels(csvFile, M, N, K); + M++; + N++; + K += 16; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)"); + } #endif - // Tall and Thin x Square - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_square_K=N_M=16K.csv"); - K = startDimention_; - N = startDimention_; - M = 16 * K; - while (M <= upperLimit_) { - callDenseKernels(csvFile, M, N, K); - M += 16; - N++; - K++; + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + 
prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N=32_K.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = 32, N = 32, K = dim; + callKernels(csvFile, 32, 32, dim); } - // Close file - csvFile.close(); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)"); + } #endif - // Tall and Thin x Square - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_square_K=N=32_M.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = 32, K = 32; - callDenseKernels(csvFile, dim, 32, 32); - } - } - // Close file - csvFile.close(); + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N_M=16K.csv"); + K = startDimention_; + N = startDimention_; + M = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16; + N++; + K++; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)"); + } #endif - // Square x Short and Wide - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_square_short-wide_M=K_N=16K.csv"); - M = startDimention_; - K = startDimention_; - N = 16 * K; - while (N <= upperLimit_) { - callDenseKernels(csvFile, M, N, K); - M++; - N += 16; - K++; + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N=32_M.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = 32, K = 32; + 
callKernels(csvFile, dim, 32, 32); } - // Close file - csvFile.close(); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)"); + } #endif - // Square x Short and Wide - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_square_short-wide_M=K=32_N.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = 32, N = dim, K = 32; - callDenseKernels(csvFile, 32, dim, 32); - } - } + + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K_N=16K.csv"); + M = startDimention_; + K = startDimention_; + N = 16 * K; + while (N <= upperLimit_) { + callKernels(csvFile, M, N, K); + M++; + N += 16; + K++; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); + } #endif - // Close file - csvFile.close(); + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K=32_N.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = 32, N = dim, K = 32; + callKernels(csvFile, 32, dim, 32); } - if (doSparse_) { // Square sparse matrix - sparse matrix multiplication - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" + - getKernelName() + "_sparse_square_99.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.99); - } - // Close file - csvFile.close(); -#if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Sparse Square 0.99"); - } -#endif - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - csvFile = initCSVFile(std::string(CSV_DIR) + "/" + - getKernelName() + "_sparse_square_999.csv"); - for 
(int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.999); - } -#if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Sparse Square 0.999"); - } -#endif - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - csvFile = initCSVFile(std::string(CSV_DIR) + "/" + - getKernelName() + "_sparse_square_9999.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.9999); - } -#if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Sparse Square 0.9999"); - } -#endif - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - csvFile = initCSVFile(std::string(CSV_DIR) + "/" + - getKernelName() + - "_sparse_square_99999.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.99999); - } + } #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Sparse Square 0.99999"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); + } #endif - } + // Close file + csvFile.close(); } private: /** Call the appropriate CPU and GPU GEMM kernels. */ - void callDenseKernels(std::ofstream& csvFile, const int M, const int N, - const int K) { + void callKernels(std::ofstream& csvFile, const int M, const int N, + const int K) { const double probSize = calcKib(M, N, K); const uint64_t flops = calcFlops(M, N, K); std::string kernelName = getKernelName(); @@ -380,8 +311,8 @@ class doGemm { // Perform CPU kernel #if CPU_ENABLED if (doCPU_) { - gemmCpu_.initialise(M, N, K); - cpuResult = gemmCpu_.compute(); + cpu_.initialise(M, N, K); + cpuResult = cpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, @@ -394,21 +325,21 @@ class doGemm { if (doGPU_) { // - ONCE : Offload to/from GPU once before all iterations and once // after - gemmGpu_.initialise(gpuOffloadType::once, M, N, K); - gpuResult_once = gemmGpu_.compute(); + gpu_.initialise(gpuOffloadType::once, M, N, K); + gpuResult_once = gpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // - ALWAYS: Offload to/from GPU every iteration - gemmGpu_.initialise(gpuOffloadType::always, M, N, K); - gpuResult_always = gemmGpu_.compute(); + gpu_.initialise(gpuOffloadType::always, M, N, K); + gpuResult_always = gpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); // - UNIFIED : data passed from host to device (and device to host) as // needed - gemmGpu_.initialise(gpuOffloadType::unified, M, N, K); - gpuResult_unified = gemmGpu_.compute(); + gpu_.initialise(gpuOffloadType::unified, M, N, K); + gpuResult_unified = gpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); @@ -559,61 +490,6 @@ class doGemm { } } - void callSparseKernels(std::ofstream& csvFile, const int N, const float - sparsity) { - const double probSize = calcKib(N, N, N); - const uint64_t flops = calcFlops(N, N, N); - std::string kernelName = getKernelName(); - -#if CPU_ENABLED - if (doCPU_) { - 
spGemmCpu_.initialise(N, sparsity); - time_checksum_gflop cpuResult = spGemmCpu_.compute(); - cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, - sparsity, iterations_, cpuResult.runtime, - cpuResult.gflops); - } -#endif -#if GPU_ENABLED - // Perform the GPU kernels - // - UNIFIED : data passed from host to device (and device to host) as - // needed - if (doGPU_) { - spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity); - time_checksum_gflop gpuResult_unified = spGemmGpu_.compute(); - gpuResult_unified.gflops = - calcGflops(flops, iterations_, gpuResult_unified.runtime); - - // - ALWAYS: Offload to/from GPU every iteration - spGemmGpu_.initialise(gpuOffloadType::always, N, sparsity); - time_checksum_gflop gpuResult_always = spGemmGpu_.compute(); - gpuResult_always.gflops = - calcGflops(flops, iterations_, gpuResult_always.runtime); - // - ONCE : Offload to/from GPU once before all iterations and once - // after - spGemmGpu_.initialise(gpuOffloadType::once, N, sparsity); - time_checksum_gflop gpuResult_once = spGemmGpu_.compute(); - gpuResult_once.gflops = - calcGflops(flops, iterations_, gpuResult_once.runtime); - // ToDo -- non-default GPU operations - - // Write lines to CSV file - writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, - sparsity, iterations_, gpuResult_once.runtime, - gpuResult_once.gflops); - writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, - sparsity, iterations_, gpuResult_always.runtime, - gpuResult_always.gflops); - writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, - sparsity, iterations_, gpuResult_unified.runtime, - gpuResult_unified.gflops); - - } -#endif - - } - /** A function for calculating FLOPs performed by a GEMM. * C = alpha*AB + beta*C */ constexpr uint64_t calcFlops(const int M, const int N, const int K) const { @@ -744,20 +620,14 @@ class doGemm { /** Whether the GPU kernels should be run. */ const bool doGPU_ = true; - /** Whether we should run dense and or sparse kernels */ - const bool doDense_; - const bool doSparse_; - #if CPU_ENABLED /** The GEMM CPU kernel. */ - cpu::gemm_cpu<T> gemmCpu_; - cpu::sp_gemm_cpu<T> spGemmCpu_; + cpu::gemm_cpu<T> cpu_; #endif #if GPU_ENABLED /** The GEMM GPU kernel. */ - gpu::gemm_gpu<T> gemmGpu_; - gpu::sp_gemm_gpu<T> spGemmGpu_; + gpu::gemm_gpu<T> gpu_; #endif /** The point at which offloading to GPU (offload once) becomes worthwhile. 
*/ diff --git a/include/doGemv.hh b/include/doGemv.hh index 0ecd814..ebc9262 100644 --- a/include/doGemv.hh +++ b/include/doGemv.hh @@ -8,7 +8,6 @@ #if defined CPU_ARMPL #include "../ArmPL/gemv.hh" -#include "../ArmPL/sp_gemv.hh" #elif defined CPU_ONEMKL #include "../oneMKL/CPU/gemv.hh" #elif defined CPU_AOCL @@ -21,7 +20,6 @@ #if defined GPU_CUBLAS #include "../cuBLAS/gemv.hh" -#include "../cuBLAS/sp_gemv.hh" #elif defined GPU_ONEMKL #include "../oneMKL/GPU/gemv.hh" #elif defined GPU_ROCBLAS @@ -35,25 +33,20 @@ class doGemv { public: doGemv(const std::string csvDir, const int iters, const int startDim, const int upperLimit, const bool cpuEnabled = true, - const bool gpuEnabled = true, const bool doDense = true, const bool - doSparse = true) + const bool gpuEnabled = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), doCPU_(cpuEnabled), - doGPU_(gpuEnabled), - doDense_(doDense), - doSparse_(doSparse) + doGPU_(gpuEnabled) #if CPU_ENABLED , - gemvCpu_(iterations_), - spGemvCpu_(iterations_) + cpu_(iterations_) #endif #if GPU_ENABLED , - gemvGpu_(iterations_), - spGemvGpu_(iterations_) + gpu_(iterations_) #endif { static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) && @@ -63,156 +56,131 @@ class doGemv { /** Run all problem types and write data to CSV files. */ void collectData() { - if (doDense_) { - // Square Problem Sizes... - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - std::ofstream csvFile = - initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = dim; - callDenseKernels(csvFile, dim, dim); - } - // Close file - csvFile.close(); + // Square Problem Sizes... 
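// For each problem size, the GEMV sweeps below time the CPU kernel and then
// the same three GPU data-movement strategies used throughout this benchmark.
// A sketch of the calls callDenseKernels() makes further down this file, with
// M and N standing in for one concrete problem size:
//
//   cpu_.initialise(M, N);                           // CPU baseline
//   cpuResult = cpu_.compute();
//   gpu_.initialise(gpuOffloadType::once, M, N);     // H2D before and D2H
//   gpuResult_once = gpu_.compute();                 // after the whole loop
//   gpu_.initialise(gpuOffloadType::always, M, N);   // transfer every iteration
//   gpuResult_always = gpu_.compute();
//   gpu_.initialise(gpuOffloadType::unified, M, N);  // unified memory; pages
//   gpuResult_unified = gpu_.compute();              // migrate on demand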
+ // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = + initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = dim; + callDenseKernels(csvFile, dim, dim); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Vector (M=N)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Vector (M=N)"); + } #endif - // Rectangular Problem Sizes: - // Tall and thin x Vector - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_vector_M=16N.csv"); - int N = startDimention_; - int M = 16 * N; - while (M <= upperLimit_) { - callDenseKernels(csvFile, M, N); - M += 16; - N++; - } - // Close file - csvFile.close(); + // Rectangular Problem Sizes: + // Tall and thin x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_vector_M=16N.csv"); + int N = startDimention_; + int M = 16 * N; + while (M <= upperLimit_) { + callDenseKernels(csvFile, M, N); + M += 16; + N++; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Vector (M=16N)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Vector (M=16N)"); + } #endif - // Tall and thin x Vector - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_vector_M_N=32.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = 32; - callDenseKernels(csvFile, dim, 32); - } + // Tall and thin x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" 
+ getKernelName() + + "_tall-thin_vector_M_N=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = 32; + callDenseKernels(csvFile, dim, 32); } - // Close file - csvFile.close(); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Vector (M, N=32)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Vector (M, N=32)"); + } #endif - // Short and wide x Vector - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_short-wide_vector_N=16M.csv"); - M = startDimention_; - N = 16 * M; - while (N <= upperLimit_) { - callDenseKernels(csvFile, M, N); - M++; - N += 16; - } - // Close file - csvFile.close(); + // Short and wide x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_vector_N=16M.csv"); + M = startDimention_; + N = 16 * M; + while (N <= upperLimit_) { + callDenseKernels(csvFile, M, N); + M++; + N += 16; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Short-and-Wide x Vector (N=16M)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Vector (N=16M)"); + } #endif - // Short and wide x Vector - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_short-wide_vector_M=32_N.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = 32, N = dim; - callDenseKernels(csvFile, 32, dim); - } - } - // Close file - csvFile.close(); -#if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Short-and-Wide x Vector (M=32, N)"); - } -#endif - } - if (doSparse_) { - // Sparse square matrix - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_sparse_square_9999.csv"); + // Short and wide x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = 
cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_vector_M=32_N.csv"); + if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { - callSparseKernels(csvFile, dim, 0.9999); + // M = 32, N = dim; + callDenseKernels(csvFile, 32, dim); } - // Close filex1 - csvFile.close(); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Sparse square // sparsity = 0.9999"); - } -#endif - csvFile.close(); + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Vector (M=32, N)"); } +#endif } private: @@ -230,8 +198,8 @@ class doGemv { // Perform CPU kernel #if CPU_ENABLED if (doCPU_) { - gemvCpu_.initialise(M, N); - cpuResult = gemvCpu_.compute(); + cpu_.initialise(M, N); + cpuResult = cpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, 0.0, @@ -244,21 +212,21 @@ class doGemv { if (doGPU_) { // - ONCE : Offload to/from GPU once before all iterations and once // after - gemvGpu_.initialise(gpuOffloadType::once, M, N); - gpuResult_once = gemvGpu_.compute(); + gpu_.initialise(gpuOffloadType::once, M, N); + gpuResult_once = gpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // - ALWAYS: Offload to/from GPU every iteration - gemvGpu_.initialise(gpuOffloadType::always, M, N); - gpuResult_always = gemvGpu_.compute(); + gpu_.initialise(gpuOffloadType::always, M, N); + gpuResult_always = gpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); // - UNIFIED : data passed from host to device (and device to host) as // needed - gemvGpu_.initialise(gpuOffloadType::unified, M, N); - gpuResult_unified = gemvGpu_.compute(); + gpu_.initialise(gpuOffloadType::unified, M, N); + gpuResult_unified = gpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); @@ -302,64 +270,6 @@ class doGemv { #endif } - void callSparseKernels(std::ofstream& csvFile, const int N, const float - sparsity) { - const double probSize = calcKib(N, N); - const uint64_t flops = calcFlops(N, N); - std::string kernelName = getKernelName(); - - time_checksum_gflop cpuResult; - time_checksum_gflop gpuResult_once; - time_checksum_gflop gpuResult_always; - time_checksum_gflop gpuResult_unified; - -#if CPU_ENABLED - if (doCPU_) { - spGemvCpu_.initialise(N, sparsity); - time_checksum_gflop cpuResult = spGemvCpu_.compute(); - cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); - // Write result to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, N, N, 0, probSize, sparsity, - iterations_, cpuResult.runtime, cpuResult.gflops); - } -#endif -#if GPU_ENABLED - - if (doGPU_) { - // - ONCE : Offload to/from GPU once before all iterations and once - // after - spGemvGpu_.initialise(gpuOffloadType::once, N, sparsity); - gpuResult_once = spGemvGpu_.compute(); - gpuResult_once.gflops = - calcGflops(flops, iterations_, gpuResult_once.runtime); - - // - ALWAYS: Offload to/from GPU every iteration - spGemvGpu_.initialise(gpuOffloadType::always, N, sparsity); - gpuResult_always = 
spGemvGpu_.compute(); - gpuResult_always.gflops = - calcGflops(flops, iterations_, gpuResult_always.runtime); - - // - UNIFIED : data passed from host to device (and device to host) as - // needed - spGemvGpu_.initialise(gpuOffloadType::unified, N, sparsity); - gpuResult_unified = spGemvGpu_.compute(); - gpuResult_unified.gflops = - calcGflops(flops, iterations_, gpuResult_unified.runtime); - - // Write results to CSV file - writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, 0, probSize, - sparsity, iterations_, gpuResult_once.runtime, - gpuResult_once.gflops); - writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, 0, - probSize, sparsity, iterations_, gpuResult_always.runtime, - gpuResult_always.gflops); - writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, 0, probSize, - sparsity, iterations_, gpuResult_unified.runtime, - gpuResult_unified.gflops); - } -#endif - } - /** Ensure all CPU and GPU checksums are within the permitted limit of * eachother. */ void checkChecksums(time_checksum_gflop cpuResult, @@ -584,20 +494,14 @@ class doGemv { /** Whether the GPU kernels should be run. */ const bool doGPU_ = true; - /** Whether sparse and or dense kernels should be run. */ - const bool doDense_; - const bool doSparse_; - #if CPU_ENABLED /** The GEMV CPU kernel. */ - cpu::gemv_cpu<T> gemvCpu_; - cpu::sp_gemv_cpu<T> spGemvCpu_; + cpu::gemv_cpu<T> cpu_; #endif #if GPU_ENABLED /** The GEMV GPU kernel. */ - gpu::gemv_gpu<T> gemvGpu_; - gpu::sp_gemv_gpu<T> spGemvGpu_; + gpu::gemv_gpu<T> gpu_; #endif /** The point at which offloading to GPU (offload once) becomes worthwhile. */ diff --git a/include/doSpgemm.hh b/include/doSpgemm.hh new file mode 100644 index 0000000..2131a7d --- /dev/null +++ b/include/doSpgemm.hh @@ -0,0 +1,8 @@ +// +// Created by Alexander Cockrean on 07/01/2025. +// + +#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMM_HH +#define GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMM_HH + +#endif //GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMM_HH diff --git a/include/doSpgemv.hh b/include/doSpgemv.hh new file mode 100644 index 0000000..cf315e0 --- /dev/null +++ b/include/doSpgemv.hh @@ -0,0 +1,8 @@ +// +// Created by Alexander Cockrean on 07/01/2025. +// + +#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMV_HH +#define GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMV_HH + +#endif //GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMV_HH diff --git a/include/doSpmm.hh b/include/doSpmm.hh new file mode 100644 index 0000000..2321636 --- /dev/null +++ b/include/doSpmm.hh @@ -0,0 +1,445 @@ +#pragma once +#include <sstream> +#include <type_traits> +#include <cstdint> + +#include "helpers.hh" +#include "tablePrinter.hh" +#include "utilities.hh" + +#if defined CPU_ARMPL +#include "../ArmPL/spmm.hh" +#elif defined CPU_ONEMKL +// Todo #include "../oneMKL/CPU/spmm.hh" +#elif defined CPU_AOCL +// Todo #include "../AOCL/gemm.hh" +#elif defined CPU_NVPL + // Todo #include "../NVPL/gemm.hh" +#elif defined CPU_OPENBLAS +// Todo #include "../OpenBLAS/gemm.hh" +#endif + +#if defined GPU_CUBLAS +#include "../cuBLAS/spmm.hh" +#elif defined GPU_ONEMKL +// Todo #include "../oneMKL/GPU/gemm.hh" +#elif defined GPU_ROCBLAS +// Todo #include "../rocBLAS/gemm.hh" +#endif + +/** `T` represents the type of kernel that will be run - i.e. T=float is for + * SGEMM. 
*/
+template <typename T>
+class doSpmm {
+public:
+  doSpmm(const std::string csvDir, const int iters, const int startDim,
+         const int upperLimit, const bool cpuEnabled = true,
+         const bool gpuEnabled = true)
+      : CSV_DIR(csvDir),
+        iterations_(iters),
+        startDimention_(startDim),
+        upperLimit_(upperLimit),
+        doCPU_(cpuEnabled),
+        doGPU_(gpuEnabled)
+#if CPU_ENABLED
+        ,
+        cpu_(iterations_)
+#endif
+#if GPU_ENABLED
+        ,
+        gpu_(iterations_)
+#endif
+  {
+    static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) &&
+                  "ERROR - doSpmm can only be constructed using one of the "
+                  "following types: [float, double].");
+  }
+
+  /** Run all problem types and write data to CSV files. */
+  void collectData() {
+    // ToDo -- kernel selection was not working and has been hard-coded here;
+    // needs to be fixed
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
+                                        getKernelName() + "_sparse_square_99.csv");
+    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+      callKernels(csvFile, dim, 0.99);
+    }
+    // Close file
+    csvFile.close();
+#if CPU_ENABLED && GPU_ENABLED
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Sparse Square 0.99");
+    }
+#endif
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
+                          getKernelName() + "_sparse_square_999.csv");
+    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+      callKernels(csvFile, dim, 0.999);
+    }
+#if CPU_ENABLED && GPU_ENABLED
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Sparse Square 0.999");
+    }
+#endif
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
+                          getKernelName() + "_sparse_square_9999.csv");
+    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+      callKernels(csvFile, dim, 0.9999);
+    }
+#if CPU_ENABLED && GPU_ENABLED
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Sparse Square 0.9999");
+    }
+#endif
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
+                          getKernelName() +
+                          "_sparse_square_99999.csv");
+    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+      callKernels(csvFile, dim, 0.99999);
+    }
+#if CPU_ENABLED && GPU_ENABLED
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Sparse Square 0.99999");
+    }
+#endif
+  }
+
+private:
+  /** Ensure all CPU and GPU checksums are within the permitted limit of
+   * each other. */
+  void checkChecksums(time_checksum_gflop cpuResult,
+                      time_checksum_gflop gpuResult_once,
+                      time_checksum_gflop gpuResult_always,
+                      time_checksum_gflop gpuResult_unified, const int M,
+                      const int N, const int K) {
+    // Raise an error if all three GPU checksums deviate from the CPU
+    // checksum by more than 0.1%
+    double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum);
+    if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) *
+          hundredOverChecksum)) > 0.1 &&
+        ((std::fabs(cpuResult.checksum - gpuResult_always.checksum) *
+          hundredOverChecksum)) > 0.1 &&
+        ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) *
+          hundredOverChecksum)) > 0.1) {
+      std::cerr << "ERROR - " << getKernelName()
+                << " kernel checksums do not match:\n\tInput "
+                   "dimensions: M="
+                << M << ", N=" << N << ", K=" << K << std::endl;
+      std::cerr << std::setprecision(10)
+                << "\tCPU Checksum = " << cpuResult.checksum << std::endl;
+      std::cerr << std::setprecision(10)
+                << "\tGPU (Once) Checksum = " << gpuResult_once.checksum
+                << std::endl;
+      std::cerr << std::setprecision(10)
+                << "\tGPU (Always) Checksum = " << gpuResult_always.checksum
+                << std::endl;
+      std::cerr << std::setprecision(10)
+                << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum
+                << std::endl;
+      exit(1);
+    }
+  }
+
+  /** Check whether the offload structures need to be reset, doing so if
+   * required.
+   * - If CPU.gflops >= GPU.gflops for last two problem sizes, then reset
+   *   offload structures as GPU may not necessarily have reached the offload
+   *   threshold. */
+  void checkOffloadStructReset(time_checksum_gflop cpuResult,
+                               time_checksum_gflop gpuResult_once,
+                               time_checksum_gflop gpuResult_always,
+                               time_checksum_gflop gpuResult_unified) {
+    if ((cpuGpu_once_.M != 0) && (cpuResult.gflops >= gpuResult_once.gflops) &&
+        (cpuResult.gflops >= prev_gpuResult_once.gflops)) {
+      cpuGpu_once_.cpuGflops = 0.0;
+      cpuGpu_once_.gpuGflops = 0.0;
+      cpuGpu_once_.probSize_kib = 0.0;
+      cpuGpu_once_.M = 0;
+      cpuGpu_once_.N = 0;
+      cpuGpu_once_.K = 0;
+    }
+    if ((cpuGpu_always_.M != 0) &&
+        (cpuResult.gflops >= gpuResult_always.gflops) &&
+        (cpuResult.gflops >= prev_gpuResult_always.gflops)) {
+      cpuGpu_always_.cpuGflops = 0.0;
+      cpuGpu_always_.gpuGflops = 0.0;
+      cpuGpu_always_.probSize_kib = 0.0;
+      cpuGpu_always_.M = 0;
+      cpuGpu_always_.N = 0;
+      cpuGpu_always_.K = 0;
+    }
+    if ((cpuGpu_unified_.M != 0) &&
+        (cpuResult.gflops >= gpuResult_unified.gflops) &&
+        (cpuResult.gflops >= prev_gpuResult_unified.gflops)) {
+      cpuGpu_unified_.cpuGflops = 0.0;
+      cpuGpu_unified_.gpuGflops = 0.0;
+      cpuGpu_unified_.probSize_kib = 0.0;
+      cpuGpu_unified_.M = 0;
+      cpuGpu_unified_.N = 0;
+      cpuGpu_unified_.K = 0;
+    }
+  }
+
+  /** Update the offload threshold structs if GPU.gflops > CPU.gflops.
*/ + void updateOffloadStructs(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N, const int K, const double probSize) { + if ((cpuGpu_once_.M == 0) && cpuResult.gflops < gpuResult_once.gflops) { + cpuGpu_once_.cpuGflops = cpuResult.gflops; + cpuGpu_once_.gpuGflops = gpuResult_once.gflops; + cpuGpu_once_.probSize_kib = probSize; + cpuGpu_once_.M = M; + cpuGpu_once_.N = N; + cpuGpu_once_.K = K; + } + if ((cpuGpu_always_.M == 0) && cpuResult.gflops < gpuResult_always.gflops) { + cpuGpu_always_.cpuGflops = cpuResult.gflops; + cpuGpu_always_.gpuGflops = gpuResult_always.gflops; + cpuGpu_always_.probSize_kib = probSize; + cpuGpu_always_.M = M; + cpuGpu_always_.N = N; + cpuGpu_always_.K = K; + } + if ((cpuGpu_unified_.M == 0) && + cpuResult.gflops < gpuResult_unified.gflops) { + cpuGpu_unified_.cpuGflops = cpuResult.gflops; + cpuGpu_unified_.gpuGflops = gpuResult_unified.gflops; + cpuGpu_unified_.probSize_kib = probSize; + cpuGpu_unified_.M = M; + cpuGpu_unified_.N = N; + cpuGpu_unified_.K = K; + } + } + + void callKernels(std::ofstream& csvFile, const int N, const float + sparsity) { + const double probSize = calcKib(N, N, N); + const uint64_t flops = calcFlops(N, N, N); + std::string kernelName = getKernelName(); + +#if CPU_ENABLED + if (doCPU_) { + cpu_.initialise(N, sparsity); + time_checksum_gflop cpuResult = cpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, + sparsity, iterations_, cpuResult.runtime, + cpuResult.gflops); + } +#endif +#if GPU_ENABLED + // Perform the GPU kernels + // - UNIFIED : data passed from host to device (and device to host) as + // needed + if (doGPU_) { + gpu_.initialise(gpuOffloadType::unified, N, sparsity); + time_checksum_gflop gpuResult_unified = gpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + + // - ALWAYS: Offload to/from GPU every iteration + gpu_.initialise(gpuOffloadType::always, N, sparsity); + time_checksum_gflop gpuResult_always = gpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); + // - ONCE : Offload to/from GPU once before all iterations and once + // after + gpu_.initialise(gpuOffloadType::once, N, sparsity); + time_checksum_gflop gpuResult_once = gpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + // ToDo -- non-default GPU operations + + // Write lines to CSV file + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize, + sparsity, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize, + sparsity, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); + writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize, + sparsity, iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); + + } +#endif + + } + + /** A function for calculating FLOPs performed by a GEMM. 
+ * C = alpha*AB + beta*C */ + constexpr uint64_t calcFlops(const int M, const int N, const int K) const { + // A * B = 2*M*N*K (FMA) + // alpha * AB = M*N (multiplication) + // beta * C = M*N (multiplication) + // AB + C = M*N (addition) + // = 2MNK + MN + MN + MN + + // If beta==0; = 2MNK + MN ------- alpha*AB Always done + // Else; = 2MNK + 3MN + uint64_t scalar = (BETA != 0) ? 3 : 1; + return (2 * (uint64_t)M * (uint64_t)N * (uint64_t)K) + + (scalar * (uint64_t)M * (uint64_t)N); + } + + /** A function for calculating the total GEMM problem size in KiB. */ + constexpr double calcKib(const int M, const int N, const int K) const { + uint64_t M_ = (uint64_t)M, N_ = (uint64_t)N, K_ = (uint64_t)K; + uint64_t probSize = (M_ * K_) + (K_ * N_) + (M_ * N_); + return ((double)(probSize * (sizeof(T))) / 1024); + } + + /** Get the name of the kernel being run. */ + std::string getKernelName() const { + switch (sizeof(T)) { + case 4: + return "sgemm"; + case 8: + return "dgemm"; + default: + return "unknown"; + } + } + + /** Print to stdout the offload thresholds. */ + void printOffloadThreshold(const std::string& problemName) const { + std::vector<std::string> header = { + "Device", "M", "N", "K", "Total Prob. Size (KiB)", + "GFLOP/s", "CPU GFLOP/s"}; + + std::vector<std::vector<std::string>> rows; + // Initialise GPU_Once row + std::stringstream probSize_o; + std::stringstream gpuGflops_o; + std::stringstream cpuGflops_o; + probSize_o << std::fixed << std::setprecision(2) + << cpuGpu_once_.probSize_kib; + gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops; + cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops; + if (cpuGpu_once_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Once)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_o.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Once)", std::to_string(cpuGpu_once_.M), + std::to_string(cpuGpu_once_.N), + std::to_string(cpuGpu_once_.K), probSize_o.str(), + gpuGflops_o.str(), cpuGflops_o.str()}); + } + + // Initialise GPU_always row + std::stringstream probSize_a; + std::stringstream gpuGflops_a; + std::stringstream cpuGflops_a; + probSize_a << std::fixed << std::setprecision(2) + << cpuGpu_always_.probSize_kib; + gpuGflops_a << std::fixed << std::setprecision(2) + << cpuGpu_always_.gpuGflops; + cpuGflops_a << std::fixed << std::setprecision(2) + << cpuGpu_always_.cpuGflops; + if (cpuGpu_always_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Always)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_a.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Always)", std::to_string(cpuGpu_always_.M), + std::to_string(cpuGpu_always_.N), + std::to_string(cpuGpu_always_.K), probSize_a.str(), + gpuGflops_a.str(), cpuGflops_a.str()}); + } + + // Initialise GPU_unified row + std::stringstream probSize_u; + std::stringstream gpuGflops_u; + std::stringstream cpuGflops_u; + probSize_u << std::fixed << std::setprecision(2) + << cpuGpu_unified_.probSize_kib; + gpuGflops_u << std::fixed << std::setprecision(2) + << cpuGpu_unified_.gpuGflops; + cpuGflops_u << std::fixed << std::setprecision(2) + << cpuGpu_unified_.cpuGflops; + if (cpuGpu_unified_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Unified Memory)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_u.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Unified Memory)", 
std::to_string(cpuGpu_unified_.M),
+                      std::to_string(cpuGpu_unified_.N),
+                      std::to_string(cpuGpu_unified_.K), probSize_u.str(),
+                      gpuGflops_u.str(), cpuGflops_u.str()});
+    }
+
+    // Print table
+    tablePrinter tPrinter(
+        problemName + " Problem Domain GPU Offload Thresholds:", header, rows);
+    tPrinter.print(1);
+  }
+
+  /** The output directory where CSV files should be saved to. */
+  const std::string CSV_DIR;
+
+  /** The number of iterations to perform per problem size. */
+  const int iterations_;
+
+  /** The value of the first problem size dimension run. */
+  const int startDimention_;
+
+  /** The maximum value of the largest problem size dimension. */
+  const int upperLimit_;
+
+  /** Whether the CPU kernels should be run. */
+  const bool doCPU_ = true;
+
+  /** Whether the GPU kernels should be run. */
+  const bool doGPU_ = true;
+
+#if CPU_ENABLED
+  /** The CPU kernel. */
+  cpu::spmm_cpu<T> cpu_;
+#endif
+
+#if GPU_ENABLED
+  /** The GPU kernel. */
+  gpu::spmm_gpu<T> gpu_;
+#endif
+
+  /** The point at which offloading to GPU (offload once) becomes worthwhile. */
+  cpuGpu_offloadThreshold cpuGpu_once_;
+
+  /** The point at which offloading to GPU (offload always) becomes worthwhile.
+   */
+  cpuGpu_offloadThreshold cpuGpu_always_;
+
+  /** The point at which offloading to GPU (unified memory) becomes worthwhile.
+   */
+  cpuGpu_offloadThreshold cpuGpu_unified_;
+
+  /** The previous problem size's GPU (offload once) performance results. */
+  time_checksum_gflop prev_gpuResult_once;
+
+  /** The previous problem size's GPU (offload always) performance results. */
+  time_checksum_gflop prev_gpuResult_always;
+
+  /** The previous problem size's GPU (unified memory) performance results. */
+  time_checksum_gflop prev_gpuResult_unified;
+};
\ No newline at end of file
diff --git a/include/kernels/CPU/sp_gemm.hh b/include/kernels/CPU/sp_gemm.hh
deleted file mode 100644
index c431d4d..0000000
--- a/include/kernels/CPU/sp_gemm.hh
+++ /dev/null
@@ -1,108 +0,0 @@
-#pragma once
-
-#include "../gemm.hh"
-
-#include <random>
-#include <memory>
-#include <iostream>
-
-namespace cpu {
-
-/** An abstract class for GEMM BLAS kernels. */
- template <typename T>
- class sp_gemm : public ::gemm<T> {
- public:
-  using ::gemm<T>::gemm;
-  using ::gemm<T>::initInputMatricesSparse;
-  using ::gemm<T>::toCSR_int;
-  using ::gemm<T>::iterations_;
-  using ::gemm<T>::m_;
-  using ::gemm<T>::n_;
-  using ::gemm<T>::k_;
-  using ::gemm<T>::A_;
-  using ::gemm<T>::B_;
-  using ::gemm<T>::C_;
-
- public:
-  /** Initialise the required data structures. */
-  virtual void initialise(int n, double sparsity, bool binary = false) {
-    n_ = n;
-    sparsity_ = sparsity;
-
-    // Note that the below should be the same as the edges calculation
-    // used in the initInputMatricesSparse function. If changed here,
-    // change there
-    nnz_ = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity_));
-
-//    std::cout << "\t____About to malloc()____" << std::endl;
-    A_ = (T*)malloc(sizeof(T) * n_ * n_);
-    B_ = (T*)malloc(sizeof(T) * n_ * n_);
-    C_ = (T*)malloc(sizeof(T) * n_ * n_);
-
-    initInputMatricesSparse(sparsity);
-
-    toCSR_int();
-  }
-
-  uint64_t nnz_;
-
- protected:
-
-  T* A_vals_;
-  T* B_vals_;
-  T* C_vals_;
-
- private:
-  /** Do any necessary cleanup (free pointers, close library handles, etc.)
-   * after Kernel has been called. */
-  void postCallKernelCleanup() {
-    free(A_);
-    free(B_);
-    free(C_);
-  }
-
-  void toCSR_int() {
-    // Move A to CSR
-    A_row_ptr_ = new int[n_ + 1];
-    A_col_index_ = new int[nnz_];
-    A_vals_ = new T[nnz_];
-    int nnz_encountered = 0;
-    for (int row = 0; row < n_; row++) {
-      A_row_ptr_[row] = nnz_encountered;
-      for (int col = 0; col < n_; col++) {
-        if (A_[(row * n_) + col] != 0.0) {
-          A_col_index_[nnz_encountered] = col;
-          A_vals_[nnz_encountered] = A_[(row * n_) + col];
-          nnz_encountered++;
-        }
-      }
-    }
-
-    // Move B to CSR
-    B_row_ptr_ = new int[n_ + 1];
-    B_col_index_ = new int[nnz_];
-    B_vals_ = new T[nnz_];
-    nnz_encountered = 0;
-    for (int row = 0; row < n_; row++) {
-      B_row_ptr_[row] = nnz_encountered;
-      for (int col = 0; col < n_; col++) {
-        if (B_[(row * n_) + col] != 0.0) {
-          B_col_index_[nnz_encountered] = col;
-          B_vals_[nnz_encountered] = B_[(row * n_) + col];
-          nnz_encountered++;
-        }
-      }
-    }
-  }
-
-  double sparsity_;
-
-  int* A_row_ptr_;
-  int* A_col_index_;
-  int* B_row_ptr_;
-  int* B_col_index_;
-  int* C_row_ptr_;
-  int* C_col_index_;
-
- };
-} // namespace cpu
diff --git a/include/kernels/CPU/sp_gemv.hh b/include/kernels/CPU/spgemv.hh
similarity index 100%
rename from include/kernels/CPU/sp_gemv.hh
rename to include/kernels/CPU/spgemv.hh
diff --git a/include/kernels/CPU/spgmm.hh b/include/kernels/CPU/spgmm.hh
new file mode 100644
index 0000000..59856ed
--- /dev/null
+++ b/include/kernels/CPU/spgmm.hh
@@ -0,0 +1,8 @@
+//
+// Created by Alexander Cockrean on 07/01/2025.
+//
+
+#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_SPGMM_HH
+#define GPU_BLAS_OFFLOAD_BENCHMARK_SPGMM_HH
+
+#endif //GPU_BLAS_OFFLOAD_BENCHMARK_SPGMM_HH
diff --git a/include/kernels/CPU/spmm.hh b/include/kernels/CPU/spmm.hh
new file mode 100644
index 0000000..7d19f5d
--- /dev/null
+++ b/include/kernels/CPU/spmm.hh
@@ -0,0 +1,60 @@
+#pragma once
+
+#include "../spmm.hh"
+
+#include <random>
+#include <memory>
+#include <iostream>
+
+namespace cpu {
+
+/** An abstract class for sparse matrix-sparse matrix BLAS kernels. */
+template <typename T>
+class spmm : public ::spmm<T> {
+public:
+  using ::spmm<T>::spmm;
+  using ::spmm<T>::initInputMatrices;
+  using ::spmm<T>::toCSR_int;
+  using ::spmm<T>::iterations_;
+  using ::spmm<T>::nnzA_;
+  using ::spmm<T>::nnzB_;
+  using ::spmm<T>::m_;
+  using ::spmm<T>::n_;
+  using ::spmm<T>::k_;
+  using ::spmm<T>::A_;
+  using ::spmm<T>::B_;
+  using ::spmm<T>::C_;
+
+public:
+  /** Initialise the required data structures. */
+  void initialise(int n, int m, int k, double sparsity,
+                  bool binary = false) {
+    n_ = n;
+    m_ = m;
+    k_ = k;
+
+    sparsity_ = sparsity;
+
+    /** Determine the number of nnz elements in A and B */
+    nnzA_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
+    nnzB_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_));
+
+    A_ = (T*)malloc(sizeof(T) * m_ * k_);
+    B_ = (T*)malloc(sizeof(T) * k_ * n_);
+    // calloc takes (count, element size) and zero-initialises C
+    C_ = (T*)calloc(m_ * n_, sizeof(T));
+
+    initInputMatrices(sparsity_);
+  }
+
+private:
+  /** Do any necessary cleanup (free pointers, close library handles, etc.)
+   * after Kernel has been called. */
+  void postCallKernelCleanup() {
+    free(A_);
+    free(B_);
+    free(C_);
+  }
+
+  double sparsity_;
+};
+} // namespace cpu
diff --git a/include/kernels/GPU/sp_gemm.hh b/include/kernels/GPU/sp_gemm.hh
deleted file mode 100644
index 52a5494..0000000
--- a/include/kernels/GPU/sp_gemm.hh
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-
-#include "../gemm.hh"
-
-namespace gpu {
-
-/** An abstract class for GEMM BLAS kernels.
*/ - template <typename T> - class sp_gemm : public ::gemm<T> { - public: - using ::gemm<T>::gemm; - - /** Initialise the required data structures. - * `offload` refers to the data offload type: - * - Once: Move data from host to device before all iterations & move from - * device to host after all iterations - * - Always: Move data from host to device and device to host each iteration - * - Unified: Initialise data as unified memory; no data movement semantics - * required */ - virtual void initialise(gpuOffloadType offload, int n, float sparsity) - = 0; - - protected: - /** Whether data should be offloaded to/from the GPU each iteration, or just - * before & after. */ - gpuOffloadType offload_ = gpuOffloadType::always; - }; -} // namespace gpu \ No newline at end of file diff --git a/include/kernels/GPU/spgemm.hh b/include/kernels/GPU/spgemm.hh new file mode 100644 index 0000000..917469b --- /dev/null +++ b/include/kernels/GPU/spgemm.hh @@ -0,0 +1,8 @@ +// +// Created by Alexander Cockrean on 07/01/2025. +// + +#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH +#define GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH + +#endif //GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH diff --git a/include/kernels/GPU/sp_gemv.hh b/include/kernels/GPU/spgemv.hh similarity index 100% rename from include/kernels/GPU/sp_gemv.hh rename to include/kernels/GPU/spgemv.hh diff --git a/include/kernels/GPU/spmm.hh b/include/kernels/GPU/spmm.hh new file mode 100644 index 0000000..3f5002e --- /dev/null +++ b/include/kernels/GPU/spmm.hh @@ -0,0 +1,28 @@ +#pragma once + +#include "../spmm.hh" + +namespace gpu { + +/** An abstract class for sparse matrix-sparse matrix BLAS kernels. */ +template <typename T> +class spmm : public ::spmm<T> { +public: + using ::spmm<T>::spmm; + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, bool binary = false) = 0; + +protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. */ + gpuOffloadType offload_ = gpuOffloadType::always; +}; +} // namespace gpu \ No newline at end of file diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 6e1328e..3f0aece 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -92,137 +92,9 @@ class gemm { } } - // Note that the below should be the same as the nnz calculation - // used in the cpu initialise functions. If changed here, - // change there - void initInputMatricesSparse(float sparsity) { - for (int i = 0; i < (n_ * n_); i++) { - A_[i] = 0.0; - B_[i] = 0.0; - } - - // Random number generator objects for use in descent - std::default_random_engine gen; - gen.seed(std::chrono::system_clock::now() - .time_since_epoch().count()); - std::uniform_real_distribution<double> dist(0.0, 1.0); - - int edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity)); - - // Using a=0.45 and b=c=0.22 as default probabilities - for (int i = 0; i < edges; i++) { - while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, - false)) {} - while (!rMat(B_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, - false)) {} - } - } - /** Call the extern consume() function. 
*/ void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); } - /** Recursive function to populate sparse matrices */ - bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, - float c, std::default_random_engine* gen, - std::uniform_real_distribution<double> dist, bool bin) { - // If a 1x1 submatrix, then add an edge and return out - if (x1 >= x2 && y1 >= y2) { - // Needed to avoid overfloe segfaults with large problem sizes - uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1); - if (abs(M[index]) > 0.1) { - return false; - } else { - // Add 1.0 if this is a binary graph, and a random real number otherwise - M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); - return true; - } - } else { - // Divide up the matrix - int xMidPoint = x1 + floor((x2 - x1) / 2); - int yMidPoint = y1 + floor((y2 - y1) / 2); - - // ToDo -- add some noise to these values between iterations - float newA = a; - float newB = b; - float newC = c; - - // Work out which quarter to recurse into - // There are some ugly ternary operators here to avoid going out of bounds in the edge case - // that we are already at 1 width or 1 height - float randomNum = dist(*gen); - if (randomNum < a) { - return rMat(M, n, x1, xMidPoint, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b)) { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - newA, newB, newC, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - newA, newB, newC, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, - gen, dist, bin); - } - } - } - - void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index, - int* row_ptr) { - int nnz_encountered = 0; - for (int row = 0; row < n_row; row++) { - row_ptr[row] = nnz_encountered; - int nnz_row = 0; - for (int col = 0; col < n_col; col++) { - if (dense[(row * n_col) + col] != 0.0) { - nnz_row++; - col_index[nnz_encountered] = col; - vals[nnz_encountered] = dense[(row * n_col) + col]; - nnz_encountered++; - } - } - } - row_ptr[n_row] = nnz_encountered; - } - -#ifdef CPU_ONEMKL - void toCSR_mkl(T* dense, int n_col, int n_row, T* vals, MKL_INT* col_index, - MKL_INT* row_ptr) { - int nnz_encountered = 0; - for (int row = 0; row < n_row; row++) { - row_ptr[row] = (MKL_INT)nnz_encountered; - int nnz_row = 0; - for (int col = 0; col < n_col; col++) { - if (dense[(row * n_col) + col] != 0.0) { - nnz_row++; - col_index[nnz_encountered] = (MKL_INT)col; - vals[nnz_encountered] = dense[(row * n_col) + col]; - nnz_encountered++; - } - } - } - row_ptr[n_row] = (MKL_INT)nnz_encountered; - } -#endif -#ifdef CPU_AOCL - void toCSR_aocl(T* dense, int n_col, int n_row, T* vals, aoclsparse_int* - col_index, aoclsparse_int* row_ptr) { - int nnz_encountered = 0; - for (int row = 0; row < n_row; row++) { - row_ptr[row] = (aoclsparse_int)nnz_encountered; - int nnz_row = 0; - for (int col = 0; col < n_col; col++) { - if (dense[(row * n_col) + col] != 0.0) { - nnz_row++; - col_index[nnz_encountered] = (aoclsparse_int)col; - vals[nnz_encountered] = dense[(row * n_col) + col]; - nnz_encountered++; - } - } - } - row_ptr[n_row] = (MKL_INT)nnz_encountered; - } -#endif /** The number of iterations to perform per problem size. 
*/
   const int iterations_;
 
diff --git a/include/kernels/spgemm.hh b/include/kernels/spgemm.hh
new file mode 100644
index 0000000..917469b
--- /dev/null
+++ b/include/kernels/spgemm.hh
@@ -0,0 +1,8 @@
+//
+// Created by Alexander Cockrean on 07/01/2025.
+//
+
+#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH
+#define GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH
+
+#endif //GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH
diff --git a/include/kernels/spgemv.hh b/include/kernels/spgemv.hh
new file mode 100644
index 0000000..9e7d953
--- /dev/null
+++ b/include/kernels/spgemv.hh
@@ -0,0 +1,8 @@
+//
+// Created by Alexander Cockrean on 07/01/2025.
+//
+
+#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMV_HH
+#define GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMV_HH
+
+#endif //GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMV_HH
diff --git a/include/kernels/spmm.hh b/include/kernels/spmm.hh
new file mode 100644
index 0000000..37de9cf
--- /dev/null
+++ b/include/kernels/spmm.hh
@@ -0,0 +1,168 @@
+#pragma once
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <limits>
+#include <random>
+#include <iostream>
+
+#include "../utilities.hh"
+
+/** A generic abstract class defining the operation of timing an SPMM BLAS
+ * kernel for n iterations */
+template <typename T>
+class spmm {
+public:
+  spmm(const int iters) : iterations_(iters) {}
+
+  /** Call the kernel n times. Returns the time elapsed for all n calls
+   * in seconds */
+  time_checksum_gflop compute() {
+    // Start the timer
+    std::chrono::time_point<std::chrono::high_resolution_clock> startTime =
+        std::chrono::high_resolution_clock::now();
+
+    // Perform the SPMM calls
+    preLoopRequirements();
+    for (int i = 0; i < iterations_; i++) {
+      callSpmm();
+    }
+    postLoopRequirements();
+
+    // Stop the timer
+    std::chrono::time_point<std::chrono::high_resolution_clock> endTime =
+        std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> time_s = endTime - startTime;
+
+    double checksum = calcChecksum();
+
+    postCallKernelCleanup();
+
+    return {time_s.count(), checksum, 0.0};
+  }
+
+  int64_t nnzA_ = 0;
+  int64_t nnzB_ = 0;
+  int64_t nnzC_ = 0;
+
+private:
+  /** Performs the steps required before calling the SPMM kernel that
+   * should be timed */
+  virtual void preLoopRequirements() = 0;
+
+  /** Perform the SPMM kernel. */
+  virtual void callSpmm() = 0;
+
+  /** Perform any steps required after calling the SPMM kernel that should
+   * be timed */
+  virtual void postLoopRequirements() = 0;
+
+  /** Do the necessary cleanup after the kernel has been finished that
+   * should not be timed */
+  virtual void postCallKernelCleanup() = 0;
+
+  /** Calculate a checksum from the result matrix C.
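+   * A simple scheme (an editorial sketch only -- nothing in this patch
+   * settles the Todo below) would be to accumulate every element of the
+   * densified output buffer, which stays meaningful for any sparsity
+   * pattern:
+   *
+   *   double sum = 0.0;
+   *   for (size_t i = 0; i < (size_t)m_ * n_; i++) sum += (double)C_[i];
+   *   return sum;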
*/
+  constexpr double calcChecksum() {
+    // Todo -- think about how this can sensibly be done for SPMM
+    return 0.0;
+  }
+
+protected:
+  /** Set up the starting matrices */
+  void initInputMatrices() {
+    for (size_t i = 0; i < (m_ * k_); i++) {
+      A_[i] = 0.0;
+    }
+    for (size_t i = 0; i < (k_ * n_); i++) {
+      B_[i] = 0.0;
+    }
+
+    // Random number generator objects for use in descent
+    std::default_random_engine gen;
+    gen.seed(std::chrono::system_clock::now()
+                     .time_since_epoch().count());
+    std::uniform_real_distribution<double> dist(0.0, 1.0);
+
+    // Using a=0.45 and b=c=0.22 as default probabilities
+    for (size_t i = 0; i < nnzA_; i++) {
+      while (!rMat(A_, k_, 0, k_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
+                   false)) {}
+    }
+    for (size_t i = 0; i < nnzB_; i++) {
+      while (!rMat(B_, n_, 0, n_ - 1, 0, k_ - 1, 0.45, 0.22, 0.22, &gen, dist,
+                   false)) {}
+    }
+
+    toSparseFormat()
+  }
+
+  /** Move matrices into the sparse representation for the given library */
+  virtual void toSparseFormat() = 0;
+
+  /** Call the external consume() function on the matrices */
+  void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }
+
+  /** Recursive function to populate sparse matrices */
+  // On first iteration, n should be x2 + 1
+  bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b,
+            float c, std::default_random_engine* gen,
+            std::uniform_real_distribution<double> dist, bool bin) {
+    // If a 1x1 submatrix, then add an edge and return out
+    if (x1 >= x2 && y1 >= y2) {
+      // Needed to avoid overflow segfaults with large problem sizes
+      uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1);
+      if (abs(M[index]) > 0.1) {
+        return false;
+      } else {
+        // Add 1.0 if this is a binary graph, and a random real number otherwise
+        M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0);
+        return true;
+      }
+    } else {
+      // Divide up the matrix
+      int xMidPoint = x1 + floor((x2 - x1) / 2);
+      int yMidPoint = y1 + floor((y2 - y1) / 2);
+
+      // Work out which quarter to recurse into
+      // There are some ugly ternary operators here to avoid going out of bounds in the edge case
+      // that we are already at 1 width or 1 height
+      float randomNum = dist(*gen);
+      if (randomNum < a) {
+        return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
+                    a, b, c, gen, dist, bin);
+      } else if (randomNum < (a + b)) {
+        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint,
+                    a, b, c, gen, dist, bin);
+      } else if (randomNum < (a + b + c)) {
+        return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2,
+                    a, b, c, gen, dist, bin);
+      } else {
+        return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2,
+                    ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, a,
+                    b, c, gen, dist, bin);
+      }
+    }
+  }
+
+  /** The number of iterations to perform per problem size. */
+  const int iterations_;
+
+  /** Matrix dimension M. */
+  int m_ = 0;
+
+  /** Matrix dimension N. */
+  int n_ = 0;
+
+  /** Matrix dimension K. */
+  int k_ = 0;
+
+  /** Dense representation of input matrix A. */
+  T* A_;
+
+  /** Dense representation of input matrix B. */
+  T* B_;
+
+  /** Dense representation of output matrix C.
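+   * (The dense buffers are what rMat populates; each backend derives its
+   * own sparse structures from them in its toSparseFormat() override, so
+   * C_ too is kept dense at this level.)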
*/
+  T* C_;
+
+};
\ No newline at end of file
diff --git a/include/main.hh b/include/main.hh
index f12ebcb..f639407 100644
--- a/include/main.hh
+++ b/include/main.hh
@@ -5,7 +5,10 @@
 #include <string>
 
 #include "doGemm.hh"
+#include "doSpgemm.hh"
+#include "doSpmm.hh"
 #include "doGemv.hh"
+#include "doSpgemv.hh"
 #include "utilities.hh"
 
 /** A function which prints standard configuration information to stdout. */
diff --git a/src/main.cc b/src/main.cc
index bdc1db2..8bb7412 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -3,14 +3,21 @@
 int iters = 10;
 int startDim = 1;
 int upperLimit = 128;
+// GEMM kernels
 bool doSgemm = true;
 bool doDgemm = true;
-bool doSp_sgemm = true;
-bool doSp_dgemm = true;
+// Sparse GEMM kernels
+bool doSspgemm = true;
+bool doDspgemm = true;
+// GEMV kernels
 bool doSgemv = true;
 bool doDgemv = true;
-bool doSp_sgemv = true;
-bool doSp_dgemv = true;
+// Sparse GEMV kernels
+bool doSspgemv = true;
+bool doDspgemv = true;
+// Sparse-sparse matrix multiplication kernels
+bool doSspmm = true;
+bool doDspmm = true;
 
 bool doCpu = CPU_ENABLED;
 bool doGpu = GPU_ENABLED;
@@ -39,33 +46,101 @@ int main(int argc, char** argv) {
 
   // -------- GEMM --------
   // SGEMM Comparison
-  std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl;
-  doGemm<float> sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                      doGpu, doSgemm, doSp_sgemm);
-  sgemm.collectData();
-  std::cout << "Finished!" << std::endl;
+  if (doSgemm) {
+    std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl;
+    doGemm<float> sgemm(std::string(absPath), iters, startDim, upperLimit,
+                        doCpu,
+                        doGpu);
+    sgemm.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+
+  // DGEMM Comparison
+  if (doDgemm) {
+    std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl;
+    doGemm<double> dgemm(std::string(absPath), iters, startDim, upperLimit,
+                         doCpu,
+                         doGpu);
+    dgemm.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+
+  // -------- SPGEMM --------
+  // SSPGEMM Comparison
+  if (doSspgemm) {
+    std::cout << std::endl << "Comparing SSpGEMM Kernels:" << std::endl;
+    doSpgemm<float> sspgemm(std::string(absPath), iters, startDim, upperLimit,
+                            doCpu, doGpu);
+    sspgemm.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
 
-  // DGEMM Comparison
-  std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl;
-  doGemm<double> dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                       doGpu, doDgemm, doSp_dgemm);
-  dgemm.collectData();
-  std::cout << "Finished!" << std::endl;
+  // DSPGEMM Comparison
+  if (doDspgemm) {
+    std::cout << std::endl << "Comparing DSpGEMM Kernels:" << std::endl;
+    doSpgemm<double> dspgemm(std::string(absPath), iters, startDim, upperLimit,
+                             doCpu, doGpu);
+    dspgemm.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+
+  // -------- SPMM --------
+  // SSPMM Comparison
+  if (doSspmm) {
+    std::cout << std::endl << "Comparing SSpMM Kernels:" << std::endl;
+    doSpmm<float> sspmm(std::string(absPath), iters, startDim, upperLimit,
+                        doCpu, doGpu);
+    sspmm.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+
+  // DSPMM Comparison
+  if (doDspmm) {
+    std::cout << std::endl << "Comparing DSpMM Kernels:" << std::endl;
+    doSpmm<double> dspmm(std::string(absPath), iters, startDim, upperLimit,
+                         doCpu, doGpu);
+    dspmm.collectData();
+    std::cout << "Finished!"
<< std::endl;
+  }
 
   // -------- GEMV --------
   // SGEMV Comparison
-  std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl;
-  doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                      doGpu, doSgemv, doSp_sgemv);
-  sgemv.collectData();
-  std::cout << "Finished!" << std::endl;
+  if (doSgemv) {
+    std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl;
+    doGemv<float> sgemv(std::string(absPath), iters, startDim, upperLimit,
+                        doCpu, doGpu);
+    sgemv.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
 
   // DGEMV Comparison
-  std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl;
-  doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu,
-                       doGpu, doDgemv, doSp_dgemv);
-  dgemv.collectData();
-  std::cout << "Finished!" << std::endl;
+  if (doDgemv) {
+    std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl;
+    doGemv<double> dgemv(std::string(absPath), iters, startDim, upperLimit,
+                         doCpu, doGpu);
+    dgemv.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+
+  // -------- SPGEMV --------
+  // SSPGEMV Comparison
+  if (doSspgemv) {
+    std::cout << std::endl << "Comparing SSpGEMV Kernels:" << std::endl;
+    doSpgemv<float> sspgemv(std::string(absPath), iters, startDim, upperLimit,
+                            doCpu, doGpu);
+    sspgemv.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+
+  // DSPGEMV Comparison
+  if (doDspgemv) {
+    std::cout << std::endl << "Comparing DSpGEMV Kernels:" << std::endl;
+    doSpgemv<double> dspgemv(std::string(absPath), iters, startDim, upperLimit,
+                             doCpu, doGpu);
+    dspgemv.collectData();
+    std::cout << "Finished!" << std::endl;
+  }
+
   free(absPath);
 
   return 0;
@@ -150,49 +225,20 @@ void getParameters(int argc, char** argv) {
     } else if (!strcmp(argv[i], "--no_gpu")) {
       doGpu = false;
     } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) {
-      doSgemm = doDgemm = doSp_sgemm = doSp_dgemm =
-          doSgemv = doDgemv = doSp_sgemv = doSp_dgemv = false;
       std::string kernelList = argv[++i];
-      if (kernelList.find("sp-sgemm") != std::string::npos) {
-        doSp_sgemm = true;
-        if (kernelList.find("sgemm") != std::string::npos &&
-            kernelList.find("sgemm") != kernelList.find("sp-sgemm") + 3) {
-          doSgemm = true;
-        }
-      } else if (kernelList.find("sgemm") != std::string::npos) {
-        doSgemm = true;
-      }
-      if (kernelList.find("sp-dgemm") != std::string::npos) {
-        doSp_dgemm = true;
-        if (kernelList.find("dgemm") != std::string::npos &&
-            kernelList.find("dgemm") != kernelList.find("sp-dgemm") + 3) {
-          doDgemm = true;
-        }
-      } else if (kernelList.find("dgemm") != std::string::npos) {
-        doDgemm = true;
-      }
-
+      doSgemm = (kernelList.find("sgemm") != std::string::npos);
+      doDgemm = (kernelList.find("dgemm") != std::string::npos);
+      doSspgemm = (kernelList.find("sspgemm") != std::string::npos);
+      doDspgemm = (kernelList.find("dspgemm") != std::string::npos);
+      doSspmm = (kernelList.find("sspmm") != std::string::npos);
+      doDspmm = (kernelList.find("dspmm") != std::string::npos);
+      doSgemv = (kernelList.find("sgemv") != std::string::npos);
+      doDgemv = (kernelList.find("dgemv") != std::string::npos);
+      doSspgemv = (kernelList.find("sspgemv") != std::string::npos);
+      doDspgemv = (kernelList.find("dspgemv") != std::string::npos);
 
-      if (kernelList.find("sp-sgemv") != std::string::npos) {
-        doSp_sgemv = true;
-        if (kernelList.find("sgemv") != std::string::npos &&
-            kernelList.find("sgemv") != kernelList.find("sp-sgemv") + 3) {
-          doSgemv = true;
-        }
-      } else if (kernelList.find("sgemv") != std::string::npos) {
-        doSgemv = true;
-      }
-      if
(kernelList.find("sp-dgemv") != std::string::npos) { - doSp_dgemv = true; - if (kernelList.find("dgemv") != std::string::npos && - kernelList.find("dgemv") != kernelList.find("sp-dgemv") + 3) { - doDgemv = true; - } - } else if (kernelList.find("dgemv") != std::string::npos) { - doDgemv = true; - } - if (!doSgemm && !doDgemm && !doSp_sgemm && !doSp_dgemm && - !doSgemv && !doDgemv && !doSp_sgemv && !doSp_dgemv) { + if (!doSgemm && !doDgemm && !doSspgemm && !doDspgemm && + !doSgemv && !doDgemv && !doSspgemv && !doDspgemv) { std::cout << "ERROR - no implemented kernels in list" << std::endl; exit(1); } else { @@ -212,18 +258,16 @@ void getParameters(int argc, char** argv) { << " -o --output_dir The CSV file output directory" << std::endl; std::cout << " -i --iterations I Repeat each kernel I times " - "(default: " - << iters << ")" << std::endl; + "(default: " << iters << ")" << std::endl; std::cout << " -s --start_dimension S First value of M, N, K is S " - "(default: " - << startDim << ")" << std::endl; + "(default: " << startDim << ")" << std::endl; std::cout << " -d --dimension_limit D Max value of M, N, K is D " - "(default: " - << upperLimit << ")" << std::endl; + "(default: " << upperLimit << ")" << std::endl; std::cout << " -k --kernels <kernels> Comma-separated list of " - "kernels to be run. Options are sgemm, dgemm, sp-sgemm, " - "sp-dgemm (default: sgemm,dgemm,sp-gemm,sp-dgemm)" << - std::endl; + "kernels to be run. Options are sgemm, dgemm, sspgemm, " + "dspgemm, sspmm, dspmm, sgemv, dgemv, sspgemv, dspgemv " + "(default: `-k sgemm,dgemm,sspgemm,dspgemm,sspmm,dspmm," + "sgemv,dgemv,sspgemv,dspgemv`)" << std::endl; std::cout << std::endl; exit(0); } else { From 7819f6f6f1ea1f7849f274bc4b66f81d8d026ba2 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Wed, 8 Jan 2025 14:04:30 +0000 Subject: [PATCH 36/38] Moving spgemv into new format --- .idea/workspace.xml | 46 ++-- ArmPL/spgemv.hh | 31 +-- cuBLAS/spgemv.hh | 90 ++++--- include/doGemv.hh | 12 +- include/doSpgemv.hh | 429 +++++++++++++++++++++++++++++++++- include/kernels/CPU/spgemv.hh | 34 ++- include/kernels/CPU/spmm.hh | 5 +- include/kernels/GPU/spgemv.hh | 10 +- include/kernels/spgemv.hh | 135 ++++++++++- include/kernels/spmm.hh | 46 +--- include/utilities.hh | 110 ++++++++- 11 files changed, 772 insertions(+), 176 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 84d08df..3d4f373 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,30 +15,18 @@ </configurations> </component> <component name="ChangeListManager"> - <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Getting rid of old oneMKL sparse file"> - <change afterPath="$PROJECT_DIR$/include/doSpgemm.hh" afterDir="false" /> - <change afterPath="$PROJECT_DIR$/include/doSpgemv.hh" afterDir="false" /> - <change afterPath="$PROJECT_DIR$/include/doSpmm.hh" afterDir="false" /> - <change afterPath="$PROJECT_DIR$/include/kernels/CPU/spgmm.hh" afterDir="false" /> - <change afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" afterDir="false" /> - <change afterPath="$PROJECT_DIR$/include/kernels/spgemm.hh" afterDir="false" /> - <change afterPath="$PROJECT_DIR$/include/kernels/spgemv.hh" afterDir="false" /> - <change afterPath="$PROJECT_DIR$/include/kernels/spmm.hh" afterDir="false" /> + <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Refactoring to make individual files relate to a single kernel"> <change 
beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/AOCL/sp_gemm.hh" beforeDir="false" /> - <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spmm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/ArmPL/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spmm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/cuBLAS/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spgemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/doGemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/ArmPL/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/cuBLAS/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spgemv.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/CPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/GPU/sp_gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spmm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/GPU/sp_gemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/gemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/gemm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/main.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/main.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/src/main.cc" beforeDir="false" afterPath="$PROJECT_DIR$/src/main.cc" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/doSpgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpgemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spgemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spmm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/utilities.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/utilities.hh" afterDir="false" /> </list> <option name="SHOW_DIALOG" value="false" /> <option name="HIGHLIGHT_CONFLICTS" value="true" /> @@ -578,7 +566,15 @@ <option name="project" 
value="LOCAL" /> <updated>1735823512058</updated> </task> - <option name="localTasksCounter" value="48" /> + <task id="LOCAL-00048" summary="Refactoring to make individual files relate to a single kernel"> + <option name="closed" value="true" /> + <created>1736268772766</created> + <option name="number" value="00048" /> + <option name="presentableId" value="LOCAL-00048" /> + <option name="project" value="LOCAL" /> + <updated>1736268772766</updated> + </task> + <option name="localTasksCounter" value="49" /> <servers /> </component> <component name="TypeScriptGeneratedFilesManager"> @@ -596,7 +592,6 @@ </option> </component> <component name="VcsManagerConfiguration"> - <MESSAGE value="cuSPARSE unified memory implementation" /> <MESSAGE value="Now compiles" /> <MESSAGE value="Now compiles with fewer runtime errors" /> <MESSAGE value="Implementing other offload types - still some runtime errors" /> @@ -621,6 +616,7 @@ <MESSAGE value="Updating createGflopsGraphs.py to show sparsity" /> <MESSAGE value="Beginning gemv ARMPL" /> <MESSAGE value="Getting rid of old oneMKL sparse file" /> - <option name="LAST_COMMIT_MESSAGE" value="Getting rid of old oneMKL sparse file" /> + <MESSAGE value="Refactoring to make individual files relate to a single kernel" /> + <option name="LAST_COMMIT_MESSAGE" value="Refactoring to make individual files relate to a single kernel" /> </component> </project> \ No newline at end of file diff --git a/ArmPL/spgemv.hh b/ArmPL/spgemv.hh index f39a764..5045062 100644 --- a/ArmPL/spgemv.hh +++ b/ArmPL/spgemv.hh @@ -8,22 +8,22 @@ #include <algorithm> -#include "../include/kernels/CPU/sp_gemv.hh" +#include "../include/kernels/CPU/spgemv.hh" #include "../include/utilities.hh" namespace cpu { /** A class for GEMM CPU BLAS kernels. */ template <typename T> -class sp_gemv_cpu : public sp_gemv<T> { +class spgemv_cpu : public spgemv<T> { public: - using sp_gemv<T>::sp_gemv; - using sp_gemv<T>::callConsume; - using sp_gemv<T>::m_; - using sp_gemv<T>::n_; - using sp_gemv<T>::A_; - using sp_gemv<T>::x_; - using sp_gemv<T>::y_; - using sp_gemv<T>::nnz_; + using spgemv<T>::spgemv; + using spgemv<T>::callConsume; + using spgemv<T>::m_; + using spgemv<T>::n_; + using spgemv<T>::A_; + using spgemv<T>::x_; + using spgemv<T>::y_; + using spgemv<T>::nnz_; private: /** Make call to the GEMM kernel. */ @@ -62,7 +62,7 @@ class sp_gemv_cpu : public sp_gemv<T> { y_); } else { // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." + std::cout << "ERROR - Datatype for ArmPL CPU SPGEMM kernel not supported." << std::endl; exit(1); } @@ -156,7 +156,7 @@ class sp_gemv_cpu : public sp_gemv<T> { /** The constant value Beta. */ const T beta = BETA; - void toCSR_armpl() { + void toSparseFormat() { n_armpl_ = n_; // ToDo -- check whether flags_ is correct! 
flags_ = 0;
@@ -168,7 +168,7 @@ class sp_gemv_cpu : public sp_gemv<T> {
 
     A_armpl_row_ptr_[0] = 0;
     int nnz_encountered = 0;
-    for (int row = 0; row < n_; row++) {
+    for (int row = 0; row < m_; row++) {
       A_armpl_row_ptr_[row + 1] = nnz_encountered;
       for (int col = 0; col < n_; col++) {
         if (A_[(row * n_) + col] != 0.0) {
@@ -183,7 +183,7 @@ class sp_gemv_cpu : public sp_gemv<T> {
 //      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
 //               nnz_, flags_);
       status_ = armpl_spmat_create_csr_s(&A_armpl_,
-                                         n_armpl_,
+                                         m_armpl_,
                                          n_armpl_,
                                          A_armpl_row_ptr_,
                                          A_armpl_col_index_,
@@ -197,7 +197,7 @@ class sp_gemv_cpu : public sp_gemv<T> {
 //      printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_,
 //               nnz_, flags_
       status_ = armpl_spmat_create_csr_d(&A_armpl_,
-                                         n_armpl_,
+                                         m_armpl_,
                                          n_armpl_,
                                          A_armpl_row_ptr_,
                                          A_armpl_col_index_,
@@ -239,6 +239,7 @@ class sp_gemv_cpu : public sp_gemv<T> {
   armpl_int_t flags_;
 
   armpl_int_t n_armpl_;
+  armpl_int_t m_armpl_;
 
   T* A_vals_;
   armpl_int_t* A_armpl_row_ptr_;
diff --git a/cuBLAS/spgemv.hh b/cuBLAS/spgemv.hh
index f35a63a..2076488 100644
--- a/cuBLAS/spgemv.hh
+++ b/cuBLAS/spgemv.hh
@@ -7,27 +7,27 @@
 #include <random>
 #include <iostream>
 
-#include "../include/kernels/GPU/sp_gemv.hh"
+#include "../include/kernels/GPU/spgemv.hh"
 #include "../include/utilities.hh"
 #include "common.hh"
 
 namespace gpu {
 /** A class for sparse GEMM GPU BLAS kernels. */
 template <typename T>
-class sp_gemv_gpu : public sp_gemv<T> {
+class spgemv_gpu : public spgemv<T> {
  public:
-  using sp_gemv<T>::sp_gemv;
-  using sp_gemv<T>::initInputMatrixVectorSparse;
-//  using sp_gemv<T>::toCSR_int;
-  using sp_gemv<T>::m_;
-  using sp_gemv<T>::n_;
-  using sp_gemv<T>::A_;
-  using sp_gemv<T>::x_;
-  using sp_gemv<T>::y_;
-  using sp_gemv<T>::offload_;
-  using sp_gemv<T>::sparsity_;
-
-  ~sp_gemv_gpu() {
+  using spgemv<T>::spgemv;
+  using spgemv<T>::initInputMatrixVector;
+  using spgemv<T>::nnz_;
+  using spgemv<T>::m_;
+  using spgemv<T>::n_;
+  using spgemv<T>::A_;
+  using spgemv<T>::x_;
+  using spgemv<T>::y_;
+  using spgemv<T>::offload_;
+  using spgemv<T>::sparsity_;
+
+  ~spgemv_gpu() {
    // ToDo -- destroy the handle
 
    // Destroy streams after use
@@ -45,14 +45,15 @@ class sp_gemv_gpu : public sp_gemv<T> {
    * - Always: Move data from host to device and device to host each iteration
    * - Unified: Initialise data as unified memory; no data movement semantics
    *            required */
-  void initialise(gpuOffloadType offload, int n, float sparsity) override {
-    std::cout << std::endl << "##############################" << std::endl
-              << "\tCUSPARSE GEMV\t\tInitialising n = " << n << "\tOffload"
-              << " type = " <<
-              (((offload == gpuOffloadType::unified) ? "Unified" : (offload
-              == gpuOffloadType::always) ? 
"Always" : "Once")) +// << std::endl +// << "##############################" << std::endl; offload_ = offload; sparsity_ = sparsity; @@ -83,6 +84,7 @@ class sp_gemv_gpu : public sp_gemv<T> { std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; exit(1); } + m_ = m; n_ = n; // Initialise 3 streams to asynchronously move data between host and device @@ -93,13 +95,11 @@ class sp_gemv_gpu : public sp_gemv<T> { std::cout << "\tcuda streams created" << std::endl; - // Work out the sizes of all the vectors - A_nnz_ = 1 + (uint64_t)(n_ * n_ * (1 - sparsity)); - vals_size_ = sizeof(T) * A_nnz_; - cols_size_ = sizeof(int) * A_nnz_; - rows_size_ = sizeof(int) * (n_ + 1); + vals_size_ = sizeof(T) * nnz_; + cols_size_ = sizeof(int) * nnz_; + rows_size_ = sizeof(int) * (m_ + 1); x_size_ = sizeof(T) * n_; - y_size_ = sizeof(T) * n_; + y_size_ = sizeof(T) * m_; if (offload_ == gpuOffloadType::unified) { // Get device identifier @@ -141,18 +141,14 @@ class sp_gemv_gpu : public sp_gemv<T> { // Initialise the matrices // Set initial values to 0 - A_ = (T*)malloc(sizeof(T) * n_ * n_); + A_ = (T*)malloc(sizeof(T) * m_ * n_); std::cout << "\tA_ dense array made" << std::endl; - initInputMatrixVectorSparse();git branc + initInputMatrixVector(); std::cout << "\tinputs made" << std::endl; - toCSR_int(A_, n_, n_, A_val_, A_col_, A_row_); - - std::cout << "\tA_ moved to CSR" << std::endl; - // std::cout << "_____Matrix A_____" << std::endl; // printDenseMatrix(A_, n_, n_); // std::cout << std::endl << std::endl; @@ -172,7 +168,7 @@ class sp_gemv_gpu : public sp_gemv<T> { case gpuOffloadType::always: { // Make matrix descriptor cusparseCheckError( - cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + cusparseCreateCsr(&descrA_, m_, n_, nnz_, A_row_dev_, A_col_dev_, A_val_dev_, rType_, cType_, indType_, cudaDataType_)); std::cout << "\tA_ description made" << std::endl; @@ -180,7 +176,7 @@ class sp_gemv_gpu : public sp_gemv<T> { cusparseCheckError(cusparseCreateDnVec(&descrx_, n_, x_dev_, cudaDataType_)); std::cout << "\tx_ description made" << std::endl; - cusparseCheckError(cusparseCreateDnVec(&descry_, n_, NULL, + cusparseCheckError(cusparseCreateDnVec(&descry_, m_, NULL, cudaDataType_)); std::cout << "\ty_ description made" << std::endl; break; @@ -204,7 +200,7 @@ class sp_gemv_gpu : public sp_gemv<T> { // Create matrix descriptor cusparseCheckError( - cusparseCreateCsr(&descrA_, n_, n_, A_nnz_, A_row_dev_, + cusparseCreateCsr(&descrA_, m_, n_, nnz_, A_row_dev_, A_col_dev_, A_val_dev_, rType_, cType_, indType_, cudaDataType_)); std::cout << "\tA_ description made" << std::endl; @@ -212,7 +208,7 @@ class sp_gemv_gpu : public sp_gemv<T> { cusparseCheckError(cusparseCreateDnVec(&descrx_, n_, x_dev_, cudaDataType_)); std::cout << "\tx_ description made" << std::endl; - cusparseCheckError(cusparseCreateDnVec(&descry_, n_, NULL, + cusparseCheckError(cusparseCreateDnVec(&descry_, m_, NULL, cudaDataType_)); std::cout << "\ty_ description made" << std::endl; break; @@ -508,16 +504,14 @@ class sp_gemv_gpu : public sp_gemv<T> { cudaCheckError(cudaStreamDestroy(s3_)); } - - void toCSR_int(T* dense, int n_col, int n_row, T* vals, int* col_index, - int* row_ptr) { + void toSparseFormat() { int nnz_encountered = 0; - for (int row = 0; row < n_row; row++) { - row_ptr[row] = nnz_encountered; - for (int col = 0; col < n_col; col++) { - if (dense[(row * n_) + col] != 0.0) { - col_index[nnz_encountered] = col; - vals[nnz_encountered] = dense[(row * n_) + col]; + for (int row = 0; row < m_; row++) { + A_row_[row] = 
nnz_encountered; + for (int col = 0; col < n_; col++) { + if (A_[(row * n_) + col] != 0.0) { + A_col_[nnz_encountered] = col; + A_val_[nnz_encountered] = A_[(row * n_) + col]; nnz_encountered++; } } @@ -606,7 +600,7 @@ class sp_gemv_gpu : public sp_gemv<T> { T* A_val_dev_; int *A_col_dev_, *A_row_dev_; /** Metadata */ - uint64_t A_nnz_, vals_size_, cols_size_, rows_size_; + uint64_t vals_size_, cols_size_, rows_size_; /** * ################################ diff --git a/include/doGemv.hh b/include/doGemv.hh index ebc9262..0068a1c 100644 --- a/include/doGemv.hh +++ b/include/doGemv.hh @@ -68,7 +68,7 @@ class doGemv { initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = dim; - callDenseKernels(csvFile, dim, dim); + callKernels(csvFile, dim, dim); } // Close file csvFile.close(); @@ -93,7 +93,7 @@ class doGemv { int N = startDimention_; int M = 16 * N; while (M <= upperLimit_) { - callDenseKernels(csvFile, M, N); + callKernels(csvFile, M, N); M += 16; N++; } @@ -119,7 +119,7 @@ class doGemv { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = dim, N = 32; - callDenseKernels(csvFile, dim, 32); + callKernels(csvFile, dim, 32); } } // Close file @@ -144,7 +144,7 @@ class doGemv { M = startDimention_; N = 16 * M; while (N <= upperLimit_) { - callDenseKernels(csvFile, M, N); + callKernels(csvFile, M, N); M++; N += 16; } @@ -170,7 +170,7 @@ class doGemv { if (upperLimit_ >= 32) { for (int dim = startDimention_; dim <= upperLimit_; dim++) { // M = 32, N = dim; - callDenseKernels(csvFile, 32, dim); + callKernels(csvFile, 32, dim); } } // Close file @@ -185,7 +185,7 @@ class doGemv { private: /** Call the appropriate CPU and GPU GEMV kernels. */ - void callDenseKernels(std::ofstream& csvFile, const int M, const int N) { + void callKernels(std::ofstream& csvFile, const int M, const int N) { const double probSize = calcKib(M, N); const uint64_t flops = calcFlops(M, N); std::string kernelName = getKernelName(); diff --git a/include/doSpgemv.hh b/include/doSpgemv.hh index cf315e0..c2c6a3d 100644 --- a/include/doSpgemv.hh +++ b/include/doSpgemv.hh @@ -1,8 +1,425 @@ -// -// Created by Alexander Cockrean on 07/01/2025. -// +#pragma once +#include <sstream> +#include <type_traits> -#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMV_HH -#define GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMV_HH +#include "helpers.hh" +#include "tablePrinter.hh" +#include "utilities.hh" -#endif //GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMV_HH +#if defined CPU_ARMPL +#include "../ArmPL/spgemv.hh" +#elif defined CPU_ONEMKL +// Todo #include "../oneMKL/CPU/spgemv.hh" +#elif defined CPU_AOCL +// Todo #include "../AOCL/spgemv.hh" +#elif defined CPU_NVPL +// Todo #include "../NVPL/spgemv.hh" +#elif defined CPU_OPENBLAS +// Todo #include "../OpenBLAS/spgemv.hh" +#endif + +#if defined GPU_CUBLAS +#include "../cuBLAS/spgemv.hh" +#elif defined GPU_ONEMKL +// Todo #include "../oneMKL/GPU/spgemv.hh" +#elif defined GPU_ROCBLAS +// Todo #include "../rocBLAS/spgemv.hh" +#endif + +/** `T` represents the type of kernel that will be run - i.e. T=float is for + * SSPGEMV. 
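+ * Typical use mirrors the driver in src/main.cc:
+ *
+ *   doSpgemv<float> sspgemv(csvDir, iters, startDim, upperLimit,
+ *                           doCpu, doGpu);
+ *   sspgemv.collectData();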
*/ +template <typename T> +class doSpgemv { +public: + doSpgemv(const std::string csvDir, const int iters, const int startDim, + const int upperLimit, const bool cpuEnabled = true, + const bool gpuEnabled = true) + : CSV_DIR(csvDir), + iterations_(iters), + startDimention_(startDim), + upperLimit_(upperLimit), + doCPU_(cpuEnabled), + doGPU_(gpuEnabled) +#if CPU_ENABLED + , + cpu_(iterations_) +#endif +#if GPU_ENABLED + , + gpu_(iterations_) +#endif + { + static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) && + "ERROR - doSpgemv can only be constructed using one of the " + "following types: [float, double]."); + } + + /** Run all problem types and write data to CSV files. */ + void collectData() { + // Square Problem Sizes... + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = + initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = dim; + callKernels(csvFile, dim, dim); + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Vector (M=N)"); + } +#endif + } + +private: + /** Call the appropriate CPU and GPU SPGEMV kernels. */ + void callKernels(std::ofstream& csvFile, const int M, const int N) { + const double probSize = calcKib(M, N); + const uint64_t flops = calcFlops(M, N); + std::string kernelName = getKernelName(); + + time_checksum_gflop cpuResult; + time_checksum_gflop gpuResult_once; + time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_unified; + +// Perform CPU kernel +#if CPU_ENABLED + if (doCPU_) { + cpu_.initialise(M, N); + cpuResult = cpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + // Write result to CSV file + writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, 0.0, + iterations_, cpuResult.runtime, cpuResult.gflops); + } +#endif + +// Perform the GPU kernels +#if GPU_ENABLED + if (doGPU_) { + // - ONCE : Offload to/from GPU once before all iterations and once + // after + gpu_.initialise(gpuOffloadType::once, M, N); + gpuResult_once = gpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + + // - ALWAYS: Offload to/from GPU every iteration + gpu_.initialise(gpuOffloadType::always, M, N); + gpuResult_always = gpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + gpu_.initialise(gpuOffloadType::unified, M, N); + gpuResult_unified = gpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + + // Write results to CSV file + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, 0, probSize, + 0.0, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, 0, + probSize, 0.0, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); + writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, 0, probSize, + 0.0, iterations_, gpuResult_unified.runtime, + 
gpuResult_unified.gflops); + } +#endif + +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Make sure all checksums match if CPU and GPU kernels are run. + // - The majority of BLAS Libraries guarentee the same result if a + // function + // is called multiple times. Given all input matrices are identical for + // each GPU offload type, we need only to compare the CPU and GPU + // checksums. + checkChecksums(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N); + + // Check if offload structs should be reset + checkOffloadStructReset(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified); + + // Check if offload threshold has been achieved for each GPU offload type. + updateOffloadStructs(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N, probSize); + + // Update previous results + prev_gpuResult_once = gpuResult_once; + prev_gpuResult_always = gpuResult_always; + prev_gpuResult_unified = gpuResult_unified; + } +#endif + } + + /** Todo -- find a sensible way to do this for sparse */ + void checkChecksums(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N) { + // Ensure that each checksum difference is less than 0.1% +// double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum); +// if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) * +// hundredOverChecksum)) > 0.1 && +// ((std::fabs(cpuResult.checksum - gpuResult_always.checksum) * +// hundredOverChecksum)) > 0.1 && +// ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) * +// hundredOverChecksum)) > 0.1) { +// std::cerr << "ERROR - " << getKernelName() +// << " kernel checksums do not match:\n\tInput " +// "dimensions: M=" +// << M << ", N=" << N << std::endl; +// std::cerr << std::setprecision(10) +// << "\tCPU Checksum = " << cpuResult.checksum << std::endl; +// std::cerr << std::setprecision(10) +// << "\tGPU (Once) Checksum = " << gpuResult_once.checksum +// << std::endl; +// std::cerr << std::setprecision(10) +// << "\tGPU (Always) Checksum = " << gpuResult_always.checksum +// << std::endl; +// std::cerr << std::setprecision(10) +// << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum +// << std::endl; +// exit(1); +// } + } + + + /** Check whether the offload structures need to be reset; and doing so if + * required. + * - If CPU.gflops >= GPU.gflops for last two problem sizes, then reset + * offload structures as GPU may not necessarily have reached the offload + * threshold. 
+   */
+  void checkOffloadStructReset(time_checksum_gflop cpuResult,
+                               time_checksum_gflop gpuResult_once,
+                               time_checksum_gflop gpuResult_always,
+                               time_checksum_gflop gpuResult_unified) {
+    if ((cpuGpu_once_.M != 0) && (cpuResult.gflops >= gpuResult_once.gflops) &&
+        (cpuResult.gflops >= prev_gpuResult_once.gflops)) {
+      cpuGpu_once_.cpuGflops = 0.0;
+      cpuGpu_once_.gpuGflops = 0.0;
+      cpuGpu_once_.probSize_kib = 0.0;
+      cpuGpu_once_.M = 0;
+      cpuGpu_once_.N = 0;
+    }
+    if ((cpuGpu_always_.M != 0) &&
+        (cpuResult.gflops >= gpuResult_always.gflops) &&
+        (cpuResult.gflops >= prev_gpuResult_always.gflops)) {
+      cpuGpu_always_.cpuGflops = 0.0;
+      cpuGpu_always_.gpuGflops = 0.0;
+      cpuGpu_always_.probSize_kib = 0.0;
+      cpuGpu_always_.M = 0;
+      cpuGpu_always_.N = 0;
+    }
+    if ((cpuGpu_unified_.M != 0) &&
+        (cpuResult.gflops >= gpuResult_unified.gflops) &&
+        (cpuResult.gflops >= prev_gpuResult_unified.gflops)) {
+      cpuGpu_unified_.cpuGflops = 0.0;
+      cpuGpu_unified_.gpuGflops = 0.0;
+      cpuGpu_unified_.probSize_kib = 0.0;
+      cpuGpu_unified_.M = 0;
+      cpuGpu_unified_.N = 0;
+    }
+  }
+
+  /** Update the offload threshold structs if GPU.gflops > CPU.gflops. */
+  void updateOffloadStructs(time_checksum_gflop cpuResult,
+                            time_checksum_gflop gpuResult_once,
+                            time_checksum_gflop gpuResult_always,
+                            time_checksum_gflop gpuResult_unified, const int M,
+                            const int N, const double probSize) {
+    if ((cpuGpu_once_.M == 0) && cpuResult.gflops < gpuResult_once.gflops) {
+      cpuGpu_once_.cpuGflops = cpuResult.gflops;
+      cpuGpu_once_.gpuGflops = gpuResult_once.gflops;
+      cpuGpu_once_.probSize_kib = probSize;
+      cpuGpu_once_.M = M;
+      cpuGpu_once_.N = N;
+    }
+    if ((cpuGpu_always_.M == 0) && cpuResult.gflops < gpuResult_always.gflops) {
+      cpuGpu_always_.cpuGflops = cpuResult.gflops;
+      cpuGpu_always_.gpuGflops = gpuResult_always.gflops;
+      cpuGpu_always_.probSize_kib = probSize;
+      cpuGpu_always_.M = M;
+      cpuGpu_always_.N = N;
+    }
+    if ((cpuGpu_unified_.M == 0) &&
+        cpuResult.gflops < gpuResult_unified.gflops) {
+      cpuGpu_unified_.cpuGflops = cpuResult.gflops;
+      cpuGpu_unified_.gpuGflops = gpuResult_unified.gflops;
+      cpuGpu_unified_.probSize_kib = probSize;
+      cpuGpu_unified_.M = M;
+      cpuGpu_unified_.N = N;
+    }
+  }
+
+  /** Todo -- work out how this can be determined for a sparse problem with
+   * an unknown algorithm (for a plain SpMV over nnz stored nonzeros the
+   * useful work is roughly 2*nnz flops; the dense 2MN formula below is kept
+   * as a placeholder).
+   * A function for calculating FLOPs performed by a GEMV.
+   * y = alpha*Ax + beta*y */
+  constexpr uint64_t calcFlops(const int M, const int N) const {
+    // A * x = 2*M*N (FMA)
+    // alpha * Ax = M (multiplication)
+    // beta * y = M (multiplication)
+    // Ax + y = M (addition)
+    // = 2MN + M + M + M
+
+    // If beta==0; = 2MN + M ------- alpha*Ax Always done
+    // Else;       = 2MN + 3M
+    uint64_t scalar = (BETA != 0) ? 3 : 1;
+    return (2 * (uint64_t)M * (uint64_t)N) + (scalar * (uint64_t)M);
+  }
+
+  /** A function for calculating the total GEMV problem size in KiB. */
+  constexpr double calcKib(const int M, const int N) const {
+    uint64_t M_ = (uint64_t)M, N_ = (uint64_t)N;
+    uint64_t probSize = (M_ * N_) + N_ + M_;
+    return ((double)(probSize * (sizeof(T))) / 1024);
+  }
+
+  /** Get the name of the kernel being run. */
+  std::string getKernelName() const {
+    switch (sizeof(T)) {
+      case 4:
+        return "sspgemv";
+      case 8:
+        return "dspgemv";
+      default:
+        return "unknown";
+    }
+  }
+
+  /** Print to stdout the offload thresholds. */
+  void printOffloadThreshold(std::string problemName) const {
+    std::vector<std::string> header = {
+        "Device", "M", "N", "Total Prob. 
Size (KiB)", "GFLOP/s", "CPU GFLOP/s"}; + + std::vector<std::vector<std::string>> rows; + // Initialise GPU_Once row + std::stringstream probSize_o; + std::stringstream gpuGflops_o; + std::stringstream cpuGflops_o; + probSize_o << std::fixed << std::setprecision(2) + << cpuGpu_once_.probSize_kib; + gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops; + cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops; + if (cpuGpu_once_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Once)", std::to_string(0), + std::to_string(0), probSize_o.str(), "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Once)", std::to_string(cpuGpu_once_.M), + std::to_string(cpuGpu_once_.N), probSize_o.str(), + gpuGflops_o.str(), cpuGflops_o.str()}); + } + + // Initialise GPU_always row + std::stringstream probSize_a; + std::stringstream gpuGflops_a; + std::stringstream cpuGflops_a; + probSize_a << std::fixed << std::setprecision(2) + << cpuGpu_always_.probSize_kib; + gpuGflops_a << std::fixed << std::setprecision(2) + << cpuGpu_always_.gpuGflops; + cpuGflops_a << std::fixed << std::setprecision(2) + << cpuGpu_always_.cpuGflops; + if (cpuGpu_always_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Always)", std::to_string(0), + std::to_string(0), probSize_a.str(), "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Always)", std::to_string(cpuGpu_always_.M), + std::to_string(cpuGpu_always_.N), probSize_a.str(), + gpuGflops_a.str(), cpuGflops_a.str()}); + } + + // Initialise GPU_unified row + std::stringstream probSize_u; + std::stringstream gpuGflops_u; + std::stringstream cpuGflops_u; + probSize_u << std::fixed << std::setprecision(2) + << cpuGpu_unified_.probSize_kib; + gpuGflops_u << std::fixed << std::setprecision(2) + << cpuGpu_unified_.gpuGflops; + cpuGflops_u << std::fixed << std::setprecision(2) + << cpuGpu_unified_.cpuGflops; + if (cpuGpu_unified_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Unified Memory)", std::to_string(0), + std::to_string(0), probSize_u.str(), "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Unified Memory)", std::to_string(cpuGpu_unified_.M), + std::to_string(cpuGpu_unified_.N), probSize_u.str(), + gpuGflops_u.str(), cpuGflops_u.str()}); + } + + // Print table + tablePrinter tPrinter( + problemName + " Problem Domian GPU Offload Thresholds:", header, rows); + tPrinter.print(1); + } + + /** The output directory where CSV files should be saved to. */ + const std::string CSV_DIR; + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** The value of the first probelm size dimention run. */ + const int startDimention_; + + /** The maximum value of the largest problem size dimention. */ + const int upperLimit_; + + /** Whether the CPU kernels should be run. */ + const bool doCPU_ = true; + + /** Whether the GPU kernels should be run. */ + const bool doGPU_ = true; + +#if CPU_ENABLED + /** The GEMV CPU kernel. */ + cpu::gemv_cpu<T> cpu_; +#endif + +#if GPU_ENABLED + /** The GEMV GPU kernel. */ + gpu::gemv_gpu<T> gpu_; +#endif + + /** The point at which offloading to GPU (offload once) becomes worthwhile. */ + cpuGpu_offloadThreshold cpuGpu_once_; + + /** The point at which offloading to GPU (offload always) becomes worthwhile. + */ + cpuGpu_offloadThreshold cpuGpu_always_; + + /** The point at which offloading to GPU (unified memory) becomes worthwhile. 
+   */
+  cpuGpu_offloadThreshold cpuGpu_always_;
+
+  /** The point at which offloading to GPU (unified memory) becomes worthwhile.
+   */
+  cpuGpu_offloadThreshold cpuGpu_unified_;
+
+  /** The previous problem size's GPU (offload once) performance results. */
+  time_checksum_gflop prev_gpuResult_once;
+
+  /** The previous problem size's GPU (offload always) performance results. */
+  time_checksum_gflop prev_gpuResult_always;
+
+  /** The previous problem size's GPU (unified memory) performance results. */
+  time_checksum_gflop prev_gpuResult_unified;
+};
\ No newline at end of file
diff --git a/include/kernels/CPU/spgemv.hh b/include/kernels/CPU/spgemv.hh
index 28b0caf..84722c2 100644
--- a/include/kernels/CPU/spgemv.hh
+++ b/include/kernels/CPU/spgemv.hh
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "../gemv.hh"
+#include "../spgemv.hh"
 
 #include <random>
 #include <memory>
@@ -9,44 +9,42 @@ namespace cpu {
 
 /** An abstract class for GEMV BLAS kernels. */
 template <typename T>
- class sp_gemv : public ::gemv<T> {
+ class spgemv : public ::spgemv<T> {
  public:
-  using ::gemv<T>::gemv;
-  using ::gemv<T>::initInputMatrixVectorSparse;
-  using ::gemv<T>::m_;
-  using ::gemv<T>::n_;
-  using ::gemv<T>::A_;
-  using ::gemv<T>::x_;
-  using ::gemv<T>::y_;
-  using ::gemv<T>::sparsity_;
+  using ::spgemv<T>::spgemv;
+  using ::spgemv<T>::initInputMatrixVector;
+  using ::spgemv<T>::m_;
+  using ::spgemv<T>::n_;
+  using ::spgemv<T>::A_;
+  using ::spgemv<T>::x_;
+  using ::spgemv<T>::y_;
+  using ::spgemv<T>::sparsity_;
+  using ::spgemv<T>::nnz_;
 
  public:
   /** Initialise the required data structures. */
-  void initialise(int n, double sparsity) {
-    m_ = n;
+  void initialise(int m, int n, double sparsity) {
+    m_ = m;
     n_ = n;
     sparsity_ = sparsity;
 
     // Note that the below should be the same as the edges calculation
    // used in the initInputMatricesSparse function.  If changed here,
    // change there
-    nnz_ = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - sparsity_));
+    nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_));
 
     A_ = (T*)malloc(sizeof(T) * m_ * n_);
     x_ = (T*)malloc(sizeof(T) * n_);
     y_ = (T*)malloc(sizeof(T) * m_);
 
     // Initialise the matrix and vectors
-    initInputMatrixVectorSparse();
+    initInputMatrixVector();
   }
 
- protected:
-  uint64_t nnz_;
-
 private:
   /** Do any necessary cleanup (free pointers, close library handles, etc.)
   * after Kernel has been called. */
-  void postCallKernelCleanup() override {
+  void postCallKernelCleanup() {
    free(A_);
    free(x_);
    free(y_);
diff --git a/include/kernels/CPU/spmm.hh b/include/kernels/CPU/spmm.hh
index 7d19f5d..d90f48b 100644
--- a/include/kernels/CPU/spmm.hh
+++ b/include/kernels/CPU/spmm.hh
@@ -18,6 +18,7 @@ public:
   using ::spmm<T>::iterations_;
   using ::spmm<T>::nnzA_;
   using ::spmm<T>::nnzB_;
+  using ::spmm<T>::sparsity_;
   using ::spmm<T>::m_;
   using ::spmm<T>::n_;
   using ::spmm<T>::k_;
@@ -43,7 +44,7 @@ public:
     B_ = (T*)malloc(sizeof(T) * k_ * n_);
     C_ = (T*)calloc(m_ * n_, sizeof(T));
 
-    initInputMatrices(sparsity_);
+    initInputMatrices();
   }
 
 private:
@@ -54,7 +55,5 @@ private:
     free(B_);
     free(C_);
   }
-
-  double sparsity_;
 };
}  // namespace cpu
diff --git a/include/kernels/GPU/spgemv.hh b/include/kernels/GPU/spgemv.hh
index 75fd126..0a93c77 100644
--- a/include/kernels/GPU/spgemv.hh
+++ b/include/kernels/GPU/spgemv.hh
@@ -1,14 +1,14 @@
 #pragma once
 
-#include "../gemv.hh"
+#include "../spgemv.hh"
 
 namespace gpu {
 
 /** An abstract class for GEMV BLAS kernels. */
 template <typename T>
- class sp_gemv : public ::gemv<T> {
+ class spgemv : public ::spgemv<T> {
  public:
-  using ::gemv<T>::gemv;
+  using ::spgemv<T>::spgemv;
 
  /** Initialise the required data structures.
* `offload` refers to the data offload type:
@@ -17,8 +17,8 @@ namespace gpu {
   * - Always: Move data from host to device and device to host each iteration
   * - Unified: Initialise data as unified memory; no data movement semantics
   *            required */
-  virtual void initialise(gpuOffloadType offload, int n, float sparsity)
-  = 0;
+  virtual void initialise(gpuOffloadType offload, int m, int n,
+                          float sparsity) = 0;
 
 protected:
  /** Whether data should be offloaded to/from the GPU each iteration, or just
diff --git a/include/kernels/spgemv.hh b/include/kernels/spgemv.hh
index 9e7d953..297b406 100644
--- a/include/kernels/spgemv.hh
+++ b/include/kernels/spgemv.hh
@@ -1,8 +1,131 @@
-//
-// Created by Alexander Cockrean on 07/01/2025.
-//
-#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMV_HH
-#define GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMV_HH
+#pragma once
 
-#endif //GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMV_HH
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <limits>
+#include <random>
+
+#include "../utilities.hh"
+
+/** A generic abstract class defining the operation of timing an SPGEMV BLAS
+ * kernel for n iterations. */
+template <typename T>
+class spgemv {
+public:
+  spgemv(const int iters) : iterations_(iters) {}
+
+  /** Call the BLAS kernel n times.
+   * Returns the time elapsed for n BLAS calls in seconds. */
+  time_checksum_gflop compute() {
+    // Start timer
+    std::chrono::time_point<std::chrono::high_resolution_clock> startTime =
+        std::chrono::high_resolution_clock::now();
+
+    // Perform all SPGEMV calls
+    preLoopRequirements();
+    for (int i = 0; i < iterations_; i++) {
+      callSpgemv();
+    }
+    postLoopRequirements();
+
+    // Stop Timer
+    std::chrono::time_point<std::chrono::high_resolution_clock> endTime =
+        std::chrono::high_resolution_clock::now();
+    // Get time elapsed in seconds
+    std::chrono::duration<double> time_s = endTime - startTime;
+
+    double checksum = calcChecksum();
+
+    postCallKernelCleanup();
+
+    return {time_s.count(), checksum, 0.0};
+  }
+
+  int64_t nnz_ = 0;
+
+private:
+  /** Perform any required steps before calling the SPGEMV kernel that should
+   * be timed. */
+  virtual void preLoopRequirements() = 0;
+
+  /** Perform the SPGEMV kernel. */
+  virtual void callSpgemv() = 0;
+
+  /** Perform any required steps after calling the SPGEMV kernel that should
+   * be timed. */
+  virtual void postLoopRequirements() = 0;
+
+  /** Do any necessary cleanup (free pointers, close library handles, etc.)
+   * after Kernel has been called. */
+  virtual void postCallKernelCleanup() = 0;
+
+  /** Calculate a checksum from the result vector y.
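+   * A fuller option (sketched only as a suggestion for the Todo below)
+   * would sum the whole output vector so that every element contributes:
+   *
+   *   double sum = 0.0;
+   *   for (int i = 0; i < m_; i++) sum += (double)y_[i];
+   *   return sum;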
 */
+  // Todo -- work out how to sensibly do this for sparse
+  constexpr double calcChecksum() {
+    // Checksum for GEMV calculated by summing the first and last elements of
+    // the output vector
+    return ((double)y_[0] + (double)y_[m_ - 1]);
+  }
+
+protected:
+  void initInputMatrixVector() {
+    // Initialise the matrix to all zeroes before populating it
+    for (size_t i = 0; i < (n_ * m_); i++) {
+      A_[i] = 0.0;
+    }
+
+    // Random number generator objects for use in descent
+    std::default_random_engine gen;
+    gen.seed(std::chrono::system_clock::now()
+                     .time_since_epoch().count());
+    std::uniform_real_distribution<double> dist(0.0, 1.0);
+
+    // Using a=0.45 and b=c=0.22 as default probabilities
+    for (size_t i = 0; i < nnz_; i++) {
+      while (!rMat(A_, m_, 0, n_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
+                   false)) {}
+    }
+
+    // Initialise the input and output vectors
+    for (int y = 0; y < n_; y++) {
+      x_[y] = (T)((double)(rand() % 100) / 3.0);
+    }
+    for (int y = 0; y < m_; y++) {
+      y_[y] = (T)0.0;
+    }
+
+    toSparseFormat();
+  }
+
+  /** Move the starting matrix into the sparse representation for the given
+   * library */
+  virtual void toSparseFormat() = 0;
+
+  /** Call the extern consume() function. */
+  void callConsume() { consume((void*)A_, (void*)x_, (void*)y_); }
+
+  /** The number of iterations to perform per problem size. */
+  const int iterations_;
+
+  /** Matrix dimension M. */
+  int m_ = 0;
+
+  /** Matrix / vector dimension N. */
+  int n_ = 0;
+
+  /** Input matrix A. */
+  T* A_;
+
+  /** Input vector x. */
+  T* x_;
+
+  /** Input vector y. */
+  T* y_;
+
+  /** The distance between two vector elements. */
+  const int vecIncrement_ = 1;
+
+  double sparsity_ = 0.0;
+};
diff --git a/include/kernels/spmm.hh b/include/kernels/spmm.hh
index 37de9cf..9d45f56 100644
--- a/include/kernels/spmm.hh
+++ b/include/kernels/spmm.hh
@@ -4,7 +4,6 @@
 #include <chrono>
 #include <cmath>
 #include <limits>
-#include <random>
 #include <iostream>
 
 #include "../utilities.hh"
@@ -94,7 +93,7 @@ protected:
                    false)) {}
   }
 
-    toSparseFormat()
+    toSparseFormat();
   }
 
   /** Move matrices into the sparse representation of for the given library */
@@ -103,47 +102,6 @@ protected:
   /** Call the external consume() function on the matrices */
   void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }
 
-  /** Recursive function to populate sparse matrices */
-  // On first iteration, n should be x2 + 1
-  bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b,
-            float c, std::default_random_engine* gen,
-            std::uniform_real_distribution<double> dist, bool bin) {
-    // If a 1x1 submatrix, then add an edge and return out
-    if (x1 >= x2 && y1 >= y2) {
-      // Needed to avoid overflow segfaults with large problem sizes
-      uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1);
-      if (abs(M[index]) > 0.1) {
-        return false;
-      } else {
-        // Add 1.0 if this is a binary graph, and a random real number otherwise
-        M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0);
-        return true;
-      }
-    } else {
-      // Divide up the matrix
-      int xMidPoint = x1 + floor((x2 - x1) / 2);
-      int yMidPoint = y1 + floor((y2 - y1) / 2);
-
-      // Work out which quarter to recurse into
-      // There are some ugly ternary operators here to avoid going out of bounds in the edge case
-      // that we are already at 1 width or 1 height
-      float randomNum = dist(*gen);
-      if (randomNum < a) {
-        return rMat(M, n, x1, xMidPoint, y1, yMidPoint,
-                    a, b, c, gen, dist, bin);
-      } else if (randomNum < (a + b)) {
-        return rMat(M, n, ((xMidPoint < x2) ?
xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, - a, b, c, gen, dist, bin); - } else if (randomNum < (a + b + c)) { - return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, - a, b, c, gen, dist, bin); - } else { - return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, - ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, a, - b, c, gen, dist, bin); - } - } - } - /** The number of iterations to perform per problem size. */ const int iterations_; @@ -165,4 +123,6 @@ protected: /** Dense representation of output matrix C. */ T* C_; + double sparsity_; + }; \ No newline at end of file diff --git a/include/utilities.hh b/include/utilities.hh index ac0aeb0..675ac2c 100644 --- a/include/utilities.hh +++ b/include/utilities.hh @@ -1,5 +1,7 @@ #pragma once +#include <random> + // Define CPU related macros #if defined CPU_ARMPL #define CPU_LIB_NAME "Arm Performance Libraries" @@ -76,4 +78,110 @@ struct cpuGpu_offloadThreshold { // performed. extern "C" { int consume(void* a, void* b, void* c); -} \ No newline at end of file +} + + +/** + * RMAT is a recursive function used to generate sparse matrices. It is + * needed for both single and double precision so I've simply overloaded this + * function to have M as both float and double types. Ugly, but works for + * now. + * Todo -- Consider different approach if other data types are supported in the + * future. + */ + +/** + * @param M input matrix + * @param n number of columns in the full matrix (i.e. full range of the x axis) + * @param x1 beginning x coordinate of the submatrix + * @param x2 ending x coordinate of the submatrix + * @param y1 starting y coordinate of the submatrix + * @param y2 ending y coordinate of the submatrix + * @param a probability of tile a being chosen + * @param b probability of tile b being chosen + * @param c probability of tile c being chosen + * @param gen random number generator + * @param dist random number distribution + * @param bin bool to decide whether values added are binary of float/double + * @return + */ +bool rMat(float* M, int n, int x1, int x2, int y1, int y2, float a, float b, + float c, std::default_random_engine* gen, + std::uniform_real_distribution<double> dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + // Needed to avoid overflow segfaults with large problem sizes + uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1); + if (abs(M[index]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds + // in the edge case that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + a, b, c, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + a, b, c, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + a, b, c, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? 
yMidPoint + 1 : yMidPoint), y2, a, + b, c, gen, dist, bin); + } + } + return true; +} +bool rMat(double* M, int n, int x1, int x2, int y1, int y2, float a, float b, + float c, std::default_random_engine* gen, + std::uniform_real_distribution<double> dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + // Needed to avoid overflow segfaults with large problem sizes + uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1); + if (abs(M[index]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + a, b, c, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + a, b, c, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + a, b, c, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, a, + b, c, gen, dist, bin); + } + } + return true; +} From d7ad2b7639095e5bfa2e7f4985be5aa22b7112e7 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Thu, 9 Jan 2025 15:44:59 +0000 Subject: [PATCH 37/38] Finishing off armpl and cusparse kernels --- .idea/workspace.xml | 33 +- ArmPL/spgemm.hh | 417 +++++++++++++++++++++ ArmPL/spmm.hh | 7 +- cuBLAS/spgemm.hh | 323 +++++++++++++++++ cuBLAS/spmm.hh | 8 +- include/doSpgemm.hh | 661 +++++++++++++++++++++++++++++++++- include/doSpmm.hh | 10 +- include/kernels/CPU/spgemm.hh | 56 +++ include/kernels/CPU/spgmm.hh | 8 - include/kernels/CPU/spmm.hh | 3 +- include/kernels/GPU/spgemm.hh | 32 +- include/kernels/spgemm.hh | 134 ++++++- include/kernels/spmm.hh | 3 + 13 files changed, 1641 insertions(+), 54 deletions(-) create mode 100644 ArmPL/spgemm.hh create mode 100644 cuBLAS/spgemm.hh create mode 100644 include/kernels/CPU/spgemm.hh delete mode 100644 include/kernels/CPU/spgmm.hh diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 3d4f373..8556bf2 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,18 +15,19 @@ </configurations> </component> <component name="ChangeListManager"> - <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Refactoring to make individual files relate to a single kernel"> + <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Moving spgemv into new format"> + <change afterPath="$PROJECT_DIR$/ArmPL/spgemm.hh" afterDir="false" /> + <change afterPath="$PROJECT_DIR$/cuBLAS/spgemm.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/ArmPL/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/cuBLAS/spgemv.hh" 
beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spgemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/doGemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doGemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/doSpgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpgemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/ArmPL/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spmm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/cuBLAS/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spmm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/doSpgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpgemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/doSpmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpmm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spgmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemm.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemv.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spgemv.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spgemm.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/include/kernels/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spmm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/utilities.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/utilities.hh" afterDir="false" /> </list> <option name="SHOW_DIALOG" value="false" /> <option name="HIGHLIGHT_CONFLICTS" value="true" /> @@ -574,7 +575,15 @@ <option name="project" value="LOCAL" /> <updated>1736268772766</updated> </task> - <option name="localTasksCounter" value="49" /> + <task id="LOCAL-00049" summary="Moving spgemv into new format"> + <option name="closed" value="true" /> + <created>1736345071717</created> + <option name="number" value="00049" /> + <option name="presentableId" value="LOCAL-00049" /> + <option name="project" value="LOCAL" /> + <updated>1736345071717</updated> + </task> + <option name="localTasksCounter" value="50" /> <servers /> </component> <component name="TypeScriptGeneratedFilesManager"> @@ -592,7 +601,6 @@ </option> </component> <component name="VcsManagerConfiguration"> - <MESSAGE value="Now compiles" /> <MESSAGE value="Now compiles with fewer runtime errors" /> <MESSAGE value="Implementing other offload types - still some runtime errors" /> <MESSAGE value="All implemented and running. 
No checksum at the end" />
@@ -617,6 +625,7 @@
     <MESSAGE value="Beginning gemv ARMPL" />
     <MESSAGE value="Getting rid of old oneMKL sparse file" />
     <MESSAGE value="Refactoring to make individual files relate to a single kernel" />
-    <option name="LAST_COMMIT_MESSAGE" value="Refactoring to make individual files relate to a single kernel" />
+    <MESSAGE value="Moving spgemv into new format" />
+    <option name="LAST_COMMIT_MESSAGE" value="Moving spgemv into new format" />
   </component>
 </project>
\ No newline at end of file
diff --git a/ArmPL/spgemm.hh b/ArmPL/spgemm.hh
new file mode 100644
index 0000000..0f9e81d
--- /dev/null
+++ b/ArmPL/spgemm.hh
@@ -0,0 +1,417 @@
+#pragma once
+
+#ifdef CPU_ARMPL
+#include <stdlib.h>
+#include <armpl.h>
+#include <omp.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include "../include/kernels/CPU/spgemm.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+/**
+ * A class for sparse matrix-dense matrix CPU BLAS kernels
+ */
+template <typename T>
+class spgemm_cpu : public spgemm<T> {
+public:
+  using spgemm<T>::spgemm;
+  using spgemm<T>::callConsume;
+  using spgemm<T>::m_;
+  using spgemm<T>::n_;
+  using spgemm<T>::k_;
+  using spgemm<T>::A_;
+  using spgemm<T>::B_;
+  using spgemm<T>::C_;
+  using spgemm<T>::nnz_;
+
+protected:
+  void toSparseFormat() override {
+
+    m_armpl_ = m_;
+    n_armpl_ = n_;
+    k_armpl_ = k_;
+
+    nnzA_ = nnz_;
+    nnzB_ = k_ * n_;
+    // ToDo -- check whether flags_ is correct!
+    flags_ = 0;
+
+    // Move A to CSR
+    A_armpl_row_ptr_ = new armpl_int_t[m_ + 1];
+    A_armpl_col_index_ = new armpl_int_t[nnzA_];
+    A_vals_ = new T[nnzA_];
+    A_armpl_row_ptr_[0] = 0;
+    int nnz_encountered = 0;
+
+    for (int row = 0; row < m_; row++) {
+      for (int col = 0; col < k_; col++) {
+        if (A_[(row * k_) + col] != 0.0) {
+          A_armpl_col_index_[nnz_encountered] = col;
+          A_vals_[nnz_encountered] = static_cast<T>(A_[(row * k_) + col]);
+          nnz_encountered++;
+        }
+      }
+      // Record the row pointer after the row is processed so that row r's
+      // entries span [A_armpl_row_ptr_[r], A_armpl_row_ptr_[r + 1])
+      A_armpl_row_ptr_[row + 1] = nnz_encountered;
+    }
+
+    // Move B to CSR -- B is dense, so it has a full k_ * n_ entries
+    B_armpl_row_ptr_ = new armpl_int_t[k_ + 1];
+    B_armpl_col_index_ = new armpl_int_t[nnzB_];
+    B_vals_ = new T[nnzB_];
+    B_armpl_row_ptr_[0] = 0;
+
+    nnz_encountered = 0;
+    for (int row = 0; row < k_; row++) {
+      for (int col = 0; col < n_; col++) {
+        if (B_[(row * n_) + col] != 0.0) {
+          B_armpl_col_index_[nnz_encountered] = col;
+          B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]);
+          nnz_encountered++;
+        }
+      }
+      B_armpl_row_ptr_[row + 1] = nnz_encountered;
+    }
+
+    // Move C to CSR -- C is m_ x n_, so its row pointer needs m_ + 1 entries
+    C_armpl_row_ptr_ = new armpl_int_t[m_ + 1];
+    C_armpl_col_index_ = new armpl_int_t[0];
+    C_vals_ = new T[0];
+    // ToDo -- the population of C below is commented out; check whether it
+    // is actually needed
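+    // (Assumption: armpl_spmm_exec_* works out the sparsity pattern of the
+    // output C itself, so an empty CSR structure may be enough here -- worth
+    // verifying against the ARMPL sparse documentation.)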
+// C_armpl_row_ptr_[0] = 0; +// +// nnz_encountered = 0; +// for (int row = 0; row < n_; row++) { +// C_armpl_row_ptr_[row + 1] = nnz_encountered; +// for (int col = 0; col < n_; col++) { +// if (B_[(row * n_) + col] != 0.0) { +// C_armpl_col_index_[nnz_encountered] = col; +// C_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]); +// nnz_encountered++; +// } +// } +// } + + if constexpr (std::is_same_v<T, float>) { +// printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&A_armpl_, + m_armpl_, + k_armpl_, + A_armpl_row_ptr_, + A_armpl_col_index_, + A_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&B_armpl_, + k_armpl_, + n_armpl_, + B_armpl_row_ptr_, + B_armpl_col_index_, + B_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_s(&C_armpl_, + m_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v<T, double>) { +// printCSR(n_armpl_, A_armpl_row_ptr_, A_armpl_col_index_, A_vals_, +// nnz_, flags_ + status_ = armpl_spmat_create_csr_d(&A_armpl_, + m_armpl_, + k_armpl_, + A_armpl_row_ptr_, + A_armpl_col_index_, + A_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, B_armpl_row_ptr_, B_armpl_col_index_, B_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_d(&B_armpl_, + k_armpl_, + n_armpl_, + B_armpl_row_ptr_, + B_armpl_col_index_, + B_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// printCSR(n_armpl_, C_armpl_row_ptr_, C_armpl_col_index_, C_vals_, +// nnz_, flags_); + status_ = armpl_spmat_create_csr_d(&C_armpl_, + m_armpl_, + n_armpl_, + C_armpl_row_ptr_, + C_armpl_col_index_, + C_vals_, + flags_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// std::cout << "Okay, all matrices made!!" << std::endl; + } + } + +private: + /** Make call to the GEMM kernel. */ + void callGemm() override { + + /** + * Flow of ARMPL Sparse LA: + * + * 1. Create sparse matrix objects: armpl_spmat_create_csr[sdcz]() + * + * 2. Supply hints on usage: armpl_spmat_hint() + * + * 3. Optimise for SpMV: armpl_spmv_optimize() + * + * 4. Solve SpMV case: armpl_spmv_exec_[sdcz]() + * + * 5. Destroy sparse matrix object: armpl_spmat_destroy() + * + * In addiion, users can choose to update a set of non-zero values using + * armpl_spmat_update_[sdcz]() + */ + + // Todo -- See if using armpl_spmat_hint can improve performance here. 
+  // If so, follow with optimisation functions
+
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = armpl_spmm_exec_s(transA_,
+                                  transB_,
+                                  alpha,
+                                  A_armpl_,
+                                  B_armpl_,
+                                  beta,
+                                  C_armpl_);
+    } else if constexpr (std::is_same_v<T, double>) {
+      status_ = armpl_spmm_exec_d(transA_,
+                                  transB_,
+                                  alpha,
+                                  A_armpl_,
+                                  B_armpl_,
+                                  beta,
+                                  C_armpl_);
+    } else {
+      // Un-specialised class will not do any work - print error and exit.
+      std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported."
+                << std::endl;
+      exit(1);
+    }
+
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    // Ensure compiler doesn't optimise away the work being done
+    callConsume();
+  }
+
+  /** Perform any required steps before calling the GEMM kernel that should
+   * be timed. */
+  void preLoopRequirements() override {
+    // A_ and B_ have already been packed into A_armpl_ and B_armpl_ by
+    // toSparseFormat(), which runs as part of initialise()
+
+    /** Provide hints to ARMPL and optimise the matrix data structures */
+    // TODO -- is noallocs best here?
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_MEMORY,
+                               ARMPL_SPARSE_MEMORY_NOALLOCS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_MEMORY,
+                               ARMPL_SPARSE_MEMORY_NOALLOCS);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+    status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_STRUCTURE,
+                               ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_STRUCTURE,
+                               ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+    if (status_ != ARMPL_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+    // TODO -- will this be FEW?
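+    // (compute() executes the kernel iterations_ times per problem size, so
+    // MANY looks like the right hint; FEW would only suit a handful of
+    // executions per matrix.)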
+ status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_MANY); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_MANY); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + // TODO -- investigate whch is better here + status_ = armpl_spmat_hint(A_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_PART_STRUCT); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + +// TODO -- this is thorwing an error -- couldn't immediately fix so come +// back to + +// /** provide hints for the optimisation of the spmm execution */ +// status_ = armpl_spmm_optimize(ARMPL_SPARSE_OPERATION_NOTRANS, +// ARMPL_SPARSE_OPERATION_NOTRANS, +// ARMPL_SPARSE_SCALAR_ONE, +// A_armpl_, B_armpl_, +// ARMPL_SPARSE_SCALAR_ZERO, +// C_armpl_); +// if (status_ != ARMPL_STATUS_SUCCESS) { +// std::cout << "ERROR " << status_ << std::endl; +// exit(1); +// } + } + + + void postLoopRequirements() override { + status_ = armpl_spmat_destroy(A_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_destroy(B_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_destroy(C_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + delete [] A_armpl_row_ptr_; + delete [] A_armpl_col_index_; + delete [] A_vals_; + delete [] B_armpl_row_ptr_; + delete [] B_armpl_col_index_; + delete [] B_vals_; + delete [] C_armpl_row_ptr_; + delete [] C_armpl_col_index_; + delete [] C_vals_; + + } + + /** The constant value Alpha. */ + const T alpha = ALPHA; + + /** The constant value Beta. 
*/ + const T beta = BETA; + + void printCSR(armpl_int_t n, armpl_int_t* rp, armpl_int_t* ci, T* v, + armpl_int_t nz, armpl_int_t f) { + std::cout << "\tn = " << n << std::endl; + std::cout << "\trow ptr (size = " << sizeof(rp[0]) << ") = [" << rp[0]; + for (int i = 1; i < (n + 1); i++) { + std::cout << ", " << rp[i]; + } + std::cout << "]" << std::endl << "\tcol ind (size = " << sizeof(ci[0]) << + ") = [" << ci[0]; + for (int i = 1; i < nz; i++) { + std::cout << ", " << ci[i]; + } + std::cout << "]" << std::endl << "\tvals (size = " << sizeof(v[0]) << + ") = [" << v[0]; + for (int i = 1; i < nz; i++) { + std::cout << ", " << v[i]; + } + std::cout << "]" << std::endl << "\tflags = " << f << std::endl; + } + + int64_t nnzA_; + int64_t nnzB_; + + armpl_status_t status_; + + armpl_int_t flags_; + + armpl_int_t m_armpl_; + armpl_int_t n_armpl_; + armpl_int_t k_armpl_; + + armpl_int_t* A_armpl_row_ptr_; + armpl_int_t* A_armpl_col_index_; + armpl_int_t* B_armpl_row_ptr_; + armpl_int_t* B_armpl_col_index_; + armpl_int_t* C_armpl_row_ptr_; + armpl_int_t* C_armpl_col_index_; + + armpl_spmat_t A_armpl_; + armpl_spmat_t B_armpl_; + armpl_spmat_t C_armpl_; + + armpl_sparse_hint_value transA_ = ARMPL_SPARSE_OPERATION_NOTRANS; + armpl_sparse_hint_value transB_ = ARMPL_SPARSE_OPERATION_NOTRANS; +}; +} + + + +#endif diff --git a/ArmPL/spmm.hh b/ArmPL/spmm.hh index 93ed4b5..9680f09 100644 --- a/ArmPL/spmm.hh +++ b/ArmPL/spmm.hh @@ -1,18 +1,18 @@ #pragma once #ifdef CPU_ARMPL -#include <stdio.h> #include <stdlib.h> #include <armpl.h> #include <omp.h> #include <algorithm> +#include <iostream> #include "../include/kernels/CPU/spmm.hh" #include "../include/utilities.hh" namespace cpu { -/** A class for GEMM CPU BLAS kernels. */ +/** A class for sparse matrix-sparse matrix CPU BLAS kernels. */ template <typename T> class spmm_cpu : public spmm<T> { public: @@ -363,9 +363,6 @@ class spmm_cpu : public spmm<T> { /** The constant value Beta. 
 */
   const T beta = BETA;
 
-  void toCSR_armpl() {
-  }
-
   void printCSR(armpl_int_t n, armpl_int_t* rp, armpl_int_t* ci, T* v,
                 armpl_int_t nz, armpl_int_t f) {
     std::cout << "\tn = " << n << std::endl;
diff --git a/cuBLAS/spgemm.hh b/cuBLAS/spgemm.hh
new file mode 100644
index 0000000..d4233fd
--- /dev/null
+++ b/cuBLAS/spgemm.hh
@@ -0,0 +1,323 @@
+#pragma once
+
+#ifdef GPU_CUBLAS
+#include <cusparse_v2.h>
+#include <cuda_runtime.h>
+#include <type_traits>
+#include <random>
+#include <iostream>
+
+#include "../include/kernels/GPU/spgemm.hh"
+#include "../include/utilities.hh"
+#include "common.hh"
+
+namespace gpu {
+/**
+ * A class for sparse matrix-dense matrix BLAS
+ */
+template <typename T>
+class spgemm_gpu : public spgemm<T> {
+public:
+  using spgemm<T>::spgemm;
+  using spgemm<T>::initInputMatrices;
+  using spgemm<T>::m_;
+  using spgemm<T>::n_;
+  using spgemm<T>::k_;
+  using spgemm<T>::A_;
+  using spgemm<T>::B_;
+  using spgemm<T>::C_;
+  using spgemm<T>::offload_;
+  using spgemm<T>::nnz_;
+  using spgemm<T>::sparsity_;
+
+  void initialise(gpuOffloadType offload, int m, int n, int k,
+                  double sparsity, bool binary = false) override {
+    offload_ = offload;
+
+    if (std::is_same_v<T, float>) cudaDataType_ = CUDA_R_32F;
+    else if (std::is_same_v<T, double>) cudaDataType_ = CUDA_R_64F;
+    else {
+      std::cout << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
+      exit(1);
+    }
+    m_ = m;
+    n_ = n;
+    k_ = k;
+    sparsity_ = sparsity;
+
+    // A_ keeps a dense host copy for toSparseFormat(); B_ and C_ are
+    // allocated below once the offload type is known
+    A_ = (T*)malloc(sizeof(T) * m_ * k_);
+
+    /** Determine the number of nnz elements in A */
+    nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
+
+    // Get device identifier
+    cudaCheckError(cudaGetDevice(&gpuDevice_));
+
+    // Initialise 3 streams to asynchronously move data between host and device
+    cudaCheckError(cudaStreamCreate(&s1_));
+    cudaCheckError(cudaStreamCreate(&s2_));
+    cudaCheckError(cudaStreamCreate(&s3_));
+
+    if (offload_ == gpuOffloadType::unified) {
+      cudaCheckError(cudaMallocManaged(&A_val_, sizeof(T) * nnz_));
+      cudaCheckError(cudaMallocManaged(&A_col_, sizeof(int) * nnz_));
+      cudaCheckError(cudaMallocManaged(&A_row_, sizeof(int) * (m_ + 1)));
+
+      cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_));
+
+      cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_));
+    } else {
+      A_val_ = (T*)malloc(sizeof(T) * nnz_);
+      A_col_ = (int*)malloc(sizeof(int) * nnz_);
+      A_row_ = (int*)malloc(sizeof(int) * (m_ + 1));
+
+      B_ = (T*)malloc(sizeof(T) * k_ * n_);
+
+      C_ = (T*)malloc(sizeof(T) * m_ * n_);
+
+      cudaCheckError(cudaMalloc((void**)&A_val_dev_, sizeof(T) * nnz_));
+      cudaCheckError(cudaMalloc((void**)&A_col_dev_, sizeof(int) * nnz_));
+      cudaCheckError(cudaMalloc((void**)&A_row_dev_, sizeof(int) * (m_ + 1)));
+
+      cudaCheckError(cudaMalloc((void**)&B_dev_, sizeof(T) * k_ * n_));
+
+      cudaCheckError(cudaMalloc((void**)&C_dev_, sizeof(T) * m_ * n_));
+    }
+
+    cusparseCheckError(cusparseCreate(&handle_));
+
+    initInputMatrices();
+  }
+
+protected:
+  void toSparseFormat() override {
+    // Load A into CSR
+    int nnz_encountered = 0;
+    for (int row = 0; row < m_; row++) {
+      A_row_[row] = nnz_encountered;
+      for (int col = 0; col < k_; col++) {
+        if (A_[(row * k_) + col] != 0.0) {
+          A_col_[nnz_encountered] = col;
+          A_val_[nnz_encountered] = A_[(row * k_) + col];
+          nnz_encountered++;
+        }
+      }
+    }
+    A_row_[m_] = nnz_encountered;
+
+    // B and C are dense, row-major: k_ x n_ and m_ x n_ respectively
+    B_num_rows_ = k_;
+    B_num_cols_ = n_;
+    B_leading_dim_ = n_;
+    C_num_rows_ = m_;
+    C_num_cols_ = n_;
+    C_leading_dim_ = n_;
+    B_order_ = C_order_ = CUSPARSE_ORDER_ROW;
+  }
+
+private:
+  void preLoopRequirements() override {
+    // Todo -- do I need a SPMM description here?
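+    // (The SpMM descriptors -- descrA_, descrB_, descrC_ -- are created per
+    // offload type in the switch below, so no further description object
+    // should be needed beyond those.)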
+    switch(offload_) {
+      case gpuOffloadType::always: {
+        [[fallthrough]];
+      }
+      case gpuOffloadType::once: {
+        cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, (sizeof(T) * nnz_),
+                                       cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_,
+                                       (sizeof(int) * nnz_),
+                                       cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_,
+                                       (sizeof(int) * (m_ + 1)),
+                                       cudaMemcpyHostToDevice, s1_));
+
+        cudaCheckError(cudaMemcpyAsync(B_dev_, B_, (sizeof(T) * k_ * n_),
+                                       cudaMemcpyHostToDevice, s2_));
+
+        cudaCheckError(cudaMemcpyAsync(C_dev_, C_, (sizeof(T) * m_ * n_),
+                                       cudaMemcpyHostToDevice, s3_));
+
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, m_, k_, nnz_, A_row_dev_,
+                                  A_col_dev_, A_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrB_, B_num_rows_, B_num_cols_,
+                                    B_leading_dim_, B_dev_, cudaDataType_,
+                                    B_order_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrC_, C_num_rows_, C_num_cols_,
+                                    C_leading_dim_, C_dev_, cudaDataType_,
+                                    C_order_));
+        break;
+      }
+      case gpuOffloadType::unified: {
+        cudaCheckError(cudaMemPrefetchAsync(A_val_, sizeof(T) * nnz_,
+                                            gpuDevice_, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_col_, sizeof(int) * nnz_,
+                                            gpuDevice_, s1_));
+        cudaCheckError(cudaMemPrefetchAsync(A_row_, sizeof(int) * (m_ + 1),
+                                            gpuDevice_, s1_));
+
+        cudaCheckError(cudaMemPrefetchAsync(B_, sizeof(T) * n_ * k_,
+                                            gpuDevice_, s2_));
+
+        cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_,
+                                            gpuDevice_, s3_));
+
+        cudaCheckError(cudaDeviceSynchronize());
+
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, m_, k_, nnz_, A_row_, A_col_,
+                                  A_val_, rType_, cType_, indType_,
+                                  cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrB_, B_num_rows_, B_num_cols_,
+                                    B_leading_dim_, B_, cudaDataType_,
+                                    B_order_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrC_, C_num_rows_, C_num_cols_,
+                                    C_leading_dim_, C_, cudaDataType_,
+                                    C_order_));
+        break;
+      }
+    }
+  }
+
+  void callGemm() override {
+    switch(offload_) {
+      case gpuOffloadType::always: {
+        // Clean up old descriptors
+        cusparseCheckError(cusparseDestroySpMat(descrA_));
+        cusparseCheckError(cusparseDestroyDnMat(descrB_));
+        cusparseCheckError(cusparseDestroyDnMat(descrC_));
+
+        // Move over data
+        cudaCheckError(cudaMemcpyAsync(A_val_dev_, A_val_, (sizeof(T) * nnz_),
+                                       cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_col_dev_, A_col_,
+                                       (sizeof(int) * nnz_),
+                                       cudaMemcpyHostToDevice, s1_));
+        cudaCheckError(cudaMemcpyAsync(A_row_dev_, A_row_,
+                                       (sizeof(int) * (m_ + 1)),
+                                       cudaMemcpyHostToDevice, s1_));
+
+        cudaCheckError(cudaMemcpyAsync(B_dev_, B_, (sizeof(T) * k_ * n_),
+                                       cudaMemcpyHostToDevice, s2_));
+
+        cudaCheckError(cudaMemcpyAsync(C_dev_, C_, (sizeof(T) * m_ * n_),
+                                       cudaMemcpyHostToDevice, s3_));
+
+        cudaCheckError(cudaDeviceSynchronize());
+
+        // Set up descriptors
+        cusparseCheckError(
+                cusparseCreateCsr(&descrA_, m_, k_, nnz_, A_row_dev_,
+                                  A_col_dev_, A_val_dev_, rType_, cType_,
+                                  indType_, cudaDataType_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrB_, B_num_rows_, B_num_cols_,
+                                    B_leading_dim_, B_dev_, cudaDataType_,
+                                    B_order_));
+        cusparseCheckError(
+                cusparseCreateDnMat(&descrC_, C_num_rows_, C_num_cols_,
+                                    C_leading_dim_, C_dev_, cudaDataType_,
+                                    C_order_));
+
+        // Begin matrix-matrix multiplication
+        cusparseCheckError(
+                cusparseSpMM_bufferSize(handle_, opA_, opB_, &alpha, descrA_,
+                                        descrB_, &beta, descrC_,
+                                        cudaDataType_, alg_, &buffer_size1_));
+
+        cudaCheckError(cudaMalloc((void**)&buffer1_,
buffer_size1_));
+        cusparseCheckError(
+                cusparseSpMM_preprocess(handle_, opA_, opB_, &alpha, descrA_,
+                                        descrB_, &beta, descrC_,
+                                        cudaDataType_, alg_, buffer1_));
+        cusparseCheckError(
+                cusparseSpMM(handle_, opA_, opB_, &alpha, descrA_, descrB_,
+                             &beta, descrC_, cudaDataType_, alg_, buffer1_));
+      }
+    }
+  }
+
+  /** Handle used when calling cuSPARSE. */
+  cusparseHandle_t handle_;
+
+  /** CUDA Stream 1 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s1_;
+
+  /** CUDA Stream 2 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s2_;
+
+  /** CUDA Stream 3 - used to asynchronously move data between host and device.
+   */
+  cudaStream_t s3_;
+
+  /** The ID of the target GPU Device. */
+  int gpuDevice_;
+
+  bool C_mem_allocated_always_;
+  bool C_mem_allocated_once_;
+  bool C_mem_allocated_unified_;
+
+  /** The constant value Alpha. */
+  const T alpha = ALPHA;
+
+  /** The constant value Beta. */
+  const T beta = BETA;
+
+  size_t buffer_size1_ = 0;
+  size_t buffer_size2_ = 0;
+  void* buffer1_ = NULL;
+  void* buffer2_ = NULL;
+
+  cusparseOperation_t opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cusparseOperation_t opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+  cusparseSpMMAlg_t alg_ = CUSPARSE_SPMM_ALG_DEFAULT;
+
+  // Data type depends on kernel being run
+  cudaDataType_t cudaDataType_;
+
+  // Index types for the CSR arrays (assumption: 32-bit, zero-based indices,
+  // matching the int row/column arrays above)
+  cusparseIndexType_t rType_ = CUSPARSE_INDEX_32I;
+  cusparseIndexType_t cType_ = CUSPARSE_INDEX_32I;
+  cusparseIndexBase_t indType_ = CUSPARSE_INDEX_BASE_ZERO;
+
+  /**
+   * ___________ Host data ______________
+   */
+  /** CSR format vectors for matrix A */
+  cusparseSpMatDescr_t descrA_;
+  T* A_val_;
+  int* A_col_;
+  int* A_row_;
+  int64_t A_num_rows_;
+  int64_t A_num_cols_;
+
+  /** dense format values for matrices B and C */
+  cusparseDnMatDescr_t descrB_;
+  int B_num_rows_;
+  int B_num_cols_;
+  int B_leading_dim_;
+  cusparseOrder_t B_order_;
+
+  cusparseDnMatDescr_t descrC_;
+  int C_num_rows_;
+  int C_num_cols_;
+  int C_leading_dim_;
+  cusparseOrder_t C_order_;
+
+  /**
+   * _____________ Device data ________________
+   */
+  T* A_val_dev_;
+  int* A_col_dev_;
+  int* A_row_dev_;
+
+  T* B_dev_;
+
+  T* C_dev_;
+};
+
+}  // namespace gpu
+
+#endif
diff --git a/cuBLAS/spmm.hh b/cuBLAS/spmm.hh
index 071c8c1..249f1ea 100644
--- a/cuBLAS/spmm.hh
+++ b/cuBLAS/spmm.hh
@@ -50,14 +50,12 @@ class spmm_gpu : public spmm<T> {
 
     A_ = (T*)malloc(sizeof(T) * m_ * k_);
     B_ = (T*)malloc(sizeof(T) * k_ * n_);
-    C_ = (T*)calloc(sizeof(T) * m_ * n_);Ã¥
+    C_ = (T*)calloc(m_ * n_, sizeof(T));
 
     /** Determine the number of nnz elements in A and B */
     nnzA_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
     nnzB_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_));
 
-    initInputMatrices(sparsity_);
-
     // Get device identifier
     cudaCheckError(cudaGetDevice(&gpuDevice_));
 
@@ -118,6 +116,8 @@ class spmm_gpu : public spmm<T> {
 
     // Create a handle for cuSPARSE
     cusparseCheckError(cusparseCreate(&handle_));
+
+    initInputMatrices();
   }
 
 protected:
@@ -194,7 +194,7 @@ class spmm_gpu : public spmm<T> {
         cudaCheckError(cudaMemcpyAsync(C_row_dev_, C_row_, sizeof(int) * (n_ + 1),
                                        cudaMemcpyHostToDevice, s3_));
 
-        // Craete matrix descriptors
+        // Create matrix descriptors
         cusparseCheckError(
             cusparseCreateCsr(&descrA_, m_, k_, nnzA_, A_row_dev_,
                               A_col_dev_, A_val_dev_, rType_, cType_,
diff --git a/include/doSpgemm.hh b/include/doSpgemm.hh
index 2131a7d..b8d1d9b 100644
--- a/include/doSpgemm.hh
+++ b/include/doSpgemm.hh
@@ -1,8 +1,657 @@
-//
-// Created by Alexander Cockrean on 07/01/2025.
-//
+#pragma once
+#include <sstream>
+#include <type_traits>
 
-#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMM_HH
-#define GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMM_HH
+#include "helpers.hh"
+#include "tablePrinter.hh"
+#include "utilities.hh"
 
-#endif //GPU_BLAS_OFFLOAD_BENCHMARK_DOSPGEMM_HH
+#if defined CPU_ARMPL
+#include "../ArmPL/spgemm.hh"
+#elif defined CPU_ONEMKL
+// Todo #include "../oneMKL/CPU/spgemm.hh"
+#elif defined CPU_AOCL
+// Todo #include "../AOCL/spgemm.hh"
+#elif defined CPU_NVPL
+// Todo #include "../NVPL/spgemm.hh"
+#elif defined CPU_OPENBLAS
+// Todo #include "../OpenBLAS/spgemm.hh"
+#endif
+
+#if defined GPU_CUBLAS
+#include "../cuBLAS/spgemm.hh"
+#elif defined GPU_ONEMKL
+// Todo #include "../oneMKL/GPU/spgemm.hh"
+#elif defined GPU_ROCBLAS
+// Todo #include "../rocBLAS/spgemm.hh"
+#endif
+
+
+/**
+ * `T` represents the type of the sparse GEMM kernel that will be run. E.g.,
+ * T=float is for SSPGEMM
+ */
+template <typename T>
+class doSpgemm {
+public:
+  doSpgemm(const std::string csvDir, const int iters, const int startDim,
+           const int upperLimit, const bool cpuEnabled = true,
+           const bool gpuEnabled = true)
+      : CSV_DIR(csvDir),
+        iterations_(iters),
+        startDimention_(startDim),
+        upperLimit_(upperLimit),
+        doCPU_(cpuEnabled),
+        doGPU_(gpuEnabled)
+#if CPU_ENABLED
+        ,
+        cpu_(iterations_)
+#endif
+#if GPU_ENABLED
+        ,
+        gpu_(iterations_)
+#endif
+  {
+    static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) &&
+                  "ERROR - doSpgemm can only be constructed using one of the "
+                  "following types: [float, double].");
+  }
+
+  void collectData() {
+    // ToDo -- I've hard coded false here as kernel selection was not
+    // working. Needs to be fixed
+
+    // Square Problem Sizes...
+    // Re-initialise offload threshold structures
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    prev_gpuResult_always = time_checksum_gflop();
+    prev_gpuResult_once = time_checksum_gflop();
+    prev_gpuResult_unified = time_checksum_gflop();
+    std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                                        "_square_square_M=N=K.csv");
+    for (int dim = startDimention_; dim <= upperLimit_; dim++) {
+      // M = dim, N = dim, K = dim;
+      callKernels(csvFile, dim, dim, dim);
+    }
+    // Close file
+    csvFile.close();
+#if CPU_ENABLED && GPU_ENABLED
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Square x Square (M=N=K)");
+    }
+#endif
+
+    // Rectangular Problem Sizes:
+    // Tall and thin x Short and wide
+    // Re-initialise offload threshold structures & previous results
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ = cpuGpu_offloadThreshold();
+    cpuGpu_unified_ = cpuGpu_offloadThreshold();
+    prev_gpuResult_always = time_checksum_gflop();
+    prev_gpuResult_once = time_checksum_gflop();
+    prev_gpuResult_unified = time_checksum_gflop();
+    csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() +
+                          "_tall-thin_short-wide_M=N_M=16K.csv");
+    int K = startDimention_;
+    int M = 16 * K;
+    int N = 16 * K;
+    while (M <= upperLimit_) {
+      callKernels(csvFile, M, N, K);
+      M += 16;
+      N += 16;
+      K++;
+    }
+    // Close file
+    csvFile.close();
+#if CPU_ENABLED && GPU_ENABLED
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)");
+    }
+#endif
+
+    // Tall and thin x Short and wide
+    // Re-initialise offload threshold structures & previous results
+    cpuGpu_always_ = cpuGpu_offloadThreshold();
+    cpuGpu_once_ =
cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_K=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = dim, K = 32; + callKernels(csvFile, dim, dim, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)"); + } +#endif + + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N_K=16M.csv"); + M = startDimention_; + N = startDimention_; + K = 16 * M; + while (K <= upperLimit_) { + callKernels(csvFile, M, N, K); + M++; + N++; + K += 16; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)"); + } +#endif + + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N=32_K.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = 32, N = 32, K = dim; + callKernels(csvFile, 32, 32, dim); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)"); + } +#endif + + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N_M=16K.csv"); + K = startDimention_; + N = startDimention_; + M = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16; + N++; + K++; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)"); + } +#endif + + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once 
= time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N=32_M.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = dim, N = 32, K = 32; + callKernels(csvFile, dim, 32, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)"); + } +#endif + + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K_N=16K.csv"); + M = startDimention_; + K = startDimention_; + N = 16 * K; + while (N <= upperLimit_) { + callKernels(csvFile, M, N, K); + M++; + N += 16; + K++; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); + } +#endif + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K=32_N.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim++) { + // M = 32, N = dim, K = 32; + callKernels(csvFile, 32, dim, 32); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); + } +#endif + // Close file + csvFile.close(); + } + +private: + /** Call the appropriate CPU and GPU GEMM kernels. 
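+   * Each enabled kernel is initialised for (M, N, K), timed over iterations_
+   * runs, and its results are appended to csvFile; when both CPU and GPU are
+   * enabled the offload-threshold structures are also updated.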
*/ + void callKernels(std::ofstream& csvFile, const int M, const int N, + const int K) { + const double probSize = calcKib(M, N, K); + const uint64_t flops = calcFlops(M, N, K); + std::string kernelName = getKernelName(); + + time_checksum_gflop cpuResult; + time_checksum_gflop gpuResult_once; + time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_unified; + +// Perform CPU kernel +#if CPU_ENABLED + if (doCPU_) { + cpu_.initialise(M, N, K); + cpuResult = cpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + // Write result to CSV file + writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, + 0.0, iterations_, cpuResult.runtime, cpuResult.gflops); + } +#endif + +// Perform the GPU kernels +#if GPU_ENABLED + if (doGPU_) { + // - ONCE : Offload to/from GPU once before all iterations and once + // after + gpu_.initialise(gpuOffloadType::once, M, N, K); + gpuResult_once = gpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + + // - ALWAYS: Offload to/from GPU every iteration + gpu_.initialise(gpuOffloadType::always, M, N, K); + gpuResult_always = gpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + gpu_.initialise(gpuOffloadType::unified, M, N, K); + gpuResult_unified = gpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + + // Write results to CSV file + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, K, probSize, + 0.0, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, K, + probSize, 0.0, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); + writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, K, probSize, + 0.0, iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); + } +#endif + +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Make sure all checksums match if CPU and GPU kernels are run. + // - The majority of BLAS Libraries guarentee the same result if a + // function + // is called multiple times. Given all input matrices are identical for + // each GPU offload type, we need only to compare the CPU and GPU + // checksums. + checkChecksums(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N, K); + + // Check if offload structs should be reset + checkOffloadStructReset(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified); + + // Check if offload threshold has been achieved for each GPU offload type. + updateOffloadStructs(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N, K, probSize); + + // Update previous results + prev_gpuResult_once = gpuResult_once; + prev_gpuResult_always = gpuResult_always; + prev_gpuResult_unified = gpuResult_unified; + } +#endif + } + + + /** Ensure all CPU and GPU checksums are within the permitted limit of + * eachother. */ + // Todo - think of a sensible way to do this for sparse!!! 
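+  // (One option, assuming every backend materialises the same dense C: keep
+  // a simple element-sum checksum but compare with a relative tolerance,
+  // since sparse libraries reorder floating-point accumulation more freely
+  // than dense BLAS; comparing output nnz counts would be a cheap extra
+  // sanity check.)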
+ void checkChecksums(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N, const int K) { + // Ensure that each checksum difference is less than 0.1% +// double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum); +// if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) * +// hundredOverChecksum)) > 0.1 && +// ((std::fabs(cpuResult.checksum - gpuResult_always.checksum) * +// hundredOverChecksum)) > 0.1 && +// ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) * +// hundredOverChecksum)) > 0.1) { +// std::cerr << "ERROR - " << getKernelName() +// << " kernel checksums do not match:\n\tInput " +// "dimensions: M=" +// << M << ", N=" << N << ", K=" << K << std::endl; +// std::cerr << std::setprecision(10) +// << "\tCPU Checksum = " << cpuResult.checksum << std::endl; +// std::cerr << std::setprecision(10) +// << "\tGPU (Once) Checksum = " << gpuResult_once.checksum +// << std::endl; +// std::cerr << std::setprecision(10) +// << "\tGPU (Always) Checksum = " << gpuResult_always.checksum +// << std::endl; +// std::cerr << std::setprecision(10) +// << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum +// << std::endl; +// exit(1); +// } + } + + /** Check whether the offload structures need to be reset; and doing so if + * required. + * - If CPU.gflops >= GPU.gflops for last two problem sizes, then reset + * offload structures as GPU may not necessarily have reached the offload + * threshold. */ + void checkOffloadStructReset(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified) { + if ((cpuGpu_once_.M != 0) && (cpuResult.gflops >= gpuResult_once.gflops) && + (cpuResult.gflops >= prev_gpuResult_once.gflops)) { + cpuGpu_once_.cpuGflops = 0.0; + cpuGpu_once_.gpuGflops = 0.0; + cpuGpu_once_.probSize_kib = 0.0; + cpuGpu_once_.M = 0; + cpuGpu_once_.N = 0; + cpuGpu_once_.K = 0; + } + if ((cpuGpu_always_.M != 0) && + (cpuResult.gflops >= gpuResult_always.gflops) && + (cpuResult.gflops >= prev_gpuResult_always.gflops)) { + cpuGpu_always_.cpuGflops = 0.0; + cpuGpu_always_.gpuGflops = 0.0; + cpuGpu_always_.probSize_kib = 0.0; + cpuGpu_always_.M = 0; + cpuGpu_always_.N = 0; + cpuGpu_always_.K = 0; + } + if ((cpuGpu_unified_.M != 0) && + (cpuResult.gflops >= gpuResult_unified.gflops) && + (cpuResult.gflops >= prev_gpuResult_unified.gflops)) { + cpuGpu_unified_.cpuGflops = 0.0; + cpuGpu_unified_.gpuGflops = 0.0; + cpuGpu_unified_.probSize_kib = 0.0; + cpuGpu_unified_.M = 0; + cpuGpu_unified_.N = 0; + cpuGpu_unified_.K = 0; + } + } + + /** Update the offload threshold structs if GPU.gflops > CPU.gflops. 
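+   * Each struct records the first problem size at which its GPU offload
+   * strategy overtakes the CPU, along with the two throughputs at that point.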
*/ + void updateOffloadStructs(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N, const int K, const double probSize) { + if ((cpuGpu_once_.M == 0) && cpuResult.gflops < gpuResult_once.gflops) { + cpuGpu_once_.cpuGflops = cpuResult.gflops; + cpuGpu_once_.gpuGflops = gpuResult_once.gflops; + cpuGpu_once_.probSize_kib = probSize; + cpuGpu_once_.M = M; + cpuGpu_once_.N = N; + cpuGpu_once_.K = K; + } + if ((cpuGpu_always_.M == 0) && cpuResult.gflops < gpuResult_always.gflops) { + cpuGpu_always_.cpuGflops = cpuResult.gflops; + cpuGpu_always_.gpuGflops = gpuResult_always.gflops; + cpuGpu_always_.probSize_kib = probSize; + cpuGpu_always_.M = M; + cpuGpu_always_.N = N; + cpuGpu_always_.K = K; + } + if ((cpuGpu_unified_.M == 0) && + cpuResult.gflops < gpuResult_unified.gflops) { + cpuGpu_unified_.cpuGflops = cpuResult.gflops; + cpuGpu_unified_.gpuGflops = gpuResult_unified.gflops; + cpuGpu_unified_.probSize_kib = probSize; + cpuGpu_unified_.M = M; + cpuGpu_unified_.N = N; + cpuGpu_unified_.K = K; + } + } + + /** A function for calculating FLOPs performed by a GEMM. + * C = alpha*AB + beta*C */ + // ToDo -- Work out how to do this for an unknown algorithm + constexpr uint64_t calcFlops(const int M, const int N, const int K) const { + // A * B = 2*M*N*K (FMA) + // alpha * AB = M*N (multiplication) + // beta * C = M*N (multiplication) + // AB + C = M*N (addition) + // = 2MNK + MN + MN + MN + + // If beta==0; = 2MNK + MN ------- alpha*AB Always done + // Else; = 2MNK + 3MN + uint64_t scalar = (BETA != 0) ? 3 : 1; + return (2 * (uint64_t)M * (uint64_t)N * (uint64_t)K) + + (scalar * (uint64_t)M * (uint64_t)N); + } + + /** A function for calculating the total GEMM problem size in KiB. */ + constexpr double calcKib(const int M, const int N, const int K) const { + uint64_t M_ = (uint64_t)M, N_ = (uint64_t)N, K_ = (uint64_t)K; + uint64_t probSize = (M_ * K_) + (K_ * N_) + (M_ * N_); + return ((double)(probSize * (sizeof(T))) / 1024); + } + + /** Get the name of the kernel being run. */ + std::string getKernelName() const { + switch (sizeof(T)) { + case 4: + return "sgemm"; + case 8: + return "dgemm"; + default: + return "unknown"; + } + } + + /** Print to stdout the offload thresholds. */ + void printOffloadThreshold(const std::string& problemName) const { + std::vector<std::string> header = { + "Device", "M", "N", "K", "Total Prob. 
Size (KiB)",
+        "GFLOP/s", "CPU GFLOP/s"};
+
+    std::vector<std::vector<std::string>> rows;
+    // Initialise GPU_Once row
+    std::stringstream probSize_o;
+    std::stringstream gpuGflops_o;
+    std::stringstream cpuGflops_o;
+    probSize_o << std::fixed << std::setprecision(2)
+               << cpuGpu_once_.probSize_kib;
+    gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops;
+    cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops;
+    if (cpuGpu_once_.M == 0) {
+      // No offload threshold found
+      rows.push_back({"GPU (Offload Once)", std::to_string(0),
+                      std::to_string(0), std::to_string(0), probSize_o.str(),
+                      "N/A", "N/A"});
+    } else {
+      rows.push_back({"GPU (Offload Once)", std::to_string(cpuGpu_once_.M),
+                      std::to_string(cpuGpu_once_.N),
+                      std::to_string(cpuGpu_once_.K), probSize_o.str(),
+                      gpuGflops_o.str(), cpuGflops_o.str()});
+    }
+
+    // Initialise GPU_always row
+    std::stringstream probSize_a;
+    std::stringstream gpuGflops_a;
+    std::stringstream cpuGflops_a;
+    probSize_a << std::fixed << std::setprecision(2)
+               << cpuGpu_always_.probSize_kib;
+    gpuGflops_a << std::fixed << std::setprecision(2)
+                << cpuGpu_always_.gpuGflops;
+    cpuGflops_a << std::fixed << std::setprecision(2)
+                << cpuGpu_always_.cpuGflops;
+    if (cpuGpu_always_.M == 0) {
+      // No offload threshold found
+      rows.push_back({"GPU (Offload Always)", std::to_string(0),
+                      std::to_string(0), std::to_string(0), probSize_a.str(),
+                      "N/A", "N/A"});
+    } else {
+      rows.push_back({"GPU (Offload Always)", std::to_string(cpuGpu_always_.M),
+                      std::to_string(cpuGpu_always_.N),
+                      std::to_string(cpuGpu_always_.K), probSize_a.str(),
+                      gpuGflops_a.str(), cpuGflops_a.str()});
+    }
+
+    // Initialise GPU_unified row
+    std::stringstream probSize_u;
+    std::stringstream gpuGflops_u;
+    std::stringstream cpuGflops_u;
+    probSize_u << std::fixed << std::setprecision(2)
+               << cpuGpu_unified_.probSize_kib;
+    gpuGflops_u << std::fixed << std::setprecision(2)
+                << cpuGpu_unified_.gpuGflops;
+    cpuGflops_u << std::fixed << std::setprecision(2)
+                << cpuGpu_unified_.cpuGflops;
+    if (cpuGpu_unified_.M == 0) {
+      // No offload threshold found
+      rows.push_back({"GPU (Unified Memory)", std::to_string(0),
+                      std::to_string(0), std::to_string(0), probSize_u.str(),
+                      "N/A", "N/A"});
+    } else {
+      rows.push_back({"GPU (Unified Memory)", std::to_string(cpuGpu_unified_.M),
+                      std::to_string(cpuGpu_unified_.N),
+                      std::to_string(cpuGpu_unified_.K), probSize_u.str(),
+                      gpuGflops_u.str(), cpuGflops_u.str()});
+    }
+
+    // Print table
+    tablePrinter tPrinter(
+        problemName + " Problem Domain GPU Offload Thresholds:", header, rows);
+    tPrinter.print(1);
+  }
+
+  /** The output directory where CSV files should be saved to. */
+  const std::string CSV_DIR;
+
+  /** The number of iterations to perform per problem size. */
+  const int iterations_;
+
+  /** The value of the first problem size dimension run. */
+  const int startDimention_;
+
+  /** The maximum value of the largest problem size dimension. */
+  const int upperLimit_;
+
+  /** Whether the CPU kernels should be run. */
+  const bool doCPU_ = true;
+
+  /** Whether the GPU kernels should be run. */
+  const bool doGPU_ = true;
+
+#if CPU_ENABLED
+  /** The sparse GEMM CPU kernel. */
+  cpu::spgemm_cpu<T> cpu_;
+#endif
+
+#if GPU_ENABLED
+  /** The sparse GEMM GPU kernel. */
+  gpu::spgemm_gpu<T> gpu_;
+#endif
+
+  /** The point at which offloading to GPU (offload once) becomes worthwhile. */
+  cpuGpu_offloadThreshold cpuGpu_once_;
+
+  /** The point at which offloading to GPU (offload always) becomes worthwhile.
+
+  /** The point at which offloading to GPU (offload always) becomes worthwhile.
+   */
+  cpuGpu_offloadThreshold cpuGpu_always_;
+
+  /** The point at which offloading to GPU (unified memory) becomes worthwhile.
+   */
+  cpuGpu_offloadThreshold cpuGpu_unified_;
+
+  /** The previous problem size's GPU (offload once) performance results. */
+  time_checksum_gflop prev_gpuResult_once;
+
+  /** The previous problem size's GPU (offload always) performance results. */
+  time_checksum_gflop prev_gpuResult_always;
+
+  /** The previous problem size's GPU (unified memory) performance results. */
+  time_checksum_gflop prev_gpuResult_unified;
+};
\ No newline at end of file
diff --git a/include/doSpmm.hh b/include/doSpmm.hh
index 2321636..51f3aba 100644
--- a/include/doSpmm.hh
+++ b/include/doSpmm.hh
@@ -12,19 +12,19 @@
 #elif defined CPU_ONEMKL
 // Todo #include "../oneMKL/CPU/spmm.hh"
 #elif defined CPU_AOCL
-// Todo #include "../AOCL/gemm.hh"
+// Todo #include "../AOCL/spmm.hh"
 #elif defined CPU_NVPL
-	// Todo #include "../NVPL/gemm.hh"
+	// Todo #include "../NVPL/spmm.hh"
 #elif defined CPU_OPENBLAS
-// Todo #include "../OpenBLAS/gemm.hh"
+// Todo #include "../OpenBLAS/spmm.hh"
 #endif
 
 #if defined GPU_CUBLAS
 #include "../cuBLAS/spmm.hh"
 #elif defined GPU_ONEMKL
-// Todo #include "../oneMKL/GPU/gemm.hh"
+// Todo #include "../oneMKL/GPU/spmm.hh"
 #elif defined GPU_ROCBLAS
-// Todo #include "../rocBLAS/gemm.hh"
+// Todo #include "../rocBLAS/spmm.hh"
 #endif
 
 /** `T` represents the type of kernel that will be run - i.e. T=float is for
diff --git a/include/kernels/CPU/spgemm.hh b/include/kernels/CPU/spgemm.hh
new file mode 100644
index 0000000..03f897d
--- /dev/null
+++ b/include/kernels/CPU/spgemm.hh
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "../spgemm.hh"
+
+namespace cpu {
+
+/**
+ * An abstract class for sparse matrix-dense matrix BLAS kernels
+ */
+template <typename T>
+class spgemm : public ::spgemm<T> {
+public:
+  using ::spgemm<T>::spgemm;
+  using ::spgemm<T>::initInputMatrices;
+  using ::spgemm<T>::iterations_;
+  using ::spgemm<T>::nnz_;
+  using ::spgemm<T>::sparsity_;
+  using ::spgemm<T>::m_;
+  using ::spgemm<T>::n_;
+  using ::spgemm<T>::k_;
+  using ::spgemm<T>::A_;
+  using ::spgemm<T>::B_;
+  using ::spgemm<T>::C_;
+
+public:
+  /**
+   * Initialise the required data structures.
+   */
+  void initialise(int n, int m, int k, double sparsity,
+                  bool binary = false) {
+    n_ = n;
+    m_ = m;
+    k_ = k;
+
+    sparsity_ = sparsity;
+
+    nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
+
+    A_ = (T*)malloc(sizeof(T) * m_ * k_);
+    B_ = (T*)malloc(sizeof(T) * k_ * n_);
+    // calloc takes (count, element size) and zero-initialises the buffer
+    C_ = (T*)calloc(m_ * n_, sizeof(T));
+
+    initInputMatrices();
+  }
+
+private:
+  /** Do any necessary cleanup (free pointers, close library handles, etc.)
+   * after Kernel has been called. */
+  void postCallKernelCleanup() {
+    free(A_);
+    free(B_);
+    free(C_);
+  }
+};
+
+}
\ No newline at end of file
diff --git a/include/kernels/CPU/spgmm.hh b/include/kernels/CPU/spgmm.hh
deleted file mode 100644
index 59856ed..0000000
--- a/include/kernels/CPU/spgmm.hh
+++ /dev/null
@@ -1,8 +0,0 @@
-//
-// Created by Alexander Cockrean on 07/01/2025.
-// - -#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_SPGMM_HH -#define GPU_BLAS_OFFLOAD_BENCHMARK_SPGMM_HH - -#endif //GPU_BLAS_OFFLOAD_BENCHMARK_SPGMM_HH diff --git a/include/kernels/CPU/spmm.hh b/include/kernels/CPU/spmm.hh index d90f48b..c698101 100644 --- a/include/kernels/CPU/spmm.hh +++ b/include/kernels/CPU/spmm.hh @@ -14,7 +14,6 @@ class spmm : public ::spmm<T> { public: using ::spmm<T>::spmm; using ::spmm<T>::initInputMatrices; - using ::spmm<T>::toCSR_int; using ::spmm<T>::iterations_; using ::spmm<T>::nnzA_; using ::spmm<T>::nnzB_; @@ -29,7 +28,7 @@ public: public: /** Initialise the required data structures. */ void initialise(int n, int m, int k, double sparsity, - bool binary = false) { + bool binary = false) { n_ = n; m_ = m; k_ = k; diff --git a/include/kernels/GPU/spgemm.hh b/include/kernels/GPU/spgemm.hh index 917469b..13aa4b9 100644 --- a/include/kernels/GPU/spgemm.hh +++ b/include/kernels/GPU/spgemm.hh @@ -1,8 +1,28 @@ -// -// Created by Alexander Cockrean on 07/01/2025. -// +#pragma once -#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH -#define GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH +#include "../spgemm.hh" -#endif //GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH +namespace gpu { + +/** An abstract class for sparse matrix-dense matrix BLAS kernels. */ + template <typename T> + class spgemm : public ::spgemm<T> { + public: + using ::spgemm<T>::spgemm; + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, bool binary = false) = 0; + + protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. */ + gpuOffloadType offload_ = gpuOffloadType::always; + }; +} // namespace gpu \ No newline at end of file diff --git a/include/kernels/spgemm.hh b/include/kernels/spgemm.hh index 917469b..eb0594c 100644 --- a/include/kernels/spgemm.hh +++ b/include/kernels/spgemm.hh @@ -1,8 +1,130 @@ -// -// Created by Alexander Cockrean on 07/01/2025. -// +#pragma once -#ifndef GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH -#define GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH +#include <algorithm> +#include <chrono> +#include <cmath> +#include <limits> +#include <random> +#include <iostream> -#endif //GPU_BLAS_OFFLOAD_BENCHMARK_SPGEMM_HH +#include "../utilities.hh" + +/** +* A generic abstract class defining the operation of timing a sparse GEMM + * BLAS kernel for n iterations +*/ +template <typename T> +class spgemm { +public: + spgemm(const int iters) : iterations_(iters) {} + + /** Call the kernel n times. 
Returns the time elapsed for all n calls
+   * in seconds */
+  time_checksum_gflop compute() {
+    // Start the timer
+    std::chrono::time_point<std::chrono::high_resolution_clock> startTime =
+        std::chrono::high_resolution_clock::now();
+
+    // perform the SpGEMM calls
+    preLoopRequirements();
+    for (int i = 0; i < iterations_; i++) {
+      callSpmm();
+    }
+    postLoopRequirements();
+
+    // Stop the timer
+    std::chrono::time_point<std::chrono::high_resolution_clock> endTime =
+        std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> time_s = endTime - startTime;
+
+    double checksum = calcChecksum();
+
+    postCallKernelCleanup();
+
+    return {time_s.count(), checksum, 0.0};
+  }
+
+  int64_t nnz_ = 0;
+
+private:
+  /** Performs the steps required before calling the SPMM kernel that
+   * should be timed */
+  virtual void preLoopRequirements() = 0;
+
+  /** Perform the SPMM kernel. */
+  virtual void callSpmm() = 0;
+
+  /** Perform any steps required after calling the SPMM kernel that should
+   * be timed */
+  virtual void postLoopRequirements() = 0;
+
+  /** Do the necessary cleanup after the kernel has finished that
+   * should not be timed */
+  virtual void postCallKernelCleanup() = 0;
+
+  /** Calculate a checksum from the result matrix C. */
+  constexpr double calcChecksum() {
+    // Todo -- think about how this can sensibly be done for SPMM
+    return 0.0;
+  }
+
+protected:
+  /** Set up the starting matrices */
+  void initInputMatrices() {
+    for (size_t i = 0; i < (m_ * k_); i++) {
+      A_[i] = 0.0;
+    }
+
+    srand(SEED);
+    for (size_t i = 0; i < (k_ * n_); i++) {
+      B_[i] = (T)((double)(rand() % 100) / 7.0);
+    }
+
+    for (size_t i = 0; i < (m_ * n_); i++) {
+      C_[i] = (T)0.0;
+    }
+
+    // Random number generator objects for use in descent
+    std::default_random_engine gen;
+    gen.seed(std::chrono::system_clock::now()
+                 .time_since_epoch().count());
+    std::uniform_real_distribution<double> dist(0.0, 1.0);
+
+    // Using a=0.45 and b=c=0.22 as default probabilities
+    for (size_t i = 0; i < nnz_; i++) {
+      while (!rMat(A_, k_, 0, k_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
+                   false)) {}
+    }
+
+    toSparseFormat();
+  }
+
+  /** Move matrices into the sparse representation for the given library */
+  virtual void toSparseFormat() = 0;
+
+  /** Call the external consume() function on the matrices */
+  void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }
+
+  /** The number of iterations to perform per problem size. */
+  const int iterations_;
+
+  /** Matrix dimension M. */
+  int m_ = 0;
+
+  /** Matrix dimension N. */
+  int n_ = 0;
+
+  /** Matrix dimension K. */
+  int k_ = 0;
+
+  /** Dense representation of input matrix A. */
+  T* A_;
+
+  /** Dense representation of input matrix B. */
+  T* B_;
+
+  /** Dense representation of output matrix C.
*/ + T* C_; + + double sparsity_; +}; \ No newline at end of file diff --git a/include/kernels/spmm.hh b/include/kernels/spmm.hh index 9d45f56..28993c8 100644 --- a/include/kernels/spmm.hh +++ b/include/kernels/spmm.hh @@ -76,6 +76,9 @@ protected: for (size_t i = 0; i < (k_ * n_); i++) { B_[i] = 0.0; } + for (size_t i = 0; i < (m_ * n_); i++) { + C_[i] = 0.0; + } // Random number generator objects for use in descent std::default_random_engine gen; From 8bc912593093c0f8acf10c0e5059f552ee49e758 Mon Sep 17 00:00:00 2001 From: Alex Cockrean <84676155+ABenC377@users.noreply.github.com> Date: Tue, 14 Jan 2025 11:58:37 +0000 Subject: [PATCH 38/38] Finishing off OneMKL CPU support --- .idea/workspace.xml | 37 ++++--- ArmPL/spgemm.hh | 2 +- ArmPL/spgemv.hh | 4 +- ArmPL/spmm.hh | 2 +- cuBLAS/spgemm.hh | 2 +- cuBLAS/spmm.hh | 2 +- include/doSpgemm.hh | 14 +-- include/doSpgemv.hh | 2 +- include/doSpmm.hh | 10 +- include/kernels/spgemm.hh | 14 +-- include/kernels/spgemv.hh | 4 +- include/kernels/spmm.hh | 12 +- oneMKL/CPU/spgemm.hh | 177 +++++++++++++++++++++++++++++ oneMKL/CPU/spgemv.hh | 155 ++++++++++++++++++++++++++ oneMKL/CPU/spmm.hh | 228 ++++++++++++++++++++++++++++++++++++++ 15 files changed, 613 insertions(+), 52 deletions(-) create mode 100644 oneMKL/CPU/spgemm.hh create mode 100644 oneMKL/CPU/spgemv.hh create mode 100644 oneMKL/CPU/spmm.hh diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 8556bf2..9fb6a86 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -15,18 +15,21 @@ </configurations> </component> <component name="ChangeListManager"> - <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Moving spgemv into new format"> - <change afterPath="$PROJECT_DIR$/ArmPL/spgemm.hh" afterDir="false" /> - <change afterPath="$PROJECT_DIR$/cuBLAS/spgemm.hh" afterDir="false" /> + <list default="true" id="0893f9af-dab8-4239-8892-923019f84a19" name="Changes" comment="Finishing off armpl and cusparse kernels"> + <change afterPath="$PROJECT_DIR$/oneMKL/CPU/spgemm.hh" afterDir="false" /> + <change afterPath="$PROJECT_DIR$/oneMKL/CPU/spgemv.hh" afterDir="false" /> + <change afterPath="$PROJECT_DIR$/oneMKL/CPU/spmm.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/ArmPL/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/ArmPL/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spgemv.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/ArmPL/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/ArmPL/spmm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/cuBLAS/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spgemm.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/cuBLAS/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/cuBLAS/spmm.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/include/doSpgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpgemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/doSpgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpgemv.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/include/doSpmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/doSpmm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spgmm.hh" beforeDir="false" 
afterPath="$PROJECT_DIR$/include/kernels/CPU/spgemm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/CPU/spmm.hh" afterDir="false" /> - <change beforePath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/GPU/spgemm.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/include/kernels/spgemm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spgemm.hh" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/include/kernels/spgemv.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spgemv.hh" afterDir="false" /> <change beforePath="$PROJECT_DIR$/include/kernels/spmm.hh" beforeDir="false" afterPath="$PROJECT_DIR$/include/kernels/spmm.hh" afterDir="false" /> </list> <option name="SHOW_DIALOG" value="false" /> @@ -191,14 +194,6 @@ <workItem from="1729503392250" duration="1773000" /> <workItem from="1730878516596" duration="9915000" /> </task> - <task id="LOCAL-00001" summary="trivial changes"> - <option name="closed" value="true" /> - <created>1706261672580</created> - <option name="number" value="00001" /> - <option name="presentableId" value="LOCAL-00001" /> - <option name="project" value="LOCAL" /> - <updated>1706261672580</updated> - </task> <task id="LOCAL-00002" summary="Adding sparse algorithm"> <option name="closed" value="true" /> <created>1706568127804</created> @@ -583,7 +578,15 @@ <option name="project" value="LOCAL" /> <updated>1736345071717</updated> </task> - <option name="localTasksCounter" value="50" /> + <task id="LOCAL-00050" summary="Finishing off armpl and cusparse kernels"> + <option name="closed" value="true" /> + <created>1736437501127</created> + <option name="number" value="00050" /> + <option name="presentableId" value="LOCAL-00050" /> + <option name="project" value="LOCAL" /> + <updated>1736437501127</updated> + </task> + <option name="localTasksCounter" value="51" /> <servers /> </component> <component name="TypeScriptGeneratedFilesManager"> @@ -601,7 +604,6 @@ </option> </component> <component name="VcsManagerConfiguration"> - <MESSAGE value="Now compiles with fewer runtime errors" /> <MESSAGE value="Implementing other offload types - still some runtime errors" /> <MESSAGE value="All implemented and running. No checksum at the end" /> <MESSAGE value="All three offload types working for large problem sizes" /> @@ -626,6 +628,7 @@ <MESSAGE value="Getting rid of old oneMKL sparse file" /> <MESSAGE value="Refactoring to make individual files relate to a single kernel" /> <MESSAGE value="Moving spgemv into new format" /> - <option name="LAST_COMMIT_MESSAGE" value="Moving spgemv into new format" /> + <MESSAGE value="Finishing off armpl and cusparse kernels" /> + <option name="LAST_COMMIT_MESSAGE" value="Finishing off armpl and cusparse kernels" /> </component> </project> \ No newline at end of file diff --git a/ArmPL/spgemm.hh b/ArmPL/spgemm.hh index 0f9e81d..85eb117 100644 --- a/ArmPL/spgemm.hh +++ b/ArmPL/spgemm.hh @@ -185,7 +185,7 @@ protected: private: /** Make call to the GEMM kernel. */ - void callGemm() override { + void callSpgemm() override { /** * Flow of ARMPL Sparse LA: diff --git a/ArmPL/spgemv.hh b/ArmPL/spgemv.hh index 5045062..e64a665 100644 --- a/ArmPL/spgemv.hh +++ b/ArmPL/spgemv.hh @@ -78,8 +78,6 @@ class spgemv_cpu : public spgemv<T> { /** Perform any required steps before calling the GEMM kernel that should * be timed. 
*/ void preLoopRequirements() override { - // Need to put A_ and B_ into A_armpl_ and B_armpl_ - toCSR_armpl(); /** providing hints to ARMPL and optimizing the matrix datastructures */ // TODO -- is noallocs best here? @@ -162,7 +160,7 @@ class spgemv_cpu : public spgemv<T> { flags_ = 0; // Move A to CSR - A_armpl_row_ptr_ = new armpl_int_t[n_ + 1]; + A_armpl_row_ptr_ = new armpl_int_t[m_ + 1]; A_armpl_col_index_ = new armpl_int_t[nnz_]; A_vals_ = new T[nnz_]; A_armpl_row_ptr_[0] = 0; diff --git a/ArmPL/spmm.hh b/ArmPL/spmm.hh index 9680f09..889cb23 100644 --- a/ArmPL/spmm.hh +++ b/ArmPL/spmm.hh @@ -182,7 +182,7 @@ class spmm_cpu : public spmm<T> { private: /** Make call to the GEMM kernel. */ - void callGemm() override { + void callSpmm() override { /** * Flow of ARMPL Sparse LA: diff --git a/cuBLAS/spgemm.hh b/cuBLAS/spgemm.hh index d4233fd..73e1dfb 100644 --- a/cuBLAS/spgemm.hh +++ b/cuBLAS/spgemm.hh @@ -180,7 +180,7 @@ private: } } - void callGemm() override { + void callSpgemm() override { switch(offload_) { case gpuOffloadType::always: { // Clean up old descriptors diff --git a/cuBLAS/spmm.hh b/cuBLAS/spmm.hh index 249f1ea..8db845a 100644 --- a/cuBLAS/spmm.hh +++ b/cuBLAS/spmm.hh @@ -242,7 +242,7 @@ class spmm_gpu : public spmm<T> { } /** Make a call to the BLAS Library Kernel. */ - void callGemm() override { + void callSpmm() override { switch(offload_) { case gpuOffloadType::always: { if (C_mem_allocated_always_) { diff --git a/include/doSpgemm.hh b/include/doSpgemm.hh index b8d1d9b..be3a77b 100644 --- a/include/doSpgemm.hh +++ b/include/doSpgemm.hh @@ -9,7 +9,7 @@ #if defined CPU_ARMPL #include "../ArmPL/spgemm.hh" #elif defined CPU_ONEMKL -// Todo #include "../oneMKL/CPU/spgemm.hh" +#include "../oneMKL/CPU/spgemm.hh" #elif defined CPU_AOCL // Todo #include "../AOCL/spgemm.hh" #elif defined CPU_NVPL @@ -38,10 +38,10 @@ public: const int upperlimit, const bool cpuEnabled = true, const bool gpuEnabled = true) : CSV_DIR(csvDir), - iterations_(iterations), + iterations_(iters), startDimention_(startDim), - upperLimit_(upperLimit), - doCPU_(cpuEnables), + upperLimit_(upperlimit), + doCPU_(cpuEnabled), doGPU_(gpuEnabled) #if CPU_ENABLED , @@ -52,7 +52,7 @@ public: gpu_(iterations_) #endif { - static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>) && + static_assert((std::is_same_v<T, float> || std::is_same_v<T, double>) && "ERROR - doGemm can only be constructed using one of the " "following types: [float, double]."); } @@ -313,12 +313,12 @@ private: // Perform CPU kernel #if CPU_ENABLED if (doCPU_) { - cpu_.initialise(M, N, K); + cpu_.initialise(M, N, K, 0.99); cpuResult = cpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, - 0.0, iterations_, cpuResult.runtime, cpuResult.gflops); + 0.99, iterations_, cpuResult.runtime, cpuResult.gflops); } #endif diff --git a/include/doSpgemv.hh b/include/doSpgemv.hh index c2c6a3d..3162736 100644 --- a/include/doSpgemv.hh +++ b/include/doSpgemv.hh @@ -9,7 +9,7 @@ #if defined CPU_ARMPL #include "../ArmPL/spgemv.hh" #elif defined CPU_ONEMKL -// Todo #include "../oneMKL/CPU/spgemv.hh" +#include "../oneMKL/CPU/spgemv.hh" #elif defined CPU_AOCL // Todo #include "../AOCL/spgemv.hh" #elif defined CPU_NVPL diff --git a/include/doSpmm.hh b/include/doSpmm.hh index 51f3aba..3ac1e66 100644 --- a/include/doSpmm.hh +++ b/include/doSpmm.hh @@ -10,7 +10,7 @@ #if defined CPU_ARMPL #include "../ArmPL/spmm.hh" #elif defined CPU_ONEMKL 
-// Todo #include "../oneMKL/CPU/spmm.hh"
+#include "../oneMKL/CPU/spmm.hh"
 #elif defined CPU_AOCL
 // Todo #include "../AOCL/spmm.hh"
 #elif defined CPU_NVPL
@@ -236,7 +236,7 @@ private:
 
 #if CPU_ENABLED
   if (doCPU_) {
-    cpu_.initialise(N, sparsity);
+    cpu_.initialise(N, N, N, sparsity);
     time_checksum_gflop cpuResult = cpu_.compute();
     cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
     writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize,
@@ -249,19 +249,19 @@
   // - UNIFIED : data passed from host to device (and device to host) as
   //   needed
   if (doGPU_) {
-    gpu_.initialise(gpuOffloadType::unified, N, sparsity);
+    gpu_.initialise(gpuOffloadType::unified, N, N, N, sparsity);
     time_checksum_gflop gpuResult_unified = gpu_.compute();
     gpuResult_unified.gflops =
        calcGflops(flops, iterations_, gpuResult_unified.runtime);
 
     // - ALWAYS: Offload to/from GPU every iteration
-    gpu_.initialise(gpuOffloadType::always, N, sparsity);
+    gpu_.initialise(gpuOffloadType::always, N, N, N, sparsity);
     time_checksum_gflop gpuResult_always = gpu_.compute();
     gpuResult_always.gflops =
        calcGflops(flops, iterations_, gpuResult_always.runtime);
 
     // - ONCE : Offload to/from GPU once before all iterations and once
     //   after
-    gpu_.initialise(gpuOffloadType::once, N, sparsity);
+    gpu_.initialise(gpuOffloadType::once, N, N, N, sparsity);
     time_checksum_gflop gpuResult_once = gpu_.compute();
     gpuResult_once.gflops =
        calcGflops(flops, iterations_, gpuResult_once.runtime);
diff --git a/include/kernels/spgemm.hh b/include/kernels/spgemm.hh
index eb0594c..3aacf77 100644
--- a/include/kernels/spgemm.hh
+++ b/include/kernels/spgemm.hh
@@ -28,7 +28,7 @@ public:
     // perform the SpGEMM calls
     preLoopRequirements();
     for (int i = 0; i < iterations_; i++) {
-      callSpmm();
+      callSpgemm();
     }
     postLoopRequirements();
@@ -51,8 +51,8 @@ private:
    * should be timed */
   virtual void preLoopRequirements() = 0;
 
-  /** Perform the SPMM kernel. */
-  virtual void callSpmm() = 0;
+  /** Perform the sparse GEMM kernel.
+   */
+  virtual void callSpgemm() = 0;
 
   /** Perform any steps required after calling the SPMM kernel that should
@@ -71,16 +71,16 @@ protected:
   /** Set up the starting matrices */
   void initInputMatrices() {
-    for (size_t i = 0; i < (m_ * k_); i++) {
+    for (int i = 0; i < (m_ * k_); i++) {
       A_[i] = 0.0;
     }
 
     srand(SEED);
-    for (size_t i = 0; i < (k_ * n_); i++) {
+    for (int i = 0; i < (k_ * n_); i++) {
       B_[i] = (T)((double)(rand() % 100) / 7.0);
     }
 
-    for (size_t i = 0; i < (m_ * n_); i++) {
+    for (int i = 0; i < (m_ * n_); i++) {
       C_[i] = (T)0.0;
     }
@@ -91,7 +91,7 @@
     std::uniform_real_distribution<double> dist(0.0, 1.0);
 
     // Using a=0.45 and b=c=0.22 as default probabilities
-    for (size_t i = 0; i < nnz_; i++) {
+    for (int i = 0; i < nnz_; i++) {
       while (!rMat(A_, k_, 0, k_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
                    false)) {}
     }
diff --git a/include/kernels/spgemv.hh b/include/kernels/spgemv.hh
index 297b406..b07be26 100644
--- a/include/kernels/spgemv.hh
+++ b/include/kernels/spgemv.hh
@@ -72,7 +72,7 @@ private:
 protected:
   void initInputMatrixVector() {
     // Initialise matrix to 0
-    for (size_t i = 0; i < (n_ * m_); i++) {
+    for (int i = 0; i < (n_ * m_); i++) {
       A_[i] = 0.0;
     }
@@ -83,7 +83,7 @@
     std::uniform_real_distribution<double> dist(0.0, 1.0);
 
     // Using a=0.45 and b=c=0.22 as default probabilities
-    for (size_t i = 0; i < nnz_; i++) {
+    for (int i = 0; i < nnz_; i++) {
       while (!rMat(A_, m_, 0, n_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
                    false)) {}
     }
diff --git a/include/kernels/spmm.hh b/include/kernels/spmm.hh
index 28993c8..8dbb501 100644
--- a/include/kernels/spmm.hh
+++ b/include/kernels/spmm.hh
@@ -1,4 +1,4 @@
-#pragma one
+#pragma once
 
 #include <algorithm>
 #include <chrono>
@@ -70,13 +70,13 @@ protected:
   /** Set up the starting matrices */
   void initInputMatrices() {
-    for (size_t i = 0; i < (m_ * k_); i++) {
+    for (int i = 0; i < (m_ * k_); i++) {
       A_[i] = 0.0;
     }
-    for (size_t i = 0; i < (k_ * n_); i++) {
+    for (int i = 0; i < (k_ * n_); i++) {
       B_[i] = 0.0;
     }
-    for (size_t i = 0; i < (m_ * n_); i++) {
+    for (int i = 0; i < (m_ * n_); i++) {
       C_[i] = 0.0;
     }
@@ -87,11 +87,11 @@
     std::uniform_real_distribution<double> dist(0.0, 1.0);
 
     // Using a=0.45 and b=c=0.22 as default probabilities
-    for (size_t i = 0; i < nnzA_; i++) {
+    for (int i = 0; i < nnzA_; i++) {
       while (!rMat(A_, k_, 0, k_ - 1, 0, m_ - 1, 0.45, 0.22, 0.22, &gen, dist,
                    false)) {}
     }
-    for (size_t i = 0; i < nnzB_; i++) {
+    for (int i = 0; i < nnzB_; i++) {
       while (!rMat(B_, n_, 0, n_ - 1, 0, k_ - 1, 0.45, 0.22, 0.22, &gen, dist,
                    false)) {}
     }
diff --git a/oneMKL/CPU/spgemm.hh b/oneMKL/CPU/spgemm.hh
new file mode 100644
index 0000000..318bdb2
--- /dev/null
+++ b/oneMKL/CPU/spgemm.hh
@@ -0,0 +1,177 @@
+#pragma once
+
+#ifdef CPU_ONEMKL
+#include <mkl.h>
+
+#include <algorithm>
+
+#include "../../include/kernels/CPU/spgemm.hh"
+#include "../../include/utilities.hh"
+
+namespace cpu {
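+
+// Layout note (illustrative): oneMKL's inspector-executor API uses a
+// four-array CSR variant with separate row-start and row-end pointers.
+// Assuming zero-based indexing, the 3x3 matrix
+//   [5 0 0]
+//   [0 0 8]
+//   [0 6 0]
+// would be stored as
+//   values     = {5, 8, 6}
+//   columns    = {0, 2, 1}
+//   rows_start = {0, 1, 2}
+//   rows_end   = {1, 2, 3}
+// toSparseFormat() below builds exactly these arrays (A_vals_, A_cols_,
+// A_rowsb_, A_rowse_) from the dense input matrix A_.
+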
+/** A class for sparse matrix-dense matrix BLAS kernels. */
+template <typename T>
+class spgemm_cpu : public spgemm<T> {
+public:
+  using spgemm<T>::spgemm;
+  using spgemm<T>::callConsume;
+  using spgemm<T>::initInputMatrices;
+  using spgemm<T>::m_;
+  using spgemm<T>::n_;
+  using spgemm<T>::k_;
+  using spgemm<T>::A_;
+  using spgemm<T>::B_;
+  using spgemm<T>::C_;
+  using spgemm<T>::sparsity_;
+  using spgemm<T>::nnz_;
+
+  void initialise(int m, int n, int k, double sparsity,
+                  bool binary = false) {
+    // Keep the base-class dimensions and the MKL_INT copies in sync; the
+    // buffers below are sized from m_/n_/k_
+    m_ = m;
+    n_ = n;
+    k_ = k;
+    m_mkl_ = (MKL_INT)m;
+    n_mkl_ = (MKL_INT)n;
+    k_mkl_ = (MKL_INT)k;
+
+    sparsity_ = sparsity;
+
+    /** Determine the number of nnz elements in A */
+    nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
+
+    A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64);
+    B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64);
+    C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64);
+
+    initInputMatrices();
+  }
+
+protected:
+  void toSparseFormat() override {
+    A_vals_ = new T[nnz_];
+    A_cols_ = new MKL_INT[nnz_];
+    A_rowsb_ = new MKL_INT[m_ + 1];
+    A_rowse_ = new MKL_INT[m_ + 1];
+
+    int nnz_encountered = 0;
+
+    // rows_start[row] indexes the first non-zero of each row and
+    // rows_end[row] indexes one past its last non-zero
+    for (int row = 0; row < m_; row++) {
+      A_rowsb_[row] = nnz_encountered;
+      for (int col = 0; col < k_; col++) {
+        if (A_[(row * k_) + col] != 0.0) {
+          A_cols_[nnz_encountered] = col;
+          A_vals_[nnz_encountered] = static_cast<T>(A_[(row * k_) + col]);
+          nnz_encountered++;
+        }
+      }
+      A_rowse_[row] = nnz_encountered;
+    }
+  }
+
+private:
+  void callSpgemm() override {
+    /**
+     * Using:
+     * sparse_status_t mkl_sparse_s_mm (
+     *   const sparse_operation_t operation,
+     *   const float alpha,
+     *   const sparse_matrix_t A,
+     *   const struct matrix_descr descr,
+     *   const sparse_layout_t layout,
+     *   const float *B,
+     *   const MKL_INT columns,
+     *   const MKL_INT ldb,
+     *   const float beta,
+     *   float *C,
+     *   const MKL_INT ldc);
+     */
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = mkl_sparse_s_mm(operation_, alpha, A_csr_, description_,
+                                layout_, B_, n_mkl_, k_mkl_, beta, C_,
+                                m_mkl_);
+    } else if constexpr (std::is_same_v<T, double>) {
+      status_ = mkl_sparse_d_mm(operation_, alpha, A_csr_, description_,
+                                layout_, B_, n_mkl_, k_mkl_, beta, C_,
+                                m_mkl_);
+    } else {
+      // Un-specialised class will not do any work - print error and exit.
+      std::cout << "ERROR - Datatype for OneMKL CPU SpGEMM kernel not "
+                   "supported."
+                << std::endl;
+      exit(1);
+    }
+
+    callConsume();
+  }
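+
+  // Parameter note for the calls above (illustrative): with
+  // SPARSE_LAYOUT_COLUMN_MAJOR the dense operands are read column-major, so
+  // `columns` is N (the columns of B and C), ldb is K (the rows of B) and
+  // ldc is M (the rows of C), matching the buffers allocated in initialise().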
+
+  void preLoopRequirements() override {
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = mkl_sparse_s_create_csr(&A_csr_,
+                                        indexing_,
+                                        m_,
+                                        k_,
+                                        A_rowsb_,
+                                        A_rowse_,
+                                        A_cols_,
+                                        A_vals_);
+      if (status_ != SPARSE_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    } else if constexpr (std::is_same_v<T, double>) {
+      status_ = mkl_sparse_d_create_csr(&A_csr_,
+                                        indexing_,
+                                        m_,
+                                        k_,
+                                        A_rowsb_,
+                                        A_rowse_,
+                                        A_cols_,
+                                        A_vals_);
+      if (status_ != SPARSE_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    }
+  }
+
+  void postLoopRequirements() override {
+    status_ = mkl_sparse_destroy(A_csr_);
+    if (status_ != SPARSE_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+  }
+
+  void postCallKernelCleanup() override {
+    mkl_free(A_);
+    mkl_free(B_);
+    mkl_free(C_);
+  }
+
+  sparse_status_t status_;
+
+  sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO;
+  sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE;
+  // Todo -- investigate if other options for description_ improve performance
+  matrix_descr description_ = {SPARSE_MATRIX_TYPE_GENERAL,
+                               SPARSE_FILL_MODE_LOWER,
+                               SPARSE_DIAG_NON_UNIT};
+  sparse_layout_t layout_ = SPARSE_LAYOUT_COLUMN_MAJOR;
+
+  MKL_INT m_mkl_;
+  MKL_INT n_mkl_;
+  MKL_INT k_mkl_;
+
+  T* A_vals_;
+  MKL_INT* A_cols_;
+  MKL_INT* A_rowsb_;
+  MKL_INT* A_rowse_;
+
+  sparse_matrix_t A_csr_;
+
+  const T alpha = ALPHA;
+  const T beta = BETA;
+};
+}
+
+
+#endif
\ No newline at end of file
diff --git a/oneMKL/CPU/spgemv.hh b/oneMKL/CPU/spgemv.hh
new file mode 100644
index 0000000..bac5e32
--- /dev/null
+++ b/oneMKL/CPU/spgemv.hh
@@ -0,0 +1,155 @@
+#pragma once
+
+#ifdef CPU_ONEMKL
+#include <mkl.h>
+
+#include <algorithm>
+
+#include "../../include/kernels/CPU/spgemv.hh"
+#include "../../include/utilities.hh"
+
+namespace cpu {
+template <typename T>
+class spgemv_cpu : public spgemv<T> {
+public:
+  using spgemv<T>::spgemv;
+  using spgemv<T>::callConsume;
+  using spgemv<T>::initInputMatrices;
+  using spgemv<T>::m_;
+  using spgemv<T>::n_;
+  using spgemv<T>::A_;
+  using spgemv<T>::x_;
+  using spgemv<T>::y_;
+  using spgemv<T>::sparsity_;
+  using spgemv<T>::nnz_;
+
+  void initialise(int m, int n, double sparsity) {
+    m_ = m;
+    n_ = n;
+    sparsity_ = sparsity;
+
+    nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_));
+
+    A_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64);
+    x_ = (T*)mkl_malloc(sizeof(T) * n_, 64);
+    y_ = (T*)mkl_malloc(sizeof(T) * m_, 64);
+
+    initInputMatrices();
+  }
+
+protected:
+  void toSparseFormat() override {
+    A_vals_ = new T[nnz_];
+    A_cols_ = new MKL_INT[nnz_];
+    A_rowsb_ = new MKL_INT[m_ + 1];
+    A_rowse_ = new MKL_INT[m_ + 1];
+
+    int nnz_encountered = 0;
+
+    // As in spgemm.hh: rows_start[row] indexes the first non-zero of each
+    // row, rows_end[row] one past its last
+    for (int row = 0; row < m_; row++) {
+      A_rowsb_[row] = nnz_encountered;
+      for (int col = 0; col < n_; col++) {
+        if (A_[(row * n_) + col] != 0.0) {
+          A_cols_[nnz_encountered] = col;
+          A_vals_[nnz_encountered] = static_cast<T>(A_[(row * n_) + col]);
+          nnz_encountered++;
+        }
+      }
+      A_rowse_[row] = nnz_encountered;
+    }
+  }
+
+private:
+
+  void callGemv() override {
+    /**
+     * sparse_status_t mkl_sparse_s_mv (
+     *   const sparse_operation_t operation,
+     *   const float alpha,
+     *   const sparse_matrix_t A,
+     *   const struct matrix_descr descr,
+     *   const float *x,
+     *   const float beta,
+     *   float *y);
+     */
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = mkl_sparse_s_mv(operation_, alpha, A_csr_, description_, x_,
+                                beta, y_);
+    } else if constexpr (std::is_same_v<T, double>) {
+      status_ = mkl_sparse_d_mv(operation_, alpha, A_csr_, description_, x_,
+                                beta, y_);
+    }
+    if (status_ != SPARSE_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    callConsume();
+  }
+
+  void preLoopRequirements() override {
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = mkl_sparse_s_create_csr(&A_csr_,
+                                        indexing_,
+                                        m_,
+                                        n_,
+                                        A_rowsb_,
+                                        A_rowse_,
+                                        A_cols_,
+                                        A_vals_);
+    } else if constexpr (std::is_same_v<T, double>) {
+      status_ = mkl_sparse_d_create_csr(&A_csr_,
+                                        indexing_,
+                                        m_,
+                                        n_,
+                                        A_rowsb_,
+                                        A_rowse_,
+                                        A_cols_,
+                                        A_vals_);
+    }
+    if (status_ != SPARSE_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+
+  }
+
+  void postLoopRequirements() override {
+    status_ = mkl_sparse_destroy(A_csr_);
+    if (status_ != SPARSE_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+  }
+
+  void postCallKernelCleanup() override {
+    mkl_free(A_);
+    mkl_free(x_);
+    mkl_free(y_);
+  }
+
+  sparse_status_t status_;
+
+  sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO;
+  sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE;
+  // mkl_sparse_?_mv takes a full matrix_descr struct, as in spgemm.hh
+  matrix_descr description_ = {SPARSE_MATRIX_TYPE_GENERAL,
+                               SPARSE_FILL_MODE_LOWER,
+                               SPARSE_DIAG_NON_UNIT};
+
+  MKL_INT m_mkl_;
+  MKL_INT n_mkl_;
+
+  T* A_vals_;
+  MKL_INT* A_cols_;
+  MKL_INT* A_rowsb_;
+  MKL_INT* A_rowse_;
+
+  sparse_matrix_t A_csr_;
+
+  const T alpha = ALPHA;
+  const T beta = BETA;
+};
+}
+
+
+#endif
diff --git a/oneMKL/CPU/spmm.hh b/oneMKL/CPU/spmm.hh
new file mode 100644
index 0000000..936aeb5
--- /dev/null
+++ b/oneMKL/CPU/spmm.hh
@@ -0,0 +1,228 @@
+#pragma once
+
+#ifdef CPU_ONEMKL
+#include <mkl.h>
+#include <mkl_spblas.h>
+
+#include <algorithm>
+
+#include "../../include/kernels/CPU/spmm.hh"
+#include "../../include/utilities.hh"
+
+namespace cpu {
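+
+// Usage note (illustrative): unlike the typed sparse-times-dense
+// mkl_sparse_?_mm calls used for SpGEMM, mkl_sparse_spmm is untyped at the
+// call site - it computes C = op(A) * B from two CSR handles and allocates a
+// new internal handle for C, which must later be released with
+// mkl_sparse_destroy. If the explicit CSR arrays of the result were needed
+// (this benchmark does not use them), they could be retrieved with
+// mkl_sparse_?_export_csr.
+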
+/** A class for sparse matrix-sparse matrix CPU BLAS kernels. */
+template <typename T>
+class spmm_cpu : public spmm<T> {
+public:
+  using spmm<T>::spmm;
+  using spmm<T>::initInputMatrices;
+  using spmm<T>::callConsume;
+  using spmm<T>::m_;
+  using spmm<T>::n_;
+  using spmm<T>::k_;
+  using spmm<T>::A_;
+  using spmm<T>::B_;
+  using spmm<T>::C_;
+  using spmm<T>::sparsity_;
+  using spmm<T>::nnzA_;
+  using spmm<T>::nnzB_;
+
+  void initialise(int m, int n, int k, double sparsity,
+                  bool binary = false) {
+    // Keep the base-class dimensions and the MKL_INT copies in sync; the
+    // buffers below are sized from m_/n_/k_
+    m_ = m;
+    n_ = n;
+    k_ = k;
+    m_mkl_ = (MKL_INT)m;
+    n_mkl_ = (MKL_INT)n;
+    k_mkl_ = (MKL_INT)k;
+
+    sparsity_ = sparsity;
+
+    /** Determine the number of nnz elements in A and B */
+    nnzA_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
+    nnzB_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_));
+
+    A_ = (T*)mkl_malloc(sizeof(T) * m_ * k_, 64);
+    B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64);
+    C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64);
+
+    initInputMatrices();
+  }
+
+protected:
+  void toSparseFormat() override {
+    A_vals_ = new T[nnzA_];
+    A_cols_ = new MKL_INT[nnzA_];
+    A_rowsb_ = new MKL_INT[m_ + 1];
+    A_rowse_ = new MKL_INT[m_ + 1];
+
+    int nnz_encountered = 0;
+
+    // rows_start[row] indexes the first non-zero of each row and
+    // rows_end[row] indexes one past its last non-zero
+    for (int row = 0; row < m_; row++) {
+      A_rowsb_[row] = nnz_encountered;
+      for (int col = 0; col < k_; col++) {
+        if (A_[(row * k_) + col] != 0.0) {
+          A_cols_[nnz_encountered] = col;
+          A_vals_[nnz_encountered] = static_cast<T>(A_[(row * k_) + col]);
+          nnz_encountered++;
+        }
+      }
+      A_rowse_[row] = nnz_encountered;
+    }
+
+    B_vals_ = new T[nnzB_];
+    B_cols_ = new MKL_INT[nnzB_];
+    B_rowsb_ = new MKL_INT[k_ + 1];
+    B_rowse_ = new MKL_INT[k_ + 1];
+
+    nnz_encountered = 0;
+
+    for (int row = 0; row < k_; row++) {
+      B_rowsb_[row] = nnz_encountered;
+      for (int col = 0; col < n_; col++) {
+        if (B_[(row * n_) + col] != 0.0) {
+          B_cols_[nnz_encountered] = col;
+          B_vals_[nnz_encountered] = static_cast<T>(B_[(row * n_) + col]);
+          nnz_encountered++;
+        }
+      }
+      B_rowse_[row] = nnz_encountered;
+    }
+  }
+
+private:
+  void callSpmm() override {
+    /**
+     * sparse_status_t mkl_sparse_spmm (
+     *   const sparse_operation_t operation,
+     *   const sparse_matrix_t A,
+     *   const sparse_matrix_t B,
+     *   sparse_matrix_t *C);
+     */
+    status_ = mkl_sparse_spmm(operation_, A_csr_, B_csr_, &C_csr_);
+    if (status_ != SPARSE_STATUS_SUCCESS) {
+      std::cout << "ERROR " << status_ << std::endl;
+      exit(1);
+    }
+    callConsume();
+  }
+
+  void preLoopRequirements() override {
+    if constexpr (std::is_same_v<T, float>) {
+      status_ = mkl_sparse_s_create_csr(&A_csr_,
+                                        indexing_,
+                                        m_,
+                                        k_,
+                                        A_rowsb_,
+                                        A_rowse_,
+                                        A_cols_,
+                                        A_vals_);
+      if (status_ != SPARSE_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+
+      status_ = mkl_sparse_s_create_csr(&B_csr_,
+                                        indexing_,
+                                        k_,
+                                        n_,
+                                        B_rowsb_,
+                                        B_rowse_,
+                                        B_cols_,
+                                        B_vals_);
+      if (status_ != SPARSE_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    } else if constexpr (std::is_same_v<T, double>) {
+      status_ = mkl_sparse_d_create_csr(&A_csr_,
+                                        indexing_,
+                                        m_,
+                                        k_,
+                                        A_rowsb_,
+                                        A_rowse_,
+                                        A_cols_,
+                                        A_vals_);
+      if (status_ != SPARSE_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+
+      status_ = mkl_sparse_d_create_csr(&B_csr_,
+                                        indexing_,
+                                        k_,
+                                        n_,
+                                        B_rowsb_,
+                                        B_rowse_,
+                                        B_cols_,
+                                        B_vals_);
+      if (status_ != SPARSE_STATUS_SUCCESS) {
+        std::cout << "ERROR " << status_ << std::endl;
+        exit(1);
+      }
+    }
+  }
+  void postLoopRequirements() override {
+    status_ = mkl_sparse_destroy(A_csr_);
+    if (status_ != 
SPARSE_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = mkl_sparse_destroy(B_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = mkl_sparse_destroy(C_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } + void postCallKernelCleanup() override { + mkl_free(A_); + mkl_free(B_); + mkl_free(C_); + } + + sparse_status_t status_; + + sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO; + sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE; + + MKL_INT m_mkl_; + MKL_INT n_mkl_; + MKL_INT k_mkl_; + + T* A_vals_; + MKL_INT* A_cols_; + MKL_INT* A_rowsb_; + MKL_INT* A_rowse_; + + T* B_vals_; + MKL_INT* B_cols_; + MKL_INT* B_rowsb_; + MKL_INT* B_rowse_; + + T* C_vals_; + MKL_INT* C_cols_; + MKL_INT* C_rowsb_; + MKL_INT* C_rowse_; + + sparse_matrix_t A_csr_; + sparse_matrix_t B_csr_; + sparse_matrix_t C_csr_; + + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + + +#endif \ No newline at end of file