Skip to content

Commit

Permalink
use 1D arrays as matrices
Browse files Browse the repository at this point in the history
  • Loading branch information
mehmetyusufoglu committed Nov 22, 2024
1 parent 6b29767 commit a0dee21
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 89 deletions.
21 changes: 13 additions & 8 deletions example/useBLASInAlpaka/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# Copyright 2023 Benjamin Worpitz, Jan Stephan
# SPDX-License-Identifier: ISC
#

################################################################################
# Required CMake version.

Expand All @@ -17,12 +16,6 @@ set(_TARGET_NAME useBLASInAlpaka)

project(${_TARGET_NAME} LANGUAGES CXX)



# Add cuBLAS library
find_package(CUDA REQUIRED)
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} cublas)

#-------------------------------------------------------------------------------
# Find alpaka.

Expand All @@ -38,6 +31,19 @@ if(NOT TARGET alpaka::alpaka)
endif()
endif()

# This example calls cuBLAS and therefore only makes sense with alpaka's CUDA
# back-end. Skip it entirely when that back-end is disabled, so that the
# find_package(CUDA REQUIRED) below cannot abort the configure step and no
# target is created that could never compile.
# NOTE(review): alpaka_ACC_GPU_CUDA_ENABLE is the back-end switch exposed by
# alpaka's CMake configuration - confirm against the alpaka version in use.
if(NOT alpaka_ACC_GPU_CUDA_ENABLE)
    message(WARNING "Skipping build of '${_TARGET_NAME}' because the alpaka CUDA back-end (alpaka_ACC_GPU_CUDA_ENABLE) is not enabled.")
    # Stop processing this CMakeLists.txt; nothing below is evaluated.
    return()
endif()

# Add cuBLAS library: append it to the CUDA libraries that the example target
# links against further down via ${CUDA_LIBRARIES}.
find_package(CUDA REQUIRED)
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} cublas)

#-------------------------------------------------------------------------------
# Add executable.

Expand All @@ -47,7 +53,6 @@ alpaka_add_executable(
target_link_libraries(
${_TARGET_NAME}
PUBLIC alpaka::alpaka ${CUDA_LIBRARIES})

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER example)
set_target_properties(${_TARGET_NAME} PROPERTIES CUDA_STANDARD 14)
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME})
162 changes: 81 additions & 81 deletions example/useBLASInAlpaka/src/useBLASInAlpaka.cpp
Original file line number Diff line number Diff line change
@@ -1,37 +1,39 @@
#include <alpaka/alpaka.hpp>

#include <cublas_v2.h>
#include <iostream>

#include <cmath>
#include <iostream>

// Index type
using Idx = std::size_t;
// Set data type
using DataType = float;

// Initialize the matrix in column-major order
template<typename TMdSpan>
inline void initializeMatrix(TMdSpan& span, int value) {
auto const numRows = span.extent(0);
auto const numCols = span.extent(1);
for (Idx j = 0; j < numCols; ++j) {
for (Idx i = 0; i < numRows; ++i) {
span(i, j) = static_cast<DataType>(value);
// Fill every element of a rows x cols matrix with `value` (converted to DataType).
// The matrix is stored column-major in a flat 1D buffer: element (r, c) sits at
// offset r + c * rows, so each column occupies `rows` consecutive entries.
void initializeMatrix(DataType* buffer, Idx rows, Idx cols, int value)
{
    // Convert once up front; the same value is written to every element.
    auto const fillValue = static_cast<DataType>(value);

    for(Idx col = 0; col < cols; ++col)
    {
        // Start of the current column inside the flat buffer.
        DataType* const column = buffer + col * rows;
        for(Idx row = 0; row < rows; ++row)
        {
            column[row] = fillValue;
        }
    }
}

int main() {
using Dim = alpaka::DimInt<2>;
int main()
{
using Dim1D = alpaka::DimInt<1>;

// Define matrix dimensions, A is MxK and B is KxN
Idx const M = 3; // Rows in A and C
// Define matrix dimensions, A is MxK and B is KxN
Idx const M = 4; // Rows in A and C
Idx const N = 2; // Columns in B and C
Idx const K = 1; // Columns in A and rows in B
Idx const K = 3; // Columns in A and rows in B

// Define device and queue
using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
// Define device and queue
using Acc = alpaka::AccGpuCudaRt<Dim1D, Idx>;
using Queue = alpaka::Queue<Acc, alpaka::Blocking>;
using Vec = alpaka::Vec<Dim, Idx>;

auto const platformHost = alpaka::PlatformCpu{};
auto const devHost = alpaka::getDevByIdx(platformHost, 0);
Expand All @@ -40,127 +42,125 @@ int main() {

Queue queue(devAcc);

// Define the 2D extents (dimensions)
Vec const extentA(static_cast<Idx>(M), static_cast<Idx>(K));
Vec const extentB(static_cast<Idx>(K), static_cast<Idx>(N));
Vec const extentC(static_cast<Idx>(M), static_cast<Idx>(N));

// Allocate host memory
auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, extentA);
auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, extentB);
auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, extentC);
// Allocate 1D host memory
auto bufHostA = alpaka::allocBuf<DataType, Idx>(devHost, M * K);
auto bufHostB = alpaka::allocBuf<DataType, Idx>(devHost, K * N);
auto bufHostC = alpaka::allocBuf<DataType, Idx>(devHost, M * N);

// Create mdspan views for host buffers
auto mdHostA = alpaka::experimental::getMdSpan(bufHostA);
auto mdHostB = alpaka::experimental::getMdSpan(bufHostB);
auto mdHostC = alpaka::experimental::getMdSpan(bufHostC);
DataType* hostA = alpaka::getPtrNative(bufHostA);
DataType* hostB = alpaka::getPtrNative(bufHostB);
DataType* hostC = alpaka::getPtrNative(bufHostC);

// Initialize host matrices
initializeMatrix(mdHostA, 1); // All elements in A are 1
initializeMatrix(mdHostB, 2); // All elements in B are 2
// Initialize host matrices
initializeMatrix(hostA, M, K, 1); // All elements in A are 1
initializeMatrix(hostB, K, N, 2); // All elements in B are 2
std::fill(hostC, hostC + (M * N), 0); // Initialize C with 0s

// Print initialized matrices on the host
// Print initialized matrices
std::cout << "Matrix A (Host):" << std::endl;
for (Idx i = 0; i < M; ++i) {
for (Idx j = 0; j < K; ++j) {
std::cout << mdHostA(i, j) << "";
for(Idx i = 0; i < M; ++i)
{
for(Idx j = 0; j < K; ++j)
{
std::cout << hostA[i + j * M] << " ";
}
std::cout << std::endl;
}

std::cout << "Matrix B (Host):" << std::endl;
for (Idx i = 0; i < K; ++i) {
for (Idx j = 0; j < N; ++j) {
std::cout << mdHostB(i, j) << "";
for(Idx i = 0; i < K; ++i)
{
for(Idx j = 0; j < N; ++j)
{
std::cout << hostB[i + j * K] << " ";
}
std::cout << std::endl;
}

// Allocate device memory
auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, extentA);
auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, extentB);
auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, extentC);
// Allocate 1D device memory
auto bufDevA = alpaka::allocBuf<DataType, Idx>(devAcc, M * K);
auto bufDevB = alpaka::allocBuf<DataType, Idx>(devAcc, K * N);
auto bufDevC = alpaka::allocBuf<DataType, Idx>(devAcc, M * N);

// Copy data to device
// Copy data to device
alpaka::memcpy(queue, bufDevA, bufHostA);
alpaka::memcpy(queue, bufDevB, bufHostB);
alpaka::memcpy(queue, bufDevC, bufHostC); // Initialize device C with zeros
alpaka::wait(queue);




std::cout << "Copied matrices A and B to the device." << std::endl;

// Get the native CUDA stream from Alpaka queue
// Get the native CUDA stream from Alpaka queue
auto alpakaStream = alpaka::getNativeHandle(queue);

// cuBLAS setup
// cuBLAS setup
cublasHandle_t cublasHandle;
cublasCreate(&cublasHandle);
cublasSetStream(cublasHandle, alpakaStream);
auto pitchA = alpaka::getPitchesInBytes(bufDevA);
auto pitchB = alpaka::getPitchesInBytes(bufDevB);
auto pitchC = alpaka::getPitchesInBytes(bufDevC);

std::cout << "pitchA" << pitchA << std::endl;
std::cout << "pitchB" << pitchB << std::endl;
std::cout << "pitchC" << pitchC << std::endl;

// Perform matrix multiplication: C = A * B
// Perform matrix multiplication: C = A * B
float alpha = 1.0f, beta = 0.0f; // Set beta to 0.0f to overwrite C
cublasSgemm(
cublasHandle,
CUBLAS_OP_N, CUBLAS_OP_N, // No transpose
M, N, K, // Dimensions
CUBLAS_OP_N,
CUBLAS_OP_N, // No transpose for A and B
M,
N,
K, // Dimensions: C = A * B
&alpha,
alpaka::getPtrNative(bufDevA), M, // Leading dimension (rows of A)
alpaka::getPtrNative(bufDevB), K, // Leading dimension (rows of B)
alpaka::getPtrNative(bufDevA),
M, // Leading dimension of A
alpaka::getPtrNative(bufDevB),
K, // Leading dimension of B
&beta,
alpaka::getPtrNative(bufDevC), M // Leading dimension (rows of C)
);



alpaka::getPtrNative(bufDevC),
M // Leading dimension of C
);

alpaka::wait(queue); // Wait for multiplication to complete
std::cout << "Matrix multiplication completed." << std::endl;

// Copy result back to host
// Copy result back to host
alpaka::memcpy(queue, bufHostC, bufDevC);
alpaka::wait(queue);
std::cout << "Copied result matrix C back to the host." << std::endl;

// Print result matrix C
// Print result matrix C
std::cout << "Matrix C (Host):" << std::endl;
for (Idx i = 0; i < M; ++i) {
for (Idx j = 0; j < N; ++j) {
std::cout << mdHostC(i, j) << " ";
for(Idx i = 0; i < M; ++i)
{
for(Idx j = 0; j < N; ++j)
{
std::cout << hostC[i + j * M] << " ";
}
std::cout << std::endl;
}

// Verify the result
// Verify the result
bool success = true;
DataType expectedValue = 2 * K; // Expected value for all elements in C
for (Idx i = 0; i < M; ++i) {
for (Idx j = 0; j < N; ++j) {
if (std::fabs(mdHostC(i, j) - expectedValue) > 1e-5f) { // Allow small floating-point errors
std::cout << "Mismatch at (" << i << ", " << j << "): "
<< mdHostC(i, j) << " != " << expectedValue << std::endl;
for(Idx i = 0; i < M; ++i)
{
for(Idx j = 0; j < N; ++j)
{
if(std::fabs(hostC[i + j * M] - expectedValue) > 1e-5f)
{ // Allow small floating-point errors
std::cout << "Mismatch at (" << i << ", " << j << "): " << hostC[i + j * M] << " != " << expectedValue
<< std::endl;
success = false;
}
}
}

std::cout << "Multiplication of matrices of size " << M << "x" << K << " and " << K << "x" << N
<< " using mdspan " << (success ? "succeeded" : "failed") << "!" << std::endl;
<< (success ? " succeeded!" : " failed!") << std::endl;

if (!success) {
if(!success)
{
return EXIT_FAILURE;
}

// Cleanup cuBLAS
// Cleanup cuBLAS
cublasDestroy(cublasHandle);

return EXIT_SUCCESS;
Expand Down

0 comments on commit a0dee21

Please sign in to comment.