Skip to content

Commit

Permalink
set and read the alpaka buffer directly in examples (#2271)
Browse files Browse the repository at this point in the history
* set and get alpaka buf directly in examples

* replace getPtrNative with std::data in examples, integration tests, and one unit test
  • Loading branch information
mehmetyusufoglu authored May 27, 2024
1 parent 9b15e66 commit 887aee9
Show file tree
Hide file tree
Showing 18 changed files with 82 additions and 125 deletions.
51 changes: 9 additions & 42 deletions benchmarks/babelstream/src/AlpakaStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,8 @@ void AlpakaStream<T>::init_arrays(T initA, T initB, T initC)
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(
queue,
workdiv,
InitKernel{},
alpaka::getPtrNative(d_a),
alpaka::getPtrNative(d_b),
alpaka::getPtrNative(d_c),
initA,
initB,
initC);
alpaka::exec<
Acc>(queue, workdiv, InitKernel{}, std::data(d_a), std::data(d_b), std::data(d_c), initA, initB, initC);
alpaka::wait(queue);
}

Expand All @@ -87,7 +79,7 @@ void AlpakaStream<T>::copy()
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(queue, workdiv, CopyKernel{}, alpaka::getPtrNative(d_a), alpaka::getPtrNative(d_c));
alpaka::exec<Acc>(queue, workdiv, CopyKernel{}, std::data(d_a), std::data(d_c));
alpaka::wait(queue);
}

Expand All @@ -107,7 +99,7 @@ void AlpakaStream<T>::mul()
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(queue, workdiv, MulKernel{}, alpaka::getPtrNative(d_b), alpaka::getPtrNative(d_c));
alpaka::exec<Acc>(queue, workdiv, MulKernel{}, std::data(d_b), std::data(d_c));
alpaka::wait(queue);
}

Expand All @@ -126,13 +118,7 @@ void AlpakaStream<T>::add()
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(
queue,
workdiv,
AddKernel{},
alpaka::getPtrNative(d_a),
alpaka::getPtrNative(d_b),
alpaka::getPtrNative(d_c));
alpaka::exec<Acc>(queue, workdiv, AddKernel{}, std::data(d_a), std::data(d_b), std::data(d_c));
alpaka::wait(queue);
}

Expand All @@ -152,13 +138,7 @@ void AlpakaStream<T>::triad()
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(
queue,
workdiv,
TriadKernel{},
alpaka::getPtrNative(d_a),
alpaka::getPtrNative(d_b),
alpaka::getPtrNative(d_c));
alpaka::exec<Acc>(queue, workdiv, TriadKernel{}, std::data(d_a), std::data(d_b), std::data(d_c));
alpaka::wait(queue);
}

Expand All @@ -178,13 +158,7 @@ void AlpakaStream<T>::nstream()
{
auto const workdiv = WorkDiv{arraySize / blockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, arraySize);
alpaka::exec<Acc>(
queue,
workdiv,
NstreamKernel{},
alpaka::getPtrNative(d_a),
alpaka::getPtrNative(d_b),
alpaka::getPtrNative(d_c));
alpaka::exec<Acc>(queue, workdiv, NstreamKernel{}, std::data(d_a), std::data(d_b), std::data(d_c));
alpaka::wait(queue);
}

Expand Down Expand Up @@ -224,18 +198,11 @@ auto AlpakaStream<T>::dot() -> T
{
auto const workdiv = WorkDiv{dotBlockSize, blockSize, 1};
// auto const workdiv = alpaka::getValidWorkDiv(devAcc, dotBlockSize * blockSize);
alpaka::exec<Acc>(
queue,
workdiv,
DotKernel{},
alpaka::getPtrNative(d_a),
alpaka::getPtrNative(d_b),
alpaka::getPtrNative(d_sum),
arraySize);
alpaka::exec<Acc>(queue, workdiv, DotKernel{}, std::data(d_a), std::data(d_b), std::data(d_sum), arraySize);
alpaka::wait(queue);

alpaka::memcpy(queue, sums, d_sum);
T const* sumPtr = alpaka::getPtrNative(sums);
T const* sumPtr = std::data(sums);
// TODO(bgruber): replace by std::reduce, when gcc 9.3 is the baseline
return std::accumulate(sumPtr, sumPtr + dotBlockSize, T{0});
}
Expand Down
6 changes: 3 additions & 3 deletions example/convolution1D/src/convolution1D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,9 @@ auto main() -> int
ConvolutionKernel convolutionKernel;

// Native pointers needed for the kernel execution function
DataType* nativeFilterDeviceMemory = alpaka::getPtrNative(filterDeviceMemory);
DataType* nativeInputDeviceMemory = alpaka::getPtrNative(inputDeviceMemory);
DataType* nativeOutputDeviceMemory = alpaka::getPtrNative(outputDeviceMemory);
DataType* nativeFilterDeviceMemory = std::data(filterDeviceMemory);
DataType* nativeInputDeviceMemory = std::data(inputDeviceMemory);
DataType* nativeOutputDeviceMemory = std::data(outputDeviceMemory);

// Run the kernel
alpaka::exec<DevAcc>(
Expand Down
6 changes: 3 additions & 3 deletions example/convolution2D/src/convolution2D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,11 +314,11 @@ auto main() -> int
queueAcc,
workDiv,
convolutionKernel2D,
alpaka::getPtrNative(bufInputAcc),
alpaka::getPtrNative(outputDeviceMemory),
std::data(bufInputAcc),
std::data(outputDeviceMemory),
matrixWidth,
matrixHeight,
alpaka::getPtrNative(bufFilterAcc),
std::data(bufFilterAcc),
filterWidth,
intputWidthAllocated,
filterWidthAllocated);
Expand Down
4 changes: 0 additions & 4 deletions example/counterBasedRng/src/counterBasedRng.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,6 @@ auto main() -> int
auto bufHost(alpaka::allocBuf<Data, Idx>(devHost, extent));
auto bufHostDev(alpaka::allocBuf<Data, Idx>(devHost, extent));

// Initialize the host input vectors A and B
Data* const pBufHost(alpaka::getPtrNative(bufHost));
Data* const pBufHostDev(alpaka::getPtrNative(bufHostDev));

std::random_device rd{};
CounterBasedRngKernel::Key key = {rd(), rd()};

Expand Down
8 changes: 4 additions & 4 deletions example/heatEquation/src/heatEquation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,16 +117,16 @@ auto main() -> int
// This buffer will hold the current values (used for the next step)
auto uCurrBufHost = alpaka::allocBuf<double, Idx>(devHost, extent);

double* const pCurrHost = alpaka::getPtrNative(uCurrBufHost);
double* const pNextHost = alpaka::getPtrNative(uNextBufHost);
double* const pCurrHost = std::data(uCurrBufHost);
double* const pNextHost = std::data(uNextBufHost);

// Accelerator buffer
using BufAcc = alpaka::Buf<Acc, double, Dim, Idx>;
auto uNextBufAcc = BufAcc{alpaka::allocBuf<double, Idx>(devAcc, extent)};
auto uCurrBufAcc = BufAcc{alpaka::allocBuf<double, Idx>(devAcc, extent)};

double* pCurrAcc = alpaka::getPtrNative(uCurrBufAcc);
double* pNextAcc = alpaka::getPtrNative(uNextBufAcc);
double* pCurrAcc = std::data(uCurrBufAcc);
double* pNextAcc = std::data(uNextBufAcc);

// Apply initial conditions for the test problem
for(uint32_t i = 0; i < numNodesX; i++)
Expand Down
7 changes: 3 additions & 4 deletions example/monteCarloIntegration/src/monteCarloIntegration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,11 @@ auto main() -> int

// Setup buffer.
BufHost bufHost{alpaka::allocBuf<uint32_t, Idx>(devHost, extent)};
uint32_t* const ptrBufHost{alpaka::getPtrNative(bufHost)};
BufAcc bufAcc{alpaka::allocBuf<uint32_t, Idx>(devAcc, extent)};
uint32_t* const ptrBufAcc{alpaka::getPtrNative(bufAcc)};
uint32_t* const ptrBufAcc{std::data(bufAcc)};

// Initialize the global count to 0.
ptrBufHost[0] = 0.0f;
bufHost[0] = 0.0f;
alpaka::memcpy(queue, bufAcc, bufHost);

Kernel kernel;
Expand All @@ -119,7 +118,7 @@ auto main() -> int
alpaka::wait(queue);

// Check the result.
uint32_t globalCount = *ptrBufHost;
uint32_t globalCount = bufHost[0];

// Final result.
float finalResult = globalCount / static_cast<float>(numPoints);
Expand Down
13 changes: 6 additions & 7 deletions example/parallelLoopPatterns/src/parallelLoopPatterns.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,9 @@ void testResult(TQueue& queue, TBufAcc& bufAcc)
auto const byte(static_cast<uint8_t>(0u));
alpaka::memset(queue, bufAcc, byte);
// Test that all elements were processed
auto const* result = alpaka::getPtrNative(bufHost);
bool testPassed = true;
for(uint32_t i = 0u; i < n; i++)
testPassed = testPassed && (std::abs(result[i] - process(i)) < 1e-3);
testPassed = testPassed && (std::abs(bufHost[i] - process(i)) < 1e-3);
std::cout << (testPassed ? "Test passed.\n" : "Test failed.\n");
}

Expand Down Expand Up @@ -115,7 +114,7 @@ void naiveCudaStyle(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
std::cout << "\nNaive CUDA style processing - each thread processes one data point:\n";
std::cout << " " << blocksPerGrid << " blocks, " << threadsPerBlock << " threads per block, "
<< "alpaka element layer not used\n";
alpaka::exec<TAcc>(queue, workDiv, NaiveCudaStyleKernel{}, alpaka::getPtrNative(bufAcc), n);
alpaka::exec<TAcc>(queue, workDiv, NaiveCudaStyleKernel{}, std::data(bufAcc), n);
testResult(queue, bufAcc);
}

Expand Down Expand Up @@ -178,7 +177,7 @@ void gridStridedLoop(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
std::cout << "\nGrid strided loop processing - fixed number of threads and blocks:\n";
std::cout << " " << blocksPerGrid << " blocks, " << threadsPerBlock << " threads per block, "
<< "alpaka element layer not used\n";
alpaka::exec<TAcc>(queue, workDiv, GridStridedLoopKernel{}, alpaka::getPtrNative(bufAcc), n);
alpaka::exec<TAcc>(queue, workDiv, GridStridedLoopKernel{}, std::data(bufAcc), n);
testResult(queue, bufAcc);
}

Expand Down Expand Up @@ -253,7 +252,7 @@ void chunkedGridStridedLoop(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
std::cout << "\nChunked grid strided loop processing - fixed number of threads and blocks:\n";
std::cout << " " << blocksPerGrid << " blocks, " << threadsPerBlock << " threads per block, "
<< elementsPerThread << " alpaka elements per thread\n";
alpaka::exec<TAcc>(queue, workDiv, ChunkedGridStridedLoopKernel{}, alpaka::getPtrNative(bufAcc), n);
alpaka::exec<TAcc>(queue, workDiv, ChunkedGridStridedLoopKernel{}, std::data(bufAcc), n);
testResult(queue, bufAcc);
}

Expand Down Expand Up @@ -319,7 +318,7 @@ void naiveOpenMPStyle(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
std::cout << "\nNaive OpenMP style processing - each thread processes a single consecutive range of elements:\n";
std::cout << " " << blocksPerGrid << " blocks, " << threadsPerBlock << " threads per block, "
<< "alpaka element layer not used\n";
alpaka::exec<TAcc>(queue, workDiv, NaiveOpenMPStyleKernel{}, alpaka::getPtrNative(bufAcc), n);
alpaka::exec<TAcc>(queue, workDiv, NaiveOpenMPStyleKernel{}, std::data(bufAcc), n);
testResult(queue, bufAcc);
}

Expand Down Expand Up @@ -397,7 +396,7 @@ void openMPSimdStyle(TDev& dev, TQueue& queue, TBufAcc& bufAcc)
std::cout << "\nOpenMP SIMD style processing - each thread processes a single consecutive range of elements:\n";
std::cout << " " << blocksPerGrid << " blocks, " << threadsPerBlock << " threads per block, "
<< elementsPerThread << " alpaka elements per thread\n";
alpaka::exec<TAcc>(queue, workDiv, OpenMPSimdStyleKernel{}, alpaka::getPtrNative(bufAcc), n);
alpaka::exec<TAcc>(queue, workDiv, OpenMPSimdStyleKernel{}, std::data(bufAcc), n);
testResult(queue, bufAcc);
}

Expand Down
12 changes: 6 additions & 6 deletions example/randomCells2D/src/randomCells2D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,22 +181,22 @@ auto main() -> int

// Setup buffer.
BufHost bufHostS{alpaka::allocBuf<float, Idx>(devHost, extent)};
float* const ptrBufHostS{alpaka::getPtrNative(bufHostS)};
float* const ptrBufHostS{std::data(bufHostS)};
BufAcc bufAccS{alpaka::allocBuf<float, Idx>(devAcc, extent)};
float* const ptrBufAccS{alpaka::getPtrNative(bufAccS)};
float* const ptrBufAccS{std::data(bufAccS)};

BufHost bufHostV{alpaka::allocBuf<float, Idx>(devHost, extent)};
float* const ptrBufHostV{alpaka::getPtrNative(bufHostV)};
float* const ptrBufHostV{std::data(bufHostV)};
BufAcc bufAccV{alpaka::allocBuf<float, Idx>(devAcc, extent)};
float* const ptrBufAccV{alpaka::getPtrNative(bufAccV)};
float* const ptrBufAccV{std::data(bufAccV)};

BufHostRand bufHostRandS{alpaka::allocBuf<RandomEngineSingle, Idx>(devHost, extent)};
BufAccRand bufAccRandS{alpaka::allocBuf<RandomEngineSingle, Idx>(devAcc, extent)};
RandomEngineSingle* const ptrBufAccRandS{alpaka::getPtrNative(bufAccRandS)};
RandomEngineSingle* const ptrBufAccRandS{std::data(bufAccRandS)};

BufHostRandVec bufHostRandV{alpaka::allocBuf<RandomEngineVector, Idx>(devHost, extent)};
BufAccRandVec bufAccRandV{alpaka::allocBuf<RandomEngineVector, Idx>(devAcc, extent)};
RandomEngineVector* const ptrBufAccRandV{alpaka::getPtrNative(bufAccRandV)};
RandomEngineVector* const ptrBufAccRandV{std::data(bufAccRandV)};

InitRandomKernel initRandomKernel;
auto pitchBufAccRandS = alpaka::getPitchesInBytes(bufAccRandS)[0];
Expand Down
8 changes: 4 additions & 4 deletions example/randomStrategies/src/randomStrategies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ template<Strategy TStrategy>
void runStrategy(Box& box)
{
// Set up the pointer to the PRNG states buffer
RandomEngine* const ptrBufAccRand{alpaka::getPtrNative(box.bufAccRand)};
RandomEngine* const ptrBufAccRand{std::data(box.bufAccRand)};

// Initialize the PRNG and its states on the device
InitRandomKernel<TStrategy> initRandomKernel;
Expand All @@ -264,13 +264,13 @@ void runStrategy(Box& box)
alpaka::wait(box.queue);

// OPTIONAL: copy the the initial states to host if you want to check them yourself
// alpaka_rand::Philox4x32x10<Box::Acc>* const ptrBufHostRand{alpaka::getPtrNative(box.bufHostRand)};
// alpaka_rand::Philox4x32x10<Box::Acc>* const ptrBufHostRand{std::data(box.bufHostRand)};
// alpaka::memcpy(box.queue, box.bufHostRand, box.bufAccRand);
// alpaka::wait(box.queue);

// Set up the pointers to the results buffers
float* const ptrBufHostResult{alpaka::getPtrNative(box.bufHostResult)};
float* const ptrBufAccResult{alpaka::getPtrNative(box.bufAccResult)};
float* const ptrBufHostResult{std::data(box.bufHostResult)};
float* const ptrBufAccResult{std::data(box.bufAccResult)};

// Initialise the results buffer to zero
for(Box::Idx i = 0; i < box.extentResult[0]; ++i)
Expand Down
10 changes: 5 additions & 5 deletions example/reduce/src/reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,17 @@ auto reduce(
auto const taskKernelReduceMain = alpaka::createTaskKernel<Acc>(
workDiv1,
kernel1,
alpaka::getPtrNative(sourceDeviceMemory),
alpaka::getPtrNative(destinationDeviceMemory),
std::data(sourceDeviceMemory),
std::data(destinationDeviceMemory),
n,
func);

// create last block reduction kernel execution task
auto const taskKernelReduceLastBlock = alpaka::createTaskKernel<Acc>(
workDiv2,
kernel2,
alpaka::getPtrNative(destinationDeviceMemory),
alpaka::getPtrNative(destinationDeviceMemory),
std::data(destinationDeviceMemory),
std::data(destinationDeviceMemory),
blockCount,
func);

Expand Down Expand Up @@ -131,7 +131,7 @@ auto main() -> int
// allocate memory
auto hostMemory = alpaka::allocBuf<T, Idx>(devHost, n);

T* nativeHostMemory = alpaka::getPtrNative(hostMemory);
T* nativeHostMemory = std::data(hostMemory);

// fill array with data
for(uint64_t i = 0; i < n; i++)
Expand Down
22 changes: 9 additions & 13 deletions example/vectorAdd/src/vectorAdd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class VectorAddKernel
auto main() -> int
{
// Define the index domain
// Set the number of dimensions as an integral constant. Set to 1 for 1D.
using Dim = alpaka::DimInt<1u>;
using Idx = std::size_t;

Expand Down Expand Up @@ -115,21 +116,16 @@ auto main() -> int
BufHost bufHostB(alpaka::allocBuf<Data, Idx>(devHost, extent));
BufHost bufHostC(alpaka::allocBuf<Data, Idx>(devHost, extent));

// Initialize the host input vectors A and B
Data* const pBufHostA(alpaka::getPtrNative(bufHostA));
Data* const pBufHostB(alpaka::getPtrNative(bufHostB));
Data* const pBufHostC(alpaka::getPtrNative(bufHostC));

// C++14 random generator for uniformly distributed numbers in {1,..,42}
std::random_device rd{};
std::default_random_engine eng{rd()};
std::uniform_int_distribution<Data> dist(1, 42);

for(Idx i(0); i < numElements; ++i)
{
pBufHostA[i] = dist(eng);
pBufHostB[i] = dist(eng);
pBufHostC[i] = 0;
bufHostA[i] = dist(eng);
bufHostB[i] = dist(eng);
bufHostC[i] = 0;
}

// Allocate 3 buffers on the accelerator
Expand All @@ -150,9 +146,9 @@ auto main() -> int
auto const taskKernel = alpaka::createTaskKernel<Acc>(
workDiv,
kernel,
alpaka::getPtrNative(bufAccA),
alpaka::getPtrNative(bufAccB),
alpaka::getPtrNative(bufAccC),
std::data(bufAccA),
std::data(bufAccB),
std::data(bufAccC),
numElements);

// Enqueue the kernel execution task
Expand All @@ -179,8 +175,8 @@ auto main() -> int
static constexpr int MAX_PRINT_FALSE_RESULTS = 20;
for(Idx i(0u); i < numElements; ++i)
{
Data const& val(pBufHostC[i]);
Data const correctResult(pBufHostA[i] + pBufHostB[i]);
Data const& val(bufHostC[i]);
Data const correctResult(bufHostA[i] + bufHostB[i]);
if(val != correctResult)
{
if(falseResults < MAX_PRINT_FALSE_RESULTS)
Expand Down
Loading

0 comments on commit 887aee9

Please sign in to comment.