Skip to content

Commit 74fe223

Browse files
committed
Fixing after rebase
1 parent 99e97eb commit 74fe223

File tree

4 files changed

+124
-60
lines changed

4 files changed

+124
-60
lines changed

include/doGemm.hh

+42-58
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,10 @@ struct cpuGpu_offloadThreshold {
3535
template <typename T>
3636
class doGemm {
3737
public:
38-
doGemm(const int iters, const int upperLimit)
38+
doGemm(const int iters, const int startDim, const int upperLimit, const
39+
bool cpuEnabled = true, const bool gpuEnabled = true)
3940
: iterations_(iters),
41+
startDimention_(startDim),
4042
upperLimit_(upperLimit),
4143
doCPU_(cpuEnabled),
4244
doGPU_(gpuEnabled)
@@ -68,12 +70,9 @@ class doGemm {
6870
"_square_square_M=N=K.csv");
6971
for (int dim = startDimention_; dim <= upperLimit_; dim++) {
7072
// M = dim, N = dim, K = dim;
71-
callKernels(csvFile, dim, dim, dim);
72-
std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
73-
getKernelName() + "_square.csv");
74-
for (int dim = 1; dim <= upperLimit_; dim++) {
75-
const int M = dim, N = dim, K = dim;
76-
callDenseKernels(csvFile, M, N, K);
73+
callDenseKernels(csvFile, dim, dim, dim);
74+
std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
75+
getKernelName() + "_square.csv");
7776
}
7877
// Close file
7978
csvFile.close();
@@ -92,15 +91,11 @@ class doGemm {
9291
cpuGpu_unified_ = cpuGpu_offloadThreshold();
9392
csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() +
9493
"_rectangular_16MxK.csv");
95-
for (int dim = 16; dim <= upperLimit_; dim += 16) {
96-
const int M = dim, N = dim, K = (dim / 16);
97-
callDenseKernels(csvFile, M, N, K);
98-
"_tall-thin_short-wide_M=N_M=16K.csv");
9994
int K = startDimention_;
10095
int M = 16 * K;
10196
int N = 16 * K;
10297
while (M <= upperLimit_) {
103-
callKernels(csvFile, M, N, K);
98+
callDenseKernels(csvFile, M, N, K);
10499
M += 16;
105100
N += 16;
106101
K++;
@@ -123,11 +118,7 @@ class doGemm {
123118
"_rectangular_Mx32.csv");
124119
if (upperLimit_ >= 32) {
125120
for (int dim = 1; dim <= upperLimit_; dim++) {
126-
const int M = dim, N = dim, K = 32;
127-
callDenseKernels(csvFile, M, N, K);
128-
for (int dim = startDimention_; dim <= upperLimit_; dim++) {
129-
// M = dim, N = dim, K = 32;
130-
callKernels(csvFile, dim, dim, 32);
121+
callDenseKernels(csvFile, dim, dim, 32);
131122
}
132123
}
133124
// Close file
@@ -150,7 +141,7 @@ class doGemm {
150141
N = startDimention_;
151142
K = 16 * M;
152143
while (K <= upperLimit_) {
153-
callKernels(csvFile, M, N, K);
144+
callDenseKernels(csvFile, M, N, K);
154145
M++;
155146
N++;
156147
K += 16;
@@ -174,7 +165,7 @@ class doGemm {
174165
if (upperLimit_ >= 32) {
175166
for (int dim = startDimention_; dim <= upperLimit_; dim++) {
176167
// M = 32, N = 32, K = dim;
177-
callKernels(csvFile, 32, 32, dim);
168+
callDenseKernels(csvFile, 32, 32, dim);
178169
}
179170
}
180171
// Close file
@@ -193,15 +184,8 @@ class doGemm {
193184
cpuGpu_unified_ = cpuGpu_offloadThreshold();
194185
csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() +
195186
"_rectangular_Mx16K.csv");
196-
for (int dim = 16; dim <= upperLimit_; dim += 16) {
197-
const int M = (dim / 16), N = (dim / 16), K = dim;
198-
callDenseKernels(csvFile, M, N, K);
199-
"_tall-thin_square_K=N_M=16K.csv");
200-
K = startDimention_;
201-
N = startDimention_;
202-
M = 16 * K;
203187
while (M <= upperLimit_) {
204-
callKernels(csvFile, M, N, K);
188+
callDenseKernels(csvFile, M, N, K);
205189
M += 16;
206190
N++;
207191
K++;
@@ -225,7 +209,7 @@ class doGemm {
225209
if (upperLimit_ >= 32) {
226210
for (int dim = startDimention_; dim <= upperLimit_; dim++) {
227211
// M = dim, N = 32, K = 32;
228-
callKernels(csvFile, dim, 32, 32);
212+
callDenseKernels(csvFile, dim, 32, 32);
229213
}
230214
}
231215
// Close file
@@ -248,19 +232,19 @@ class doGemm {
248232
K = startDimention_;
249233
N = 16 * K;
250234
while (N <= upperLimit_) {
251-
callKernels(csvFile, M, N, K);
235+
callDenseKernels(csvFile, M, N, K);
252236
M++;
253237
N += 16;
254238
K++;
255-
for (int dim = 1; dim <= upperLimit_; dim++) {
256-
const int M = 32, N = 32, K = dim;
257-
callDenseKernels(csvFile, M, N, K);
258-
}
259239
}
260240
// Close file
261241
csvFile.close();
262-
// Print offload results to stdout
263-
printOffloadThreshold("Short and Wide (32 x K)");
242+
#if CPU_ENABLED && GPU_ENABLED
243+
if (doCPU_ && doGPU_) {
244+
// Print offload results to stdout
245+
printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)");
246+
}
247+
#endif
264248

265249
// Square sparse matrix - sparse matrix multiplication
266250
cpuGpu_always_ = cpuGpu_offloadThreshold();
@@ -270,19 +254,15 @@ class doGemm {
270254
"_sparse_square.csv");
271255
if (upperLimit_ >= 32) {
272256
for (int dim = 1; dim <= upperLimit_; dim++) {
273-
const int N = dim;
274-
callSparseKernels(csvFile, N, 0.99);
257+
callSparseKernels(csvFile, dim, 0.99);
275258
}
276259
}
277260
// Close file
278261
csvFile.close();
279-
// Print offload results to stdout
280-
printOffloadThreshold("Sparse Square");
281-
282262
#if CPU_ENABLED && GPU_ENABLED
283-
if (doCPU_ && doGPU_) {
263+
if (doCPU_ && doGPU_) {
284264
// Print offload results to stdout
285-
printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)");
265+
printOffloadThreshold("Sparse Square");
286266
}
287267
#endif
288268

@@ -296,7 +276,7 @@ class doGemm {
296276
if (upperLimit_ >= 32) {
297277
for (int dim = startDimention_; dim <= upperLimit_; dim++) {
298278
// M = 32, N = dim, K = 32;
299-
callKernels(csvFile, 32, dim, 32);
279+
callDenseKernels(csvFile, 32, dim, 32);
300280
}
301281
}
302282
// Close file
@@ -501,14 +481,20 @@ class doGemm {
501481
const uint64_t flops = calcFlops(N, N, N);
502482
std::string kernelName = getKernelName();
503483

504-
spGemmCpu_.initialise(N, sparsity);
505-
time_checksum_gflop cpuResult = spGemmCpu_.compute();
506-
cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
507-
508-
// Perform the GPU kernels
509-
484+
#if CPU_ENABLED
485+
if (doCPU_) {
486+
spGemmCpu_.initialise(N, sparsity);
487+
time_checksum_gflop cpuResult = spGemmCpu_.compute();
488+
cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
489+
writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
490+
cpuResult.runtime, cpuResult.gflops);
491+
}
492+
#endif
493+
#if GPU_ENABLED
494+
// Perform the GPU kernels
510495
// - UNIFIED : data passed from host to device (and device to host) as
511496
// needed
497+
if (doGPU_) {
512498
spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
513499
time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
514500
gpuResult_unified.gflops =
@@ -525,13 +511,9 @@ class doGemm {
525511
time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
526512
gpuResult_once.gflops =
527513
calcGflops(flops, iterations_, gpuResult_once.runtime);
528-
529-
530514
// ToDo -- non-default GPU operations
531515

532516
// Write lines to CSV file
533-
writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
534-
cpuResult.runtime, cpuResult.gflops);
535517
writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
536518
iterations_, gpuResult_once.runtime, gpuResult_once.gflops);
537519
writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
@@ -540,6 +522,10 @@ class doGemm {
540522
writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
541523
iterations_, gpuResult_unified.runtime,
542524
gpuResult_unified.gflops);
525+
526+
}
527+
#endif
528+
543529
}
544530

545531
/** A function for calculating FLOPs performed by a GEMM.
@@ -569,7 +555,7 @@ class doGemm {
569555
}
570556

571557
/** Print to stdout the offload thresholds. */
572-
void printOffloadThreshold(std::string problemName) const {
558+
void printOffloadThreshold(const std::string& problemName) const {
573559
std::vector<std::string> header = {
574560
"Device", "M", "N", "K", "Total Prob. Size (KiB)",
575561
"GFLOP/s", "CPU GFLOP/s"};
@@ -663,16 +649,14 @@ class doGemm {
663649
#if CPU_ENABLED
664650
/** The GEMM CPU kernel. */
665651
cpu::gemm_cpu<T> gemmCpu_;
652+
cpu::sp_gemm_cpu<T> spGemmCpu_;
666653
#endif
667654

668-
cpu::sp_gemm_cpu<T> spGemmCpu_;
669-
670655
#if GPU_ENABLED
671656
/** The GEMM GPU kernel. */
672657
gpu::gemm_gpu<T> gemmGpu_;
673-
#endif
674-
675658
gpu::sp_gemm_gpu<T> spGemmGpu_;
659+
#endif
676660

677661
/** The point at which offloading to GPU (offload once) becomes worthwhile. */
678662
cpuGpu_offloadThreshold cpuGpu_once_;

include/main.hh

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ void printBenchmarkConfig(const int iters, const int upperLimit);
1414
int parseInt(const char* str);
1515

1616
/** A function which parses the runtime arguments. */
17-
void getParameters(int argc, char* argv[]);
17+
void getParameters(int argc, char** argv);

oneMKL/CPU/sp_gemm.hh

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
#pragma once

#ifdef CPU_ONEMKL
#include <mkl.h>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <type_traits>

#include "../../include/kernels/CPU/sp_gemm.hh"
#include "../../include/utilities.hh"

namespace cpu {

/** An oneMKL CPU implementation of the sparse GEMM benchmark kernel.
 *  NOTE(review): despite the sp_ (sparse) name, callGemm() currently
 *  dispatches to the dense cblas_{s,d}gemm routines — confirm whether a
 *  sparse (mkl_sparse_*) code path is intended here. */
template <typename T>
class sp_gemm_cpu : public sp_gemm<T> {
 public:
  using sp_gemm<T>::sp_gemm;
  using sp_gemm<T>::initInputMatrices;
  using sp_gemm<T>::callConsume;
  using sp_gemm<T>::m_;
  using sp_gemm<T>::n_;
  using sp_gemm<T>::k_;
  using sp_gemm<T>::A_;
  using sp_gemm<T>::B_;
  using sp_gemm<T>::C_;

  /** Initialise the required data structures.
   *  Allocates A (m x k), B (k x n) and C (m x n) with 64-byte alignment
   *  (MKL's recommended alignment), then fills the input matrices.
   *  @param m Rows of A and C.
   *  @param n Columns of B and C.
   *  @param k Columns of A / rows of B. */
  void initialise(int m, int n, int k) {
    m_ = m;
    n_ = n;
    k_ = k;

    // mkl_malloc returns void*, so an explicit cast is required.
    A_ = static_cast<T*>(mkl_malloc(sizeof(T) * m_ * k_, 64));
    B_ = static_cast<T*>(mkl_malloc(sizeof(T) * k_ * n_, 64));
    C_ = static_cast<T*>(mkl_malloc(sizeof(T) * m_ * n_, 64));

    // Initialise the matrices
    initInputMatrices();
  }

 private:
  /** Make call to the GEMM kernel.
   *  Column-major, no-transpose C = ALPHA*A*B + BETA*C; leading dimensions
   *  are clamped to at least 1 to satisfy the BLAS lda/ldb/ldc >= 1
   *  requirement when a dimension is 0. */
  void callGemm() override {
    if constexpr (std::is_same_v<T, float>) {
      cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_,
                  (float)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_),
                  (float)BETA, C_, std::max(1, m_));
    } else if constexpr (std::is_same_v<T, double>) {
      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m_, n_, k_,
                  (double)ALPHA, A_, std::max(1, m_), B_, std::max(1, k_),
                  (double)BETA, C_, std::max(1, m_));
    } else {
      // Un-specialised type will not do any work - print error and exit.
      std::cout << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported."
                << std::endl;
      exit(1);
    }
    // Ensure compiler doesn't optimise away the work being done
    callConsume();
  }

  /** Perform any required steps before calling the GEMM kernel that should
   * be timed. */
  void preLoopRequirements() override {}

  /** Perform any required steps after calling the GEMM kernel that should
   * be timed. */
  void postLoopRequirements() override {}

  /** Do any necessary cleanup (free pointers, close library handles, etc.)
   * after Kernel has been called. */
  void postCallKernelCleanup() override {
    mkl_free_buffers();
    mkl_free(A_);
    mkl_free(B_);
    mkl_free(C_);
  }
};
}  // namespace cpu
#endif

src/main.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "../include/main.hh"
22

33
int iters = 10;
4+
int startDim = 1;
45
int upperLimit = 128;
56
bool sgemm = true;
67
bool dgemm = true;
@@ -79,7 +80,7 @@ int parseInt(const char* str) {
7980
return strlen(next) ? -1 : value;
8081
}
8182

82-
void getParameters(int argc, char* argv[]) {
83+
void getParameters(int argc, char** argv) {
8384
for (int i = 1; i < argc; i++) {
8485
if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i")) {
8586
if (++i >= argc || (iters = parseInt(argv[i])) < 0) {

0 commit comments

Comments
 (0)