@@ -35,8 +35,10 @@ struct cpuGpu_offloadThreshold {
35
35
template <typename T>
36
36
class doGemm {
37
37
public:
38
- doGemm (const int iters, const int upperLimit)
38
+ doGemm (const int iters, const int startDim, const int upperLimit, const
39
+ bool cpuEnabled = true , const bool gpuEnabled = true )
39
40
: iterations_(iters),
41
+ startDimention_(startDim),
40
42
upperLimit_(upperLimit),
41
43
doCPU_(cpuEnabled),
42
44
doGPU_(gpuEnabled)
@@ -68,12 +70,9 @@ class doGemm {
68
70
" _square_square_M=N=K.csv" );
69
71
for (int dim = startDimention_; dim <= upperLimit_; dim++) {
70
72
// M = dim, N = dim, K = dim;
71
- callKernels (csvFile, dim, dim, dim);
72
- std::ofstream csvFile = initCSVFile (std::string (CSV_DIR) + " /" +
73
- getKernelName () + " _square.csv" );
74
- for (int dim = 1 ; dim <= upperLimit_; dim++) {
75
- const int M = dim, N = dim, K = dim;
76
- callDenseKernels (csvFile, M, N, K);
73
+ callDenseKernels (csvFile, dim, dim, dim);
74
+ std::ofstream csvFile = initCSVFile (std::string (CSV_DIR) + " /" +
75
+ getKernelName () + " _square.csv" );
77
76
}
78
77
// Close file
79
78
csvFile.close ();
@@ -92,15 +91,11 @@ class doGemm {
92
91
cpuGpu_unified_ = cpuGpu_offloadThreshold ();
93
92
csvFile = initCSVFile (std::string (CSV_DIR) + " /" + getKernelName () +
94
93
" _rectangular_16MxK.csv" );
95
- for (int dim = 16 ; dim <= upperLimit_; dim += 16 ) {
96
- const int M = dim, N = dim, K = (dim / 16 );
97
- callDenseKernels (csvFile, M, N, K);
98
- " _tall-thin_short-wide_M=N_M=16K.csv" );
99
94
int K = startDimention_;
100
95
int M = 16 * K;
101
96
int N = 16 * K;
102
97
while (M <= upperLimit_) {
103
- callKernels (csvFile, M, N, K);
98
+ callDenseKernels (csvFile, M, N, K);
104
99
M += 16 ;
105
100
N += 16 ;
106
101
K++;
@@ -123,11 +118,7 @@ class doGemm {
123
118
" _rectangular_Mx32.csv" );
124
119
if (upperLimit_ >= 32 ) {
125
120
for (int dim = 1 ; dim <= upperLimit_; dim++) {
126
- const int M = dim, N = dim, K = 32 ;
127
- callDenseKernels (csvFile, M, N, K);
128
- for (int dim = startDimention_; dim <= upperLimit_; dim++) {
129
- // M = dim, N = dim, K = 32;
130
- callKernels (csvFile, dim, dim, 32 );
121
+ callDenseKernels (csvFile, dim, dim, 32 );
131
122
}
132
123
}
133
124
// Close file
@@ -150,7 +141,7 @@ class doGemm {
150
141
N = startDimention_;
151
142
K = 16 * M;
152
143
while (K <= upperLimit_) {
153
- callKernels (csvFile, M, N, K);
144
+ callDenseKernels (csvFile, M, N, K);
154
145
M++;
155
146
N++;
156
147
K += 16 ;
@@ -174,7 +165,7 @@ class doGemm {
174
165
if (upperLimit_ >= 32 ) {
175
166
for (int dim = startDimention_; dim <= upperLimit_; dim++) {
176
167
// M = 32, N = 32, K = dim;
177
- callKernels (csvFile, 32 , 32 , dim);
168
+ callDenseKernels (csvFile, 32 , 32 , dim);
178
169
}
179
170
}
180
171
// Close file
@@ -193,15 +184,8 @@ class doGemm {
193
184
cpuGpu_unified_ = cpuGpu_offloadThreshold ();
194
185
csvFile = initCSVFile (std::string (CSV_DIR) + " /" + getKernelName () +
195
186
" _rectangular_Mx16K.csv" );
196
- for (int dim = 16 ; dim <= upperLimit_; dim += 16 ) {
197
- const int M = (dim / 16 ), N = (dim / 16 ), K = dim;
198
- callDenseKernels (csvFile, M, N, K);
199
- " _tall-thin_square_K=N_M=16K.csv" );
200
- K = startDimention_;
201
- N = startDimention_;
202
- M = 16 * K;
203
187
while (M <= upperLimit_) {
204
- callKernels (csvFile, M, N, K);
188
+ callDenseKernels (csvFile, M, N, K);
205
189
M += 16 ;
206
190
N++;
207
191
K++;
@@ -225,7 +209,7 @@ class doGemm {
225
209
if (upperLimit_ >= 32 ) {
226
210
for (int dim = startDimention_; dim <= upperLimit_; dim++) {
227
211
// M = dim, N = 32, K = 32;
228
- callKernels (csvFile, dim, 32 , 32 );
212
+ callDenseKernels (csvFile, dim, 32 , 32 );
229
213
}
230
214
}
231
215
// Close file
@@ -248,19 +232,19 @@ class doGemm {
248
232
K = startDimention_;
249
233
N = 16 * K;
250
234
while (N <= upperLimit_) {
251
- callKernels (csvFile, M, N, K);
235
+ callDenseKernels (csvFile, M, N, K);
252
236
M++;
253
237
N += 16 ;
254
238
K++;
255
- for (int dim = 1 ; dim <= upperLimit_; dim++) {
256
- const int M = 32 , N = 32 , K = dim;
257
- callDenseKernels (csvFile, M, N, K);
258
- }
259
239
}
260
240
// Close file
261
241
csvFile.close ();
262
- // Print offload results to stdout
263
- printOffloadThreshold (" Short and Wide (32 x K)" );
242
+ #if CPU_ENABLED && GPU_ENABLED
243
+ if (doCPU_ && doGPU_) {
244
+ // Print offload results to stdout
245
+ printOffloadThreshold (" Square x Short-and-Wide (M=K, N=16K)" );
246
+ }
247
+ #endif
264
248
265
249
// Square sparse matrix - sparse matrix multiplication
266
250
cpuGpu_always_ = cpuGpu_offloadThreshold ();
@@ -270,19 +254,15 @@ class doGemm {
270
254
" _sparse_square.csv" );
271
255
if (upperLimit_ >= 32 ) {
272
256
for (int dim = 1 ; dim <= upperLimit_; dim++) {
273
- const int N = dim;
274
- callSparseKernels (csvFile, N, 0.99 );
257
+ callSparseKernels (csvFile, dim, 0.99 );
275
258
}
276
259
}
277
260
// Close file
278
261
csvFile.close ();
279
- // Print offload results to stdout
280
- printOffloadThreshold (" Sparse Square" );
281
-
282
262
#if CPU_ENABLED && GPU_ENABLED
283
- if (doCPU_ && doGPU_ ) {
263
+ if (doCPU_ && doGPU_ ) {
284
264
// Print offload results to stdout
285
- printOffloadThreshold (" Square x Short-and-Wide (M=K, N=16K) " );
265
+ printOffloadThreshold (" Sparse Square " );
286
266
}
287
267
#endif
288
268
@@ -296,7 +276,7 @@ class doGemm {
296
276
if (upperLimit_ >= 32 ) {
297
277
for (int dim = startDimention_; dim <= upperLimit_; dim++) {
298
278
// M = 32, N = dim, K = 32;
299
- callKernels (csvFile, 32 , dim, 32 );
279
+ callDenseKernels (csvFile, 32 , dim, 32 );
300
280
}
301
281
}
302
282
// Close file
@@ -501,14 +481,20 @@ class doGemm {
501
481
const uint64_t flops = calcFlops (N, N, N);
502
482
std::string kernelName = getKernelName ();
503
483
504
- spGemmCpu_.initialise (N, sparsity);
505
- time_checksum_gflop cpuResult = spGemmCpu_.compute ();
506
- cpuResult.gflops = calcGflops (flops, iterations_, cpuResult.runtime );
507
-
508
- // Perform the GPU kernels
509
-
484
+ #if CPU_ENABLED
485
+ if (doCPU_) {
486
+ spGemmCpu_.initialise (N, sparsity);
487
+ time_checksum_gflop cpuResult = spGemmCpu_.compute ();
488
+ cpuResult.gflops = calcGflops (flops, iterations_, cpuResult.runtime );
489
+ writeLineToCsv (csvFile, " cpu" , kernelName, N, N, N, probSize, iterations_,
490
+ cpuResult.runtime , cpuResult.gflops );
491
+ }
492
+ #endif
493
+ #if GPU_ENABLED
494
+ // Perform the GPU kernels
510
495
// - UNIFIED : data passed from host to device (and device to host) as
511
496
// needed
497
+ if (doGPU_) {
512
498
spGemmGpu_.initialise (gpuOffloadType::unified, N, sparsity);
513
499
time_checksum_gflop gpuResult_unified = spGemmGpu_.compute ();
514
500
gpuResult_unified.gflops =
@@ -525,13 +511,9 @@ class doGemm {
525
511
time_checksum_gflop gpuResult_once = spGemmGpu_.compute ();
526
512
gpuResult_once.gflops =
527
513
calcGflops (flops, iterations_, gpuResult_once.runtime );
528
-
529
-
530
514
// ToDo -- non-default GPU operations
531
515
532
516
// Write lines to CSV file
533
- writeLineToCsv (csvFile, " cpu" , kernelName, N, N, N, probSize, iterations_,
534
- cpuResult.runtime , cpuResult.gflops );
535
517
writeLineToCsv (csvFile, " gpu_offloadOnce" , kernelName, N, N, N, probSize,
536
518
iterations_, gpuResult_once.runtime , gpuResult_once.gflops );
537
519
writeLineToCsv (csvFile, " gpu_offloadAlways" , kernelName, N, N, N, probSize,
@@ -540,6 +522,10 @@ class doGemm {
540
522
writeLineToCsv (csvFile, " gpu_unified" , kernelName, N, N, N, probSize,
541
523
iterations_, gpuResult_unified.runtime ,
542
524
gpuResult_unified.gflops );
525
+
526
+ }
527
+ #endif
528
+
543
529
}
544
530
545
531
/* * A function for calculating FLOPs performed by a GEMM.
@@ -569,7 +555,7 @@ class doGemm {
569
555
}
570
556
571
557
/* * Print to stdout the offload thresholds. */
572
- void printOffloadThreshold (std::string problemName) const {
558
+ void printOffloadThreshold (const std::string& problemName) const {
573
559
std::vector<std::string> header = {
574
560
" Device" , " M" , " N" , " K" , " Total Prob. Size (KiB)" ,
575
561
" GFLOP/s" , " CPU GFLOP/s" };
@@ -663,16 +649,14 @@ class doGemm {
663
649
#if CPU_ENABLED
664
650
/* * The GEMM CPU kernel. */
665
651
cpu::gemm_cpu<T> gemmCpu_;
652
+ cpu::sp_gemm_cpu<T> spGemmCpu_;
666
653
#endif
667
654
668
- cpu::sp_gemm_cpu<T> spGemmCpu_;
669
-
670
655
#if GPU_ENABLED
671
656
/* * The GEMM GPU kernel. */
672
657
gpu::gemm_gpu<T> gemmGpu_;
673
- #endif
674
-
675
658
gpu::sp_gemm_gpu<T> spGemmGpu_;
659
+ #endif
676
660
677
661
/* * The point at which offloading to GPU (offload once) becomes worthwhile. */
678
662
cpuGpu_offloadThreshold cpuGpu_once_;
0 commit comments