@@ -84,6 +84,13 @@ int cs_glob_cuda_n_mp = -1;
84
84
static int _cs_glob_cuda_n_streams = -1 ;
85
85
static cudaStream_t *_cs_glob_cuda_streams = nullptr ;
86
86
87
/* Reduce buffers associated with streams in pool.
   One entry per stream in the pool; buffers are allocated and grown
   lazily on demand (see cs_cuda_get_2_stage_reduce_buffers). */

static unsigned *_r_elt_size = nullptr;   /* per-stream current element size */
static unsigned *_r_grid_size = nullptr;  /* per-stream current first-stage
                                             buffer size, in bytes */
static void **_r_reduce = nullptr;        /* per-stream final-result buffers
                                             (host/device shared allocation) */
static void **_r_grid = nullptr;          /* per-stream first-stage buffers
                                             (device-only allocation) */
93
+
87
94
static cudaStream_t _cs_glob_stream_pf = 0 ;
88
95
89
96
/* Allow graphs for kernel launches ? May interfere with profiling (nsys),
@@ -107,10 +114,19 @@ finalize_streams_(void)
107
114
cs_mem_cuda_set_prefetch_stream (0 );
108
115
cudaStreamDestroy (_cs_glob_stream_pf);
109
116
110
- for (int i = 0 ; i < _cs_glob_cuda_n_streams; i++)
117
+ for (int i = 0 ; i < _cs_glob_cuda_n_streams; i++) {
111
118
cudaStreamDestroy (_cs_glob_cuda_streams[i]);
119
+ CS_FREE (_r_reduce[i]);
120
+ CS_FREE (_r_grid[i]);
121
+ }
112
122
113
123
CS_FREE (_cs_glob_cuda_streams);
124
+
125
+ CS_FREE (_r_elt_size);
126
+ CS_FREE (_r_grid_size);
127
+ CS_FREE (_r_reduce);
128
+ CS_FREE (_r_grid);
129
+
114
130
_cs_glob_cuda_n_streams = 0 ;
115
131
}
116
132
@@ -323,8 +339,18 @@ cs_cuda_get_stream(int stream_id)
323
339
}
324
340
325
341
CS_REALLOC (_cs_glob_cuda_streams, stream_id+1 , cudaStream_t);
326
- for (int i = _cs_glob_cuda_n_streams; i < stream_id+1 ; i++)
342
+ CS_REALLOC (_r_elt_size, stream_id+1 , unsigned );
343
+ CS_REALLOC (_r_grid_size, stream_id+1 , unsigned );
344
+ CS_REALLOC (_r_reduce, stream_id+1 , void *);
345
+ CS_REALLOC (_r_grid, stream_id+1 , void *);
346
+
347
+ for (int i = _cs_glob_cuda_n_streams; i < stream_id+1 ; i++) {
327
348
cudaStreamCreate (&_cs_glob_cuda_streams[i]);
349
+ _r_elt_size[i] = 0 ;
350
+ _r_grid_size[i] = 0 ;
351
+ _r_reduce[i] = nullptr ;
352
+ _r_grid[i] = nullptr ;
353
+ }
328
354
329
355
_cs_glob_cuda_n_streams = stream_id+1 ;
330
356
@@ -347,6 +373,77 @@ cs_cuda_get_stream_prefetch(void)
347
373
return _cs_glob_stream_pf;
348
374
}
349
375
376
/*----------------------------------------------------------------------------*/
/*!
 * \brief Return stream id in stream pool matching a given CUDA stream.
 *
 * If the stream is not present in the stream pool, return -1.
 *
 * \param[in]  stream  handle to given stream
 *
 * \returns id of stream in pool, or -1.
 */
/*----------------------------------------------------------------------------*/

int
cs_cuda_get_stream_id(cudaStream_t  stream)
{
  int retval = -1;

  /* Linear scan of the (small) pool; stop at first match. */
  for (int i = 0; i < _cs_glob_cuda_n_streams && retval < 0; i++) {
    if (_cs_glob_cuda_streams[i] == stream)
      retval = i;
  }

  return retval;
}
397
+
398
/*----------------------------------------------------------------------------*/
/*!
 * \brief Return pointers to reduction buffers needed for 2-stage reductions.
 *
 * These buffers are used internally by CUDA 2-stage operations, and are
 * allocated and resized upon demand. They are owned by the stream pool,
 * so the caller must not free them.
 *
 * \param[in]   stream_id  stream id in pool; must satisfy
 *                         0 <= stream_id < number of pool streams
 * \param[in]   n_elts     number of elements to reduce (currently unused
 *                         here; kept for interface stability)
 * \param[in]   elt_size   size of element or structure simultaneously reduced
 * \param[in]   grid_size  associated grid size
 * \param[out]  r_grid     first stage reduce buffer (device-only)
 * \param[out]  r_reduce   second stage (final result) reduce buffer
 *                         (host/device shared)
 */
/*----------------------------------------------------------------------------*/

void
cs_cuda_get_2_stage_reduce_buffers(int            stream_id,
                                   cs_lnum_t      n_elts,
                                   size_t         elt_size,
                                   unsigned int   grid_size,
                                   void*         &r_grid,
                                   void*         &r_reduce)
{
  assert(stream_id > -1 && stream_id < _cs_glob_cuda_n_streams);

  /* Compute the first-stage buffer size in size_t: computing it in
     "unsigned int" could truncate the grid_size * elt_size product
     for large reductions. */
  size_t t_grid_size = (size_t)grid_size * elt_size;

  /* Grow the single-element (final result) buffer if needed;
     host/device shared so the host can read the final result. */
  if (_r_elt_size[stream_id] < elt_size) {
    _r_elt_size[stream_id] = elt_size;
    CS_FREE_HD(_r_reduce[stream_id]);
    unsigned char *b_ptr;
    CS_MALLOC_HD(b_ptr, elt_size, unsigned char, CS_ALLOC_HOST_DEVICE_SHARED);
    _r_reduce[stream_id] = b_ptr;
  }

  /* Grow the per-block (first stage) buffer if needed. Use CS_FREE_HD to
     match the CS_MALLOC_HD allocation below (the previous plain CS_FREE
     was inconsistent with the CS_FREE_HD used for _r_reduce above).
     Allocate the full (untruncated) size even though the cached size
     field is unsigned. */
  if (_r_grid_size[stream_id] < t_grid_size) {
    _r_grid_size[stream_id] = t_grid_size;
    CS_FREE_HD(_r_grid[stream_id]);
    unsigned char *b_ptr;
    CS_MALLOC_HD(b_ptr, t_grid_size, unsigned char, CS_ALLOC_DEVICE);
    _r_grid[stream_id] = b_ptr;
  }

  r_grid = _r_grid[stream_id];
  r_reduce = _r_reduce[stream_id];
}
446
+
350
447
#endif /* defined(__CUDACC__) */
351
448
352
449
/* ----------------------------------------------------------------------------*/
0 commit comments