Skip to content

Commit 5b8275e

Browse files
committed
Dispatch: implement general reduction for CUDA.
1 parent 34cbe82 commit 5b8275e

7 files changed

+863
-120
lines changed

Diff for: src/base/Makefile.am

+2-1
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,8 @@ csbase_include_HEADERS += \
208208
cs_base_cuda.h \
209209
cs_math_cuda.cuh \
210210
cs_array_cuda.h \
211-
cs_cuda_contrib.h
211+
cs_cuda_contrib.h \
212+
cs_cuda_reduce.h
212213

213214
noinst_HEADERS += \
214215
cs_halo_cuda.h

Diff for: src/base/cs_base_cuda.cu

+99-2
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,13 @@ int cs_glob_cuda_n_mp = -1;
8484
static int _cs_glob_cuda_n_streams = -1;
8585
static cudaStream_t *_cs_glob_cuda_streams = nullptr;
8686

87+
/* Reduce buffers associated with streams in pool */
88+
89+
static unsigned *_r_elt_size = nullptr;
90+
static unsigned *_r_grid_size = nullptr;
91+
static void **_r_reduce = nullptr;
92+
static void **_r_grid = nullptr;
93+
8794
static cudaStream_t _cs_glob_stream_pf = 0;
8895

8996
/* Allow graphs for kernel launches ? May interfere with profiling (nsys),
@@ -107,10 +114,19 @@ finalize_streams_(void)
107114
cs_mem_cuda_set_prefetch_stream(0);
108115
cudaStreamDestroy(_cs_glob_stream_pf);
109116

110-
for (int i = 0; i < _cs_glob_cuda_n_streams; i++)
117+
for (int i = 0; i < _cs_glob_cuda_n_streams; i++) {
111118
cudaStreamDestroy(_cs_glob_cuda_streams[i]);
119+
CS_FREE(_r_reduce[i]);
120+
CS_FREE(_r_grid[i]);
121+
}
112122

113123
CS_FREE(_cs_glob_cuda_streams);
124+
125+
CS_FREE(_r_elt_size);
126+
CS_FREE(_r_grid_size);
127+
CS_FREE(_r_reduce);
128+
CS_FREE(_r_grid);
129+
114130
_cs_glob_cuda_n_streams = 0;
115131
}
116132

@@ -323,8 +339,18 @@ cs_cuda_get_stream(int stream_id)
323339
}
324340

325341
CS_REALLOC(_cs_glob_cuda_streams, stream_id+1, cudaStream_t);
326-
for (int i = _cs_glob_cuda_n_streams; i < stream_id+1; i++)
342+
CS_REALLOC(_r_elt_size, stream_id+1, unsigned);
343+
CS_REALLOC(_r_grid_size, stream_id+1, unsigned);
344+
CS_REALLOC(_r_reduce, stream_id+1, void *);
345+
CS_REALLOC(_r_grid, stream_id+1, void *);
346+
347+
for (int i = _cs_glob_cuda_n_streams; i < stream_id+1; i++) {
327348
cudaStreamCreate(&_cs_glob_cuda_streams[i]);
349+
_r_elt_size[i] = 0;
350+
_r_grid_size[i] = 0;
351+
_r_reduce[i] = nullptr;
352+
_r_grid[i] = nullptr;
353+
}
328354

329355
_cs_glob_cuda_n_streams = stream_id+1;
330356

@@ -347,6 +373,77 @@ cs_cuda_get_stream_prefetch(void)
347373
return _cs_glob_stream_pf;
348374
}
349375

376+
/*----------------------------------------------------------------------------*/
377+
/*!
378+
* \brief Return stream id in stream pool matching a given CUDA stream.
379+
*
380+
* If the stream is not presnet in the stream pool, return -1.
381+
*
382+
* \param [in] handle to given streams
383+
*
384+
* \returns if of stream in pool, or -1.
385+
*/
386+
/*----------------------------------------------------------------------------*/
387+
388+
int
389+
cs_cuda_get_stream_id(cudaStream_t stream)
390+
{
391+
for (int i = 0; i < _cs_glob_cuda_n_streams; i++)
392+
if (stream == _cs_glob_cuda_streams[i])
393+
return i;
394+
395+
return -1;
396+
}
397+
398+
/*----------------------------------------------------------------------------*/
399+
/*!
400+
* \brief Return pointers to reduction buffers needed for 2-stage reductions.
401+
*
402+
* These buffers are used internally by CUDA 2-stage operations, and are
403+
* allocated and resized updon demand.
404+
*
405+
* \param[in] stream_id stream id in pool
406+
* \param[in] n_elts size of arrays
407+
* \param[in] n_elts size of arrays
408+
* \param[in] elt_size size of element or structure simultaneously reduced
409+
* \param[in] grid_size associated grid size
410+
* \param[out] r_grid first stage reduce buffer
411+
* \param[out] r_reduce second stage (final result) reduce buffer
412+
*/
413+
/*----------------------------------------------------------------------------*/
414+
415+
void
416+
cs_cuda_get_2_stage_reduce_buffers(int stream_id,
417+
cs_lnum_t n_elts,
418+
size_t elt_size,
419+
unsigned int grid_size,
420+
void* &r_grid,
421+
void* &r_reduce)
422+
{
423+
assert(stream_id > -1 && stream_id < _cs_glob_cuda_n_streams);
424+
425+
unsigned int t_grid_size = grid_size * elt_size;
426+
427+
if (_r_elt_size[stream_id] < elt_size) {
428+
_r_elt_size[stream_id] = elt_size;
429+
CS_FREE_HD(_r_reduce[stream_id]);
430+
unsigned char *b_ptr;
431+
CS_MALLOC_HD(b_ptr, elt_size, unsigned char, CS_ALLOC_HOST_DEVICE_SHARED);
432+
_r_reduce[stream_id] = b_ptr;
433+
}
434+
435+
if (_r_grid_size[stream_id] < t_grid_size) {
436+
_r_grid_size[stream_id] = t_grid_size;
437+
CS_FREE(_r_grid[stream_id]);
438+
unsigned char *b_ptr;
439+
CS_MALLOC_HD(b_ptr, _r_grid_size[stream_id], unsigned char, CS_ALLOC_DEVICE);
440+
_r_grid[stream_id] = b_ptr;
441+
}
442+
443+
r_grid = _r_grid[stream_id];
444+
r_reduce = _r_reduce[stream_id];
445+
}
446+
350447
#endif /* defined(__CUDACC__) */
351448

352449
/*----------------------------------------------------------------------------*/

Diff for: src/base/cs_base_cuda.h

+40
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,46 @@ cs_cuda_get_stream(int stream_id);
347347
cudaStream_t
348348
cs_cuda_get_stream_prefetch(void);
349349

350+
/*----------------------------------------------------------------------------*/
351+
/*!
352+
* \brief Return stream id in stream pool matching a given CUDA stream.
353+
*
354+
* If the stream is not presnet in the stream pool, return -1.
355+
*
356+
* \param [in] handle to given streams
357+
*
358+
* \returns if of stream in pool, or -1.
359+
*/
360+
/*----------------------------------------------------------------------------*/
361+
362+
int
363+
cs_cuda_get_stream_id(cudaStream_t stream);
364+
365+
/*----------------------------------------------------------------------------*/
366+
/*
367+
* \brief Return pointers to reduction buffers needed for 2-stage reductions.
368+
*
369+
* These buffers are used internally by CUDA 2-stage operations, and are
370+
* allocated and resized updon demand.
371+
*
372+
* \param[in] stream_id stream id in pool
373+
* \param[in] n_elts size of arrays
374+
* \param[in] n_elts size of arrays
375+
* \param[in] elt_size size of element or structure simultaneously reduced
376+
* \param[in] grid_size associated grid size
377+
* \param[out] r_grid first stage reduce buffer
378+
* \param[out] r_reduce second stage (final result) reduce buffer
379+
*/
380+
/*----------------------------------------------------------------------------*/
381+
382+
void
383+
cs_cuda_get_2_stage_reduce_buffers(int stream_id,
384+
cs_lnum_t n_elts,
385+
size_t elt_size,
386+
unsigned int grid_size,
387+
void* &r_grid,
388+
void* &r_reduce);
389+
350390
#endif /* defined(__NVCC__) */
351391

352392
BEGIN_C_DECLS

0 commit comments

Comments
 (0)