-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwarps.h
More file actions
47 lines (39 loc) · 1.29 KB
/
warps.h
File metadata and controls
47 lines (39 loc) · 1.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#ifndef __WARPS_H__
#define __WARPS_H__
#include "core_types.h"
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <bit>
constexpr u64 WARP_SIZE = 32u;
constexpr u64 WARP_SZ_SHIFT = std::bit_width( WARP_SIZE ) - 1u;
static_assert( std::has_single_bit( WARP_SIZE ), "WARP_SIZE not POT" );
__device__ __forceinline__ inline u64 LaneId() { return threadIdx.x & ( WARP_SIZE - 1 ); }
__device__ __forceinline__ inline u64 WarpId() { return threadIdx.x >> WARP_SZ_SHIFT; }
// NOTE: pre-Turing gpus don't support __reduce warp ops
template<typename T>
__device__ T WarpReduceShflDownSync( const T in )
{
T sum = in;
#pragma unroll
for( u64 offsetWithinWarp = WARP_SIZE >> 1; offsetWithinWarp > 0; offsetWithinWarp >>= 1 )
{
sum += __shfl_down_sync( u32( -1 ), sum, offsetWithinWarp );
}
return sum;
}
template<typename T>
__device__ T WarpInclusiveScanShflUpSync( const T val )
{
T inclusvieScan = val;
#pragma unroll
for( u64 offset = 1; offset < WARP_SIZE; offset <<= 1 )
{
const T warpLaneValAtOffset = __shfl_up_sync( 0xffffffff, inclusvieScan, offset );
if( LaneId() >= offset ) // NOTE: Distributes values
{
inclusvieScan += warpLaneValAtOffset;
}
}
return inclusvieScan;
};
#endif // !__WARPS_H__