Commit b87e8c3 (1 parent: 13edf28). 18 changed files with 1,361 additions and 494 deletions.
@@ -0,0 +1,188 @@
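// Illustrative only: TensorInfo is defined elsewhere in the repository, not
// in this commit. The kernels below assume shape/stride metadata roughly of
// this form (the real definition and dimension bound may differ):
//
//   template <typename IndexType>
//   struct TensorInfo {
//     float* data;
//     IndexType sizes[MAX_DIMS];    // MAX_DIMS is a hypothetical bound
//     IndexType strides[MAX_DIMS];
//     int dims;                     // used by the Dims == -1 specialization
//   };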
// Compute the offsets into the given tensors for a linear index. For the 't2'
// tensor, dimension 'dim' is skipped. The tensors are assumed to have the same
// size (with the exception of 't2' in dimension 'dim').
// This version uses a static number of dimensions.
template <typename IndexType, int Dims>
struct IndexToScatterGatherOffsets {
  static __device__ void compute(
      IndexType linearId, const int dim,
      const TensorInfo<IndexType>& index, IndexType* indexOffset,
      const TensorInfo<IndexType>& t1, IndexType* t1Offset,
      const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
    // Peel one coordinate per dimension off the linear index, innermost
    // first, and accumulate each tensor's strided offset.
    for (int d = Dims - 1; d >= 0; d--) {
      IndexType curDimIndex = linearId % index.sizes[d];
      *indexOffset += curDimIndex * index.strides[d];
      *t1Offset += curDimIndex * t1.strides[d];
      if (d != dim) {
        *t2Offset += curDimIndex * t2.strides[d];
      }
      linearId /= index.sizes[d];
    }
  }

  // Overload for callers that only need offsets into 'index' and one tensor.
  static __device__ void compute(
      IndexType linearId, const int dim,
      const TensorInfo<IndexType>& index, IndexType* indexOffset,
      const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
    for (int d = Dims - 1; d >= 0; d--) {
      IndexType curDimIndex = linearId % index.sizes[d];
      *indexOffset += curDimIndex * index.strides[d];
      if (d != dim) {
        *t2Offset += curDimIndex * t2.strides[d];
      }
      linearId /= index.sizes[d];
    }
  }
};

// Same as above but using a dynamic number of dimensions.
template <typename IndexType>
struct IndexToScatterGatherOffsets<IndexType, -1> {
  static __device__ void compute(
      IndexType linearId, const int dim,
      const TensorInfo<IndexType>& index, IndexType* indexOffset,
      const TensorInfo<IndexType>& t1, IndexType* t1Offset,
      const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
    for (int d = index.dims - 1; d >= 0; d--) {
      IndexType curDimIndex = linearId % index.sizes[d];
      *indexOffset += curDimIndex * index.strides[d];
      *t1Offset += curDimIndex * t1.strides[d];
      if (d != dim) {
        *t2Offset += curDimIndex * t2.strides[d];
      }
      linearId /= index.sizes[d];
    }
  }

  static __device__ void compute(
      IndexType linearId, const int dim,
      const TensorInfo<IndexType>& index, IndexType* indexOffset,
      const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
    for (int d = index.dims - 1; d >= 0; d--) {
      IndexType curDimIndex = linearId % index.sizes[d];
      *indexOffset += curDimIndex * index.strides[d];
      if (d != dim) {
        *t2Offset += curDimIndex * t2.strides[d];
      }
      linearId /= index.sizes[d];
    }
  }
};

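// Worked example: for a contiguous 2x3 'index' tensor (sizes {2, 3}, strides
// {3, 1}) and linearId = 4, the d = 1 step peels off 4 % 3 = 1 (offset += 1)
// and leaves linearId = 4 / 3 = 1; the d = 0 step peels off 1 % 2 = 1
// (offset += 3). The total offset is 4, i.e. element (1, 1). Because each
// tensor applies its own strides to the same coordinates, non-contiguous
// layouts work unchanged.
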
// These templated kernels are __device__ functions (not __global__): they
// cannot carry extern "C" linkage themselves, so the DECLARE_* macros below
// wrap each instantiation in a C-named __global__ entry point that calls it.
template <typename IndexType, int Dims>
__device__ void gather_kernel(
    TensorInfo<IndexType> tensor,
    TensorInfo<IndexType> src,
    TensorInfo<IndexType> index,
    const int dim,
    const IndexType totalElements) {
  for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
       linearId < totalElements;
       linearId += gridDim.x * blockDim.x) {
    IndexType tensorOffset = 0;
    IndexType srcOffset = 0;
    IndexType indexOffset = 0;

    IndexToScatterGatherOffsets<IndexType, Dims>::compute(linearId, dim,
        index, &indexOffset,
        tensor, &tensorOffset,
        src, &srcOffset);

    IndexType indexValue = (IndexType)index.data[indexOffset];
    srcOffset += indexValue * src.strides[dim];

    tensor.data[tensorOffset] = src.data[srcOffset];
  }
}

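// In index notation, for a 3-d case with dim == 1 this computes
//   tensor[i][j][k] = src[i][index[i][j][k]][k]
// i.e. each element of 'index' selects, along 'dim', which src element lands
// in the corresponding position of 'tensor'.
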
template <typename IndexType, int Dims>
__device__ void scatter_kernel(
    TensorInfo<IndexType> tensor,
    TensorInfo<IndexType> src,
    TensorInfo<IndexType> index,
    const int dim,
    const IndexType totalElements) {
  for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
       linearId < totalElements;
       linearId += gridDim.x * blockDim.x) {
    IndexType tensorOffset = 0;
    IndexType srcOffset = 0;
    IndexType indexOffset = 0;

    IndexToScatterGatherOffsets<IndexType, Dims>::compute(linearId, dim,
        index, &indexOffset,
        src, &srcOffset,
        tensor, &tensorOffset);

    IndexType indexValue = (IndexType)index.data[indexOffset];
    tensorOffset += indexValue * tensor.strides[dim];

    tensor.data[tensorOffset] = src.data[srcOffset];
  }
}

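// This is the inverse addressing of gather: for dim == 1,
//   tensor[i][index[i][j][k]][k] = src[i][j][k]
// If 'index' contains duplicate values along 'dim', several threads write
// the same destination and the surviving value is unspecified.
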
template <typename IndexType, int Dims>
__device__ void scatterFill_kernel(
    TensorInfo<IndexType> tensor,
    TensorInfo<IndexType> index,
    float value,
    const int dim,
    const IndexType totalElements) {
  for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
       linearId < totalElements;
       linearId += gridDim.x * blockDim.x) {
    IndexType tensorOffset = 0;
    IndexType indexOffset = 0;

    IndexToScatterGatherOffsets<IndexType, Dims>::compute(linearId, dim,
        index, &indexOffset,
        tensor, &tensorOffset);

    IndexType indexValue = (IndexType)index.data[indexOffset];
    tensorOffset += indexValue * tensor.strides[dim];

    tensor.data[tensorOffset] = value;
  }
}

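// scatterFill writes a scalar instead of reading from a 'src' tensor: for
// dim == 1, tensor[i][index[i][j][k]][k] = value. Only the 'index' and
// 'tensor' offsets are needed, hence the two-tensor compute() overload.
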
#define DECLARE_GATHER(KERNEL_NAME, INDEX_TYPE, DIMS) \
extern "C" {\
__global__ void KERNEL_NAME(\
    TensorInfo<INDEX_TYPE> tensor,\
    TensorInfo<INDEX_TYPE> src,\
    TensorInfo<INDEX_TYPE> indices,\
    const int dim,\
    INDEX_TYPE totalElements)\
{\
  gather_kernel<INDEX_TYPE, DIMS>(tensor, src, indices, dim, totalElements);\
}\
}

#define DECLARE_SCATTER(KERNEL_NAME, INDEX_TYPE, DIMS) \
extern "C" {\
__global__ void KERNEL_NAME(\
    TensorInfo<INDEX_TYPE> tensor,\
    TensorInfo<INDEX_TYPE> src,\
    TensorInfo<INDEX_TYPE> indices,\
    const int dim,\
    INDEX_TYPE totalElements)\
{\
  scatter_kernel<INDEX_TYPE, DIMS>(tensor, src, indices, dim, totalElements);\
}\
}

#define DECLARE_SCATTERFILL(KERNEL_NAME, INDEX_TYPE, DIMS) \
extern "C" {\
__global__ void KERNEL_NAME(\
    TensorInfo<INDEX_TYPE> tensor,\
    TensorInfo<INDEX_TYPE> indices,\
    float value,\
    const int dim,\
    INDEX_TYPE totalElements)\
{\
  scatterFill_kernel<INDEX_TYPE, DIMS>(tensor, indices, value, dim, totalElements);\
}\
}
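
These macros exist because a templated kernel cannot be given extern "C" linkage directly; each expansion emits a concrete, C-named __global__ entry point. The instantiation site is not part of this diff; hypothetical expansions (kernel names illustrative, not from this commit) would look like:

DECLARE_GATHER(gather_1d, unsigned int, 1)
DECLARE_GATHER(gather_nd, unsigned int, -1)
DECLARE_SCATTER(scatter_2d, unsigned int, 2)
DECLARE_SCATTERFILL(scatter_fill_1d, unsigned int, 1)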
@@ -0,0 +1,101 @@
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

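// CUDA_KERNEL_LOOP is Caffe's grid-stride loop idiom: each thread starts at
// its global id and advances by the total number of launched threads, so the
// kernels below cover any n regardless of the launch configuration.
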
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
template <typename Dtype>
__device__ void im2col_kernel_t(const int n, const Dtype* data_im,
    const int height, const int width, const int channels,
    const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int height_col, const int width_col,
    Dtype* data_col) {
  CUDA_KERNEL_LOOP(index, n) {
    // Decompose the linear index into (channel_in, h_out, w_out) without
    // mutating 'index' or the argument pointers, so the grid-stride loop
    // variable and the base pointers stay intact across iterations.
    const int w_out = index % width_col;
    const int h_out = (index / width_col) % height_col;
    const int channel_in = index / (width_col * height_col);
    const int channel_out = channel_in * ksize_h * ksize_w;
    const int h_in = h_out * stride_h - pad_h;
    const int w_in = w_out * stride_w - pad_w;
    Dtype* col = data_col + (channel_out * height_col + h_out) * width_col + w_out;
    const Dtype* im = data_im + (channel_in * height + h_in) * width + w_in;
    // Copy one (dilated) ksize_h x ksize_w window into its column, writing
    // zero for taps that fall into the padding.
    for (int i = 0; i < ksize_h; i++) {
      for (int j = 0; j < ksize_w; j++) {
        int h = h_in + i * dilation_h;
        int w = w_in + j * dilation_w;
        *col = (h >= 0 && w >= 0 && h < height && w < width) ?
          im[i * dilation_h * width + j * dilation_w] : 0;
        col += height_col * width_col;
      }
    }
  }
}

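// Illustrative, not part of this commit: callers are expected to size the
// column buffer with the standard convolution output-shape arithmetic,
//   height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1
// and likewise for width_col. For height = 32, pad_h = 1, ksize_h = 3,
// dilation_h = 1, stride_h = 1 this gives height_col = 32, and data_col then
// holds channels * ksize_h * ksize_w rows of height_col * width_col elements.
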
template <typename Dtype>
__device__ void col2im_kernel_t(const int n, const Dtype* data_col,
    const int height, const int width, const int channels,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int height_col, const int width_col,
    Dtype* data_im) {
  CUDA_KERNEL_LOOP(index, n) {
    Dtype val = 0;
    const int w_im = index % width + pad_w;
    const int h_im = (index / width) % height + pad_h;
    const int c_im = index / (width * height);
    int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    // compute the start and end of the output
    const int w_col_start =
      (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const int w_col_end = min(w_im / stride_w + 1, width_col);
    const int h_col_start =
      (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const int h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
      for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
        int h_k = (h_im - h_col * stride_h);
        int w_k = (w_im - w_col * stride_w);
        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
          h_k /= dilation_h;
          w_k /= dilation_w;
          int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
              height_col + h_col) * width_col + w_col;
          val += data_col[data_col_index];
        }
      }
    }
    data_im[index] = val;
  }
}

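// Note: col2im accumulates, for each image pixel, every column-buffer entry
// that im2col copied it into. It is therefore not an exact inverse of im2col;
// round-tripping multiplies each pixel by the number of kernel windows that
// cover it. That accumulation is exactly what the convolution backward pass
// with respect to the input needs.
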
extern "C" { | ||
__global__ void im2col_kernel(const int n, const float* data_im, | ||
const int height, const int width, const int channels, | ||
const int ksize_h, const int ksize_w, | ||
const int pad_h, const int pad_w, | ||
const int stride_h, const int stride_w, | ||
const int dilation_h, const int dilation_w, | ||
const int height_col, const int width_col, float* data_col) | ||
{ | ||
im2col_kernel_t(n, data_im, height, width, channels, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col, width_col, data_col); | ||
} | ||
|
||
__global__ void col2im_kernel(const int n, const float* data_col, | ||
const int height, const int width, const int channels, | ||
const int kernel_h, const int kernel_w, | ||
const int pad_h, const int pad_w, | ||
const int stride_h, const int stride_w, | ||
const int dilation_h, const int dilation_w, | ||
const int height_col, const int width_col, | ||
float* data_im) | ||
{ | ||
col2im_kernel_t(n, data_col, height, width, channels, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col, width_col, data_im); | ||
} | ||
} |
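
The wrappers above are plain C-named entry points so the compiled module can be queried by symbol name. The host side is not part of this diff; a hypothetical launch, assuming the kernels are compiled into the same translation unit with nvcc (names and block size illustrative), might look like:

// One thread per element of the column buffer.
void launch_im2col(const float* d_im, float* d_col,
                   int channels, int height, int width,
                   int ksize_h, int ksize_w, int pad_h, int pad_w,
                   int stride_h, int stride_w,
                   int dilation_h, int dilation_w,
                   int height_col, int width_col) {
  const int n = channels * height_col * width_col;
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
  im2col_kernel<<<blocks, threads>>>(n, d_im, height, width, channels,
      ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
      dilation_h, dilation_w, height_col, width_col, d_col);
}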