Skip to content
This repository was archived by the owner on Mar 12, 2020. It is now read-only.

Commit

Permalink
....
Browse files Browse the repository at this point in the history
  • Loading branch information
deepakkumar1984 committed Feb 8, 2019
1 parent 13edf28 commit b87e8c3
Show file tree
Hide file tree
Showing 18 changed files with 1,361 additions and 494 deletions.
4 changes: 2 additions & 2 deletions Examples/MNIST/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ static void Main(string[] args)
Global.UseGpu();

string datasetFolder = @"C:\dataset\MNIST";
bool useDenseModel = true;
bool useDenseModel = false;

var ((trainX, trainY), (valX, valY)) = MNISTParser.LoadDataSet(datasetFolder, trainCount: 600, testCount: 100, flatten: useDenseModel);
var ((trainX, trainY), (valX, valY)) = MNISTParser.LoadDataSet(datasetFolder, trainCount: 60000, testCount: 10000, flatten: useDenseModel);
Console.WriteLine("Train and Test data loaded");
DataFrameIter trainIter = new DataFrameIter(trainX, trainY);
DataFrameIter valIter = new DataFrameIter(valX, valY);
Expand Down
5 changes: 3 additions & 2 deletions SiaNet/Data/Parser/MNISTParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,12 @@ private static (ImageFrame, ImageFrame) BuildSet(DigitImage[] images, bool flatt
{
var target = inputs.Select(0, i);

Variable.FromArray(images[i].pixels, Global.Device)
Variable.FromArray(images[i].pixels, cpuAllocator)
.AsType(DType.Float32)
.ToDevice(Global.Device)
.Evaluate(target);

//target = target / 255;
target = target / 255;
}

Ops.FillOneHot(outputs, LabelCount, images.Select(x => (int)x.label).ToArray());
Expand Down
7 changes: 4 additions & 3 deletions SiaNet/Layers/Activations/Softmax.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ public override void Forward(Tensor x)

public override void Backward(Tensor outputgrad)
{
var s = Output.Reshape(-1, 1);
var d = Diag(s) - Dot(s, s.Transpose());
Input.Grad = outputgrad * Sum(d, -1).Reshape(Input.Data.Shape);
Input.Grad = outputgrad * Output * (1 - Output);
//var s = Output.Reshape(-1, 1);
//var d = Diag(s) - Dot(s, s.Transpose());
//Input.Grad = outputgrad * Sum(d, -1).Reshape(Input.Data.Shape);
}
}
}
188 changes: 188 additions & 0 deletions Tensor/TensorSharp/Cuda/DeviceCode/CU/GatherScatter.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
// Compute the offsets into the given tensors for a linear index. For the 't2'
// tensor, dimension 'dim' is skipped. The tensors are assumed to have the same
// size (with the exception of 't2' in dimension 'dim').
// This version uses a static number of dimensions.
template <typename IndexType, int Dims>
struct IndexToScatterGatherOffsets {
static __device__ void compute(
IndexType linearId, const int dim,
const TensorInfo<IndexType>& index, IndexType* indexOffset,
const TensorInfo<IndexType>& t1, IndexType* t1Offset,
const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
for (int d = Dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
*t1Offset += curDimIndex * t1.strides[d];
if (d != dim) {
*t2Offset += curDimIndex * t2.strides[d];
}
linearId /= index.sizes[d];
}
}

static __device__ void compute(
IndexType linearId, const int dim,
const TensorInfo<IndexType>& index, IndexType* indexOffset,
const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
for (int d = Dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
if (d != dim) {
*t2Offset += curDimIndex * t2.strides[d];
}
linearId /= index.sizes[d];
}
}
}

// Same as above but using a dynamic number of dimensions.
template <typename IndexType>
struct IndexToScatterGatherOffsets<IndexType, -1> {
static __device__ void compute(
IndexType linearId, const int dim,
const TensorInfo<IndexType>& index, IndexType* indexOffset,
const TensorInfo<IndexType>& t1, IndexType* t1Offset,
const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
for (int d = index.dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
*t1Offset += curDimIndex * t1.strides[d];
if (d != dim) {
*t2Offset += curDimIndex * t2.strides[d];
}
linearId /= index.sizes[d];
}
}

static __device__ void compute(
IndexType linearId, const int dim,
const TensorInfo<IndexType>& index, IndexType* indexOffset,
const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
for (int d = index.dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
if (d != dim) {
*t2Offset += curDimIndex * t2.strides[d];
}
linearId /= index.sizes[d];
}
}
}


template <typename IndexType, int Dims>
__global__ void gather_kernel(
TensorInfo<IndexType> tensor,
TensorInfo<IndexType> src,
TensorInfo<IndexType> index,
const int dim,
const IndexType totalElements) {
for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
linearId < totalElements;
linearId += gridDim.x * blockDim.x) {
IndexType tensorOffset = 0;
IndexType srcOffset = 0;
IndexType indexOffset = 0;

IndexToScatterGatherOffsets<IndexType, Dims>::compute(linearId, dim,
index, &indexOffset,
tensor, &tensorOffset,
src, &srcOffset);

IndexType indexValue = (IndexType)index.data[indexOffset];
srcOffset += indexValue * src.strides[dim];

tensor.data[tensorOffset] = src.data[srcOffset];
}
}

template <typename IndexType, int Dims>
__global__ void scatter_kernel(
TensorInfo<IndexType> tensor,
TensorInfo<IndexType> src,
TensorInfo<IndexType> index,
const int dim,
const IndexType totalElements) {
for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
linearId < totalElements;
linearId += gridDim.x * blockDim.x) {
IndexType tensorOffset = 0;
IndexType srcOffset = 0;
IndexType indexOffset = 0;

IndexToScatterGatherOffsets<IndexType, Dims>::compute(linearId, dim,
index, &indexOffset,
src, &srcOffset,
tensor, &tensorOffset);

IndexType indexValue = (IndexType)index.data[indexOffset];
tensorOffset += indexValue * tensor.strides[dim];

tensor.data[tensorOffset] = src.data[srcOffset];
}
}


template <typename IndexType, int Dims>
__global__ void scatterFill_kernel(
TensorInfo<IndexType> tensor,
TensorInfo<IndexType> index,
float value,
const int dim,
const IndexType totalElements) {
for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
linearId < totalElements;
linearId += gridDim.x * blockDim.x) {
IndexType tensorOffset = 0;
IndexType indexOffset = 0;

IndexToScatterGatherOffsets<IndexType, Dims>::compute(linearId, dim,
index, &indexOffset,
tensor, &tensorOffset);

IndexType indexValue = (IndexType)index.data[indexOffset];
tensorOffset += indexValue * tensor.strides[dim];

tensor.data[tensorOffset] = value;
}
}


#define DECLARE_GATHER(KERNEL_NAME, INDEX_TYPE, DIMS) \
extern "C" {\
__global__ void KERNEL_NAME(\
TensorInfo<INDEX_TYPE> tensor,\
TensorInfo<INDEX_TYPE> src,\
TensorInfo<INDEX_TYPE> indices,\
const int dim,\
INDEX_TYPE totalElements)\
{\
gather_kernel<INDEX_TYPE, DIMS>(tensor, src, indices, dim, totalElements);\
}\
}

#define DECLARE_SCATTER(KERNEL_NAME, INDEX_TYPE, DIMS) \
extern "C" {\
__global__ void KERNEL_NAME(\
TensorInfo<INDEX_TYPE> tensor,\
TensorInfo<INDEX_TYPE> src,\
TensorInfo<INDEX_TYPE> indices,\
const int dim,\
INDEX_TYPE totalElements)\
{\
scatter_kernel<INDEX_TYPE, DIMS>(tensor, src, indices, dim, totalElements);\
}\
}

#define DECLARE_SCATTERFILL(KERNEL_NAME, INDEX_TYPE, DIMS) \
extern "C" {\
__global__ void KERNEL_NAME(\
TensorInfo<INDEX_TYPE> tensor,\
TensorInfo<INDEX_TYPE> indices,\
float value,\
const int dim,\
INDEX_TYPE totalElements)\
{\
scatterFill_kernel<INDEX_TYPE, DIMS>(tensor, indices, value, dim, totalElements);\
}\
}
101 changes: 101 additions & 0 deletions Tensor/TensorSharp/Cuda/DeviceCode/CU/Im2Col.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
template <typename Dtype>
__device__ void im2col_kernel_t(const int n, const Dtype* data_im,
const int height, const int width, const int channels,
const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col,
Dtype* data_col) {
CUDA_KERNEL_LOOP(index, n) {
int w_out = index % width_col;
index /= width_col;
int h_out = index % height_col;
int channel_in = channels; //index / height_col;
int channel_out = channel_in * ksize_h * ksize_w;
int h_in = h_out * stride_h - pad_h;
int w_in = w_out * stride_w - pad_w;
data_col += (channel_out * height_col + h_out) * width_col + w_out;
data_im += (channel_in * height + h_in) * width + w_in;
const int channel_size = height * width;
for (int i = 0; i < ksize_h; i++) {
for (int j = 0; j < ksize_w; j++) {
int h = h_in + i * dilation_h;
int w = w_in + j * dilation_w;
*data_col = (h >= 0 && w >= 0 && h < height && w < width) ?
data_im[i * dilation_h * width + j * dilation_w] : 0;
data_col += height_col * width_col;
}
}
}
}

template <typename Dtype>
__device__ void col2im_kernel_t(const int n, const Dtype* data_col,
const int height, const int width, const int channels,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col,
Dtype* data_im) {
CUDA_KERNEL_LOOP(index, n) {
Dtype val = 0;
const int w_im = index % width + pad_w;
const int h_im = (index / width) % height + pad_h;
const int c_im = index / (width * height);
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
// compute the start and end of the output
const int w_col_start =
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start =
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
height_col + h_col) * width_col + w_col;
val += data_col[data_col_index];
}
}
}
data_im[index] = val;
}
}

extern "C" {
__global__ void im2col_kernel(const int n, const float* data_im,
const int height, const int width, const int channels,
const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col, float* data_col)
{
im2col_kernel_t(n, data_im, height, width, channels, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col, width_col, data_col);
}

__global__ void col2im_kernel(const int n, const float* data_col,
const int height, const int width, const int channels,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col,
float* data_im)
{
col2im_kernel_t(n, data_col, height, width, channels, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col, width_col, data_im);
}
}
Loading

0 comments on commit b87e8c3

Please sign in to comment.