Commit b87e8c3 (1 parent: 13edf28). 18 changed files with 1,361 additions and 494 deletions.
@@ -0,0 +1,188 @@
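// Illustrative only: TensorInfo is defined elsewhere in the repository, not
// in this commit. The kernels below assume shape/stride metadata roughly of
// this form (the real definition and dimension bound may differ):
//
//   template <typename IndexType>
//   struct TensorInfo {
//     float* data;
//     IndexType sizes[MAX_DIMS];    // MAX_DIMS is a hypothetical bound
//     IndexType strides[MAX_DIMS];
//     int dims;                     // used by the Dims == -1 specialization
//   };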
// Compute the offsets into the given tensors for a linear index. For the 't2'
// tensor, dimension 'dim' is skipped. The tensors are assumed to have the same
// size (with the exception of 't2' in dimension 'dim').
// This version uses a static number of dimensions.
template <typename IndexType, int Dims>
struct IndexToScatterGatherOffsets {
  static __device__ void compute(
      IndexType linearId, const int dim,
      const TensorInfo<IndexType>& index, IndexType* indexOffset,
      const TensorInfo<IndexType>& t1, IndexType* t1Offset,
      const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
    // Peel one coordinate per dimension off the linear index, innermost
    // first, and accumulate each tensor's strided offset.
    for (int d = Dims - 1; d >= 0; d--) {
      IndexType curDimIndex = linearId % index.sizes[d];
      *indexOffset += curDimIndex * index.strides[d];
      *t1Offset += curDimIndex * t1.strides[d];
      if (d != dim) {
        *t2Offset += curDimIndex * t2.strides[d];
      }
      linearId /= index.sizes[d];
    }
  }

  // Overload for callers that only need offsets into 'index' and one tensor.
  static __device__ void compute(
      IndexType linearId, const int dim,
      const TensorInfo<IndexType>& index, IndexType* indexOffset,
      const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
    for (int d = Dims - 1; d >= 0; d--) {
      IndexType curDimIndex = linearId % index.sizes[d];
      *indexOffset += curDimIndex * index.strides[d];
      if (d != dim) {
        *t2Offset += curDimIndex * t2.strides[d];
      }
      linearId /= index.sizes[d];
    }
  }
};

// Same as above but using a dynamic number of dimensions.
template <typename IndexType>
struct IndexToScatterGatherOffsets<IndexType, -1> {
  static __device__ void compute(
      IndexType linearId, const int dim,
      const TensorInfo<IndexType>& index, IndexType* indexOffset,
      const TensorInfo<IndexType>& t1, IndexType* t1Offset,
      const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
    for (int d = index.dims - 1; d >= 0; d--) {
      IndexType curDimIndex = linearId % index.sizes[d];
      *indexOffset += curDimIndex * index.strides[d];
      *t1Offset += curDimIndex * t1.strides[d];
      if (d != dim) {
        *t2Offset += curDimIndex * t2.strides[d];
      }
      linearId /= index.sizes[d];
    }
  }

  static __device__ void compute(
      IndexType linearId, const int dim,
      const TensorInfo<IndexType>& index, IndexType* indexOffset,
      const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
    for (int d = index.dims - 1; d >= 0; d--) {
      IndexType curDimIndex = linearId % index.sizes[d];
      *indexOffset += curDimIndex * index.strides[d];
      if (d != dim) {
        *t2Offset += curDimIndex * t2.strides[d];
      }
      linearId /= index.sizes[d];
    }
  }
};

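// Worked example: for a contiguous 2x3 'index' tensor (sizes {2, 3}, strides
// {3, 1}) and linearId = 4, the d = 1 step peels off 4 % 3 = 1 (offset += 1)
// and leaves linearId = 4 / 3 = 1; the d = 0 step peels off 1 % 2 = 1
// (offset += 3). The total offset is 4, i.e. element (1, 1). Because each
// tensor applies its own strides to the same coordinates, non-contiguous
// layouts work unchanged.
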
// These templated kernels are __device__ functions (not __global__): they
// cannot carry extern "C" linkage themselves, so the DECLARE_* macros below
// wrap each instantiation in a C-named __global__ entry point that calls it.
template <typename IndexType, int Dims>
__device__ void gather_kernel(
    TensorInfo<IndexType> tensor,
    TensorInfo<IndexType> src,
    TensorInfo<IndexType> index,
    const int dim,
    const IndexType totalElements) {
  for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
       linearId < totalElements;
       linearId += gridDim.x * blockDim.x) {
    IndexType tensorOffset = 0;
    IndexType srcOffset = 0;
    IndexType indexOffset = 0;

    IndexToScatterGatherOffsets<IndexType, Dims>::compute(linearId, dim,
        index, &indexOffset,
        tensor, &tensorOffset,
        src, &srcOffset);

    IndexType indexValue = (IndexType)index.data[indexOffset];
    srcOffset += indexValue * src.strides[dim];

    tensor.data[tensorOffset] = src.data[srcOffset];
  }
}

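// In index notation, for a 3-d case with dim == 1 this computes
//   tensor[i][j][k] = src[i][index[i][j][k]][k]
// i.e. each element of 'index' selects, along 'dim', which src element lands
// in the corresponding position of 'tensor'.
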
template <typename IndexType, int Dims>
__device__ void scatter_kernel(
    TensorInfo<IndexType> tensor,
    TensorInfo<IndexType> src,
    TensorInfo<IndexType> index,
    const int dim,
    const IndexType totalElements) {
  for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
       linearId < totalElements;
       linearId += gridDim.x * blockDim.x) {
    IndexType tensorOffset = 0;
    IndexType srcOffset = 0;
    IndexType indexOffset = 0;

    IndexToScatterGatherOffsets<IndexType, Dims>::compute(linearId, dim,
        index, &indexOffset,
        src, &srcOffset,
        tensor, &tensorOffset);

    IndexType indexValue = (IndexType)index.data[indexOffset];
    tensorOffset += indexValue * tensor.strides[dim];

    tensor.data[tensorOffset] = src.data[srcOffset];
  }
}

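// This is the inverse addressing of gather: for dim == 1,
//   tensor[i][index[i][j][k]][k] = src[i][j][k]
// If 'index' contains duplicate values along 'dim', several threads write
// the same destination and the surviving value is unspecified.
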
template <typename IndexType, int Dims>
__device__ void scatterFill_kernel(
    TensorInfo<IndexType> tensor,
    TensorInfo<IndexType> index,
    float value,
    const int dim,
    const IndexType totalElements) {
  for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
       linearId < totalElements;
       linearId += gridDim.x * blockDim.x) {
    IndexType tensorOffset = 0;
    IndexType indexOffset = 0;

    IndexToScatterGatherOffsets<IndexType, Dims>::compute(linearId, dim,
        index, &indexOffset,
        tensor, &tensorOffset);

    IndexType indexValue = (IndexType)index.data[indexOffset];
    tensorOffset += indexValue * tensor.strides[dim];

    tensor.data[tensorOffset] = value;
  }
}

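// scatterFill writes a scalar instead of reading from a 'src' tensor: for
// dim == 1, tensor[i][index[i][j][k]][k] = value. Only the 'index' and
// 'tensor' offsets are needed, hence the two-tensor compute() overload.
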
#define DECLARE_GATHER(KERNEL_NAME, INDEX_TYPE, DIMS) \
extern "C" {\
__global__ void KERNEL_NAME(\
    TensorInfo<INDEX_TYPE> tensor,\
    TensorInfo<INDEX_TYPE> src,\
    TensorInfo<INDEX_TYPE> indices,\
    const int dim,\
    INDEX_TYPE totalElements)\
{\
  gather_kernel<INDEX_TYPE, DIMS>(tensor, src, indices, dim, totalElements);\
}\
}

#define DECLARE_SCATTER(KERNEL_NAME, INDEX_TYPE, DIMS) \
extern "C" {\
__global__ void KERNEL_NAME(\
    TensorInfo<INDEX_TYPE> tensor,\
    TensorInfo<INDEX_TYPE> src,\
    TensorInfo<INDEX_TYPE> indices,\
    const int dim,\
    INDEX_TYPE totalElements)\
{\
  scatter_kernel<INDEX_TYPE, DIMS>(tensor, src, indices, dim, totalElements);\
}\
}

#define DECLARE_SCATTERFILL(KERNEL_NAME, INDEX_TYPE, DIMS) \
extern "C" {\
__global__ void KERNEL_NAME(\
    TensorInfo<INDEX_TYPE> tensor,\
    TensorInfo<INDEX_TYPE> indices,\
    float value,\
    const int dim,\
    INDEX_TYPE totalElements)\
{\
  scatterFill_kernel<INDEX_TYPE, DIMS>(tensor, indices, value, dim, totalElements);\
}\
}
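
These macros exist because a templated kernel cannot be given extern "C" linkage directly; each expansion emits a concrete, C-named __global__ entry point. The instantiation site is not part of this diff; hypothetical expansions (kernel names illustrative, not from this commit) would look like:

DECLARE_GATHER(gather_1d, unsigned int, 1)
DECLARE_GATHER(gather_nd, unsigned int, -1)
DECLARE_SCATTER(scatter_2d, unsigned int, 2)
DECLARE_SCATTERFILL(scatter_fill_1d, unsigned int, 1)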
@@ -0,0 +1,101 @@
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

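// CUDA_KERNEL_LOOP is Caffe's grid-stride loop idiom: each thread starts at
// its global id and advances by the total number of launched threads, so the
// kernels below cover any n regardless of the launch configuration.
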
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
template <typename Dtype>
__device__ void im2col_kernel_t(const int n, const Dtype* data_im,
    const int height, const int width, const int channels,
    const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int height_col, const int width_col,
    Dtype* data_col) {
  CUDA_KERNEL_LOOP(index, n) {
    // Decompose the linear index into (channel_in, h_out, w_out) without
    // mutating 'index' or the argument pointers, so the grid-stride loop
    // variable and the base pointers stay intact across iterations.
    const int w_out = index % width_col;
    const int h_out = (index / width_col) % height_col;
    const int channel_in = index / (width_col * height_col);
    const int channel_out = channel_in * ksize_h * ksize_w;
    const int h_in = h_out * stride_h - pad_h;
    const int w_in = w_out * stride_w - pad_w;
    Dtype* col = data_col + (channel_out * height_col + h_out) * width_col + w_out;
    const Dtype* im = data_im + (channel_in * height + h_in) * width + w_in;
    // Copy one (dilated) ksize_h x ksize_w window into its column, writing
    // zero for taps that fall into the padding.
    for (int i = 0; i < ksize_h; i++) {
      for (int j = 0; j < ksize_w; j++) {
        int h = h_in + i * dilation_h;
        int w = w_in + j * dilation_w;
        *col = (h >= 0 && w >= 0 && h < height && w < width) ?
          im[i * dilation_h * width + j * dilation_w] : 0;
        col += height_col * width_col;
      }
    }
  }
}

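// Illustrative, not part of this commit: callers are expected to size the
// column buffer with the standard convolution output-shape arithmetic,
//   height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1
// and likewise for width_col. For height = 32, pad_h = 1, ksize_h = 3,
// dilation_h = 1, stride_h = 1 this gives height_col = 32, and data_col then
// holds channels * ksize_h * ksize_w rows of height_col * width_col elements.
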
template <typename Dtype>
__device__ void col2im_kernel_t(const int n, const Dtype* data_col,
    const int height, const int width, const int channels,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int height_col, const int width_col,
    Dtype* data_im) {
  CUDA_KERNEL_LOOP(index, n) {
    Dtype val = 0;
    const int w_im = index % width + pad_w;
    const int h_im = (index / width) % height + pad_h;
    const int c_im = index / (width * height);
    int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    // compute the start and end of the output
    const int w_col_start =
      (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const int w_col_end = min(w_im / stride_w + 1, width_col);
    const int h_col_start =
      (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const int h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
      for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
        int h_k = (h_im - h_col * stride_h);
        int w_k = (w_im - w_col * stride_w);
        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
          h_k /= dilation_h;
          w_k /= dilation_w;
          int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
              height_col + h_col) * width_col + w_col;
          val += data_col[data_col_index];
        }
      }
    }
    data_im[index] = val;
  }
}

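// Note: col2im accumulates, for each image pixel, every column-buffer entry
// that im2col copied it into. It is therefore not an exact inverse of im2col;
// round-tripping multiplies each pixel by the number of kernel windows that
// cover it. That accumulation is exactly what the convolution backward pass
// with respect to the input needs.
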
extern "C" { | ||
__global__ void im2col_kernel(const int n, const float* data_im, | ||
const int height, const int width, const int channels, | ||
const int ksize_h, const int ksize_w, | ||
const int pad_h, const int pad_w, | ||
const int stride_h, const int stride_w, | ||
const int dilation_h, const int dilation_w, | ||
const int height_col, const int width_col, float* data_col) | ||
{ | ||
im2col_kernel_t(n, data_im, height, width, channels, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col, width_col, data_col); | ||
} | ||
|
||
__global__ void col2im_kernel(const int n, const float* data_col, | ||
const int height, const int width, const int channels, | ||
const int kernel_h, const int kernel_w, | ||
const int pad_h, const int pad_w, | ||
const int stride_h, const int stride_w, | ||
const int dilation_h, const int dilation_w, | ||
const int height_col, const int width_col, | ||
float* data_im) | ||
{ | ||
col2im_kernel_t(n, data_col, height, width, channels, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col, width_col, data_im); | ||
} | ||
} |
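
The wrappers above are plain C-named entry points so the compiled module can be queried by symbol name. The host side is not part of this diff; a hypothetical launch, assuming the kernels are compiled into the same translation unit with nvcc (names and block size illustrative), might look like:

// One thread per element of the column buffer.
void launch_im2col(const float* d_im, float* d_col,
                   int channels, int height, int width,
                   int ksize_h, int ksize_w, int pad_h, int pad_w,
                   int stride_h, int stride_w,
                   int dilation_h, int dilation_w,
                   int height_col, int width_col) {
  const int n = channels * height_col * width_col;
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
  im2col_kernel<<<blocks, threads>>>(n, d_im, height, width, channels,
      ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
      dilation_h, dilation_w, height_col, width_col, d_col);
}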