// cudaTest.cu (forked from Nik4053/Templated-Cpp-Tensor)
#include "tensorlib/tensor.hpp"
#include "tensorlib/cpu/tensoralloc.hpp"
#include "tensorlib/cuda/tensor_cuda.hpp"
#include "tensorlib/cuda/tensoralloc_cuda.hpp"
#include "tensorlib/cpu/batchedtensor.hpp"
// CUDA includes
#include <cuda_runtime.h>
#include <cstdio> // printf (host and device)
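// Smoke test for the tensor library: launches trivial kernels and exercises
// Tensor/TensorCUDA views over managed, host-only, and device-only memory.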
__global__ void cuda_hello() {
    printf("Hello World from GPU!\n");
}
// Kernel overloads taking a Tensor / TensorCUDA by value; each writes one
// element and reports the tensor's compile-time rank. The element type is the
// template parameter T rather than a hardcoded float.
template <typename T, std::size_t DIM3, std::size_t DIM2, std::size_t DIM1>
__global__ void cuda_hello_tensor(Tensor<T, DIM3, DIM2, DIM1> t) {
    t(0, 0, 0) = 5;
    printf("Hello World from GPU! %lu\n", (unsigned long)t.RANK);
}
template <typename T, std::size_t DIM3, std::size_t DIM2, std::size_t DIM1>
__global__ void cuda_hello_tensor(TensorCUDA<T, DIM3, DIM2, DIM1> t) {
    t(0, 0, 0) = 5;
    printf("Hello World from GPU! %lu\n", (unsigned long)t.RANK);
}
// A Layer owns its allocator and a tensor created from it.
class Layer {
    TensorAllocator<float, 3, 3, 3> talloc;
    Tensor<float, 3, 3, 3> input = talloc.createTensor();
};
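// Not part of the original library: a minimal, assumed error-check helper.
// The unchecked cudaMalloc* calls and kernel launches in main() could be
// wrapped with it to surface failures in this test.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            printf("CUDA error %s at %s:%d\n",                        \
                   cudaGetErrorString(err_), __FILE__, __LINE__);     \
        }                                                             \
    } while (0)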
int main() {
    const size_t dim1 = 4;
    const size_t dim2 = 3;
    const size_t dim3 = 1; // note: shadows CUDA's built-in dim3 type inside main()
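    // Unified (managed) memory: a single allocation reachable from both host
    // and device (return codes unchecked here; CUDA_CHECK above could wrap them).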
    float *data;
    cudaMallocManaged(&data, dim1 * dim2 * dim3 * sizeof(float));
    Tensor<float, dim3, dim2, dim1> t(data);
    cuda_hello<<<1, 1>>>();
    cudaDeviceSynchronize();
    cuda_hello_tensor<float, dim3, dim2, dim1><<<1, 1>>>(t);
    cudaDeviceSynchronize();
    // std::cout << t << std::endl;
    // A TensorCUDA view over the same managed buffer.
    TensorCUDA<float, dim3, dim2, dim1> t2(data);
    // t2[0][0][1] = 7;
    t2.print();
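    // A host-only buffer wrapped in a plain (CPU) Tensor.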
    float dataHOST[dim1 * dim2 * dim3];
    for (size_t i = 0; i < dim1 * dim2 * dim3; i++) {
        dataHOST[i] = i;
    }
    Tensor<float, dim3, dim2, dim1> tH(dataHOST);
    tH.print();
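    // Device-only memory: the host cannot dereference it, so data moves via
    // explicit copyFromHost / copyToHost.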
    float *dataDEVICE;
    cudaMalloc(&dataDEVICE, dim1 * dim2 * dim3 * sizeof(float));
    TensorCUDA<float, dim3, dim2, dim1> tD2(dataDEVICE);
    tD2.print(); // contents are still uninitialized at this point
    tD2.copyFromHost(tH);
    cuda_hello_tensor<float, dim3, dim2, dim1><<<1, 1>>>(tD2);
    cudaDeviceSynchronize();
    tD2.print();
    tD2.copyToHost(tH);
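    // Allocator-backed tensors: storage comes from the allocator objects
    // rather than from explicit cudaMalloc calls.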
    TensorAllocator<float, 3, 3, 3> talloc;
    auto t3 = talloc.createTensor();
    t3.set(5);
    TensorAllocatorCUDA<float, 3, 3, 3> tallocCUDA;
    auto t4 = tallocCUDA.createDeviceTensor();
    t4.copyFromHost(t3);
    t4.print();
    // std::cout << t2 << std::endl;
    Layer *l = new Layer();
    delete l;
    // Release the raw CUDA allocations made above.
    cudaFree(dataDEVICE);
    cudaFree(data);
    return 0;
}