forked from mackstann/mona
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdiff.cu
153 lines (120 loc) · 4.83 KB
/
diff.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/* vim:set ts=8 sts=4 sw=4 expandtab: */
/* from https://github.com/howey/mona */
#include <cuda.h>
#include <cairo.h>
#include <limits.h>
#include <math.h>
#include "mona.h"
// Entry points defined in this file, declared with C linkage so their
// symbol names are not C++-mangled (presumably so the C parts of the
// project can link against them -- confirm against the callers).
extern "C" {
// Allocates the device and host buffers used by difference().
int difference_init();
// Releases the buffers allocated by difference_init().
void difference_clean();
// Runs the GPU comparison of the two surfaces' pixel data and returns the
// summed per-pixel difference.
int difference(cairo_surface_t * test_surf, cairo_surface_t * goal_surf);
// Returns the cached MAX_FITNESS value.
int get_max_fitness(void);
}
// Edge length of the square CUDA thread block: the kernel in this file is
// launched with BLOCK_SIZE x BLOCK_SIZE threads per block.
#define BLOCK_SIZE 16
/*
 * differenceKernel: per-pixel comparison of two images, one thread per pixel.
 *
 * Launch layout: a 2D grid of 2D blocks in which the x dimension covers image
 * columns and the y dimension covers image rows. The grid may be rounded up
 * past the image edges; threads that fall outside the image do nothing.
 *
 * test_data, goal_data: device buffers holding 4 bytes per pixel, rows packed
 *                       back to back (4 * width bytes per row).
 * difference:           device array of width * height ints; the entry for
 *                       each pixel receives the sum of the absolute
 *                       differences of its four bytes.
 * my_max_fitness:       device array of width * height ints; the entry for
 *                       each pixel receives the sum of the goal pixel's four
 *                       bytes.
 * width, height:        image dimensions in pixels.
 */
__global__ void differenceKernel(unsigned char * test_data, unsigned char * goal_data, int * difference, int * my_max_fitness, int width, int height)
{
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Both coordinates must be checked individually: a flat "i < width*height"
    // test lets out-of-range columns alias pixels of other rows, and any
    // store made outside the guard writes past the end of the output arrays.
    if(col >= width || row >= height)
        return;

    // Row-major pixel index; consecutive threads in x touch consecutive
    // pixels, and every pixel is visited exactly once for any width/height.
    int i = row * width + col;
    int thispixel = 4 * i;

    int difference_s = 0;
    int my_max_fitness_s = 0;
    for(int c = 0; c < 4; c++) {
        // Widen to int before subtracting so the difference can go negative.
        int test_c = test_data[thispixel + c];
        int goal_c = goal_data[thispixel + c];
        my_max_fitness_s += goal_c;
        difference_s += ABS(test_c - goal_c);
    }

    my_max_fitness[i] = my_max_fitness_s;
    difference[i] = difference_s;
}
// Sum of all goal-image channel bytes; stays -1 until the first difference()
// call computes it.
int MAX_FITNESS = -1;
// Device copies of the goal and test images (4 bytes per pixel), allocated in
// difference_init().
unsigned char * goal_data_d;
unsigned char * test_data_d;
// Device arrays with one int per pixel, written by differenceKernel.
int * difference_d;
int * my_max_fitness_d;
// Host pointer to the goal surface's pixel data; NULL until the goal image has
// been uploaded to goal_data_d by difference().
unsigned char * goal_data = NULL;
// Host buffers (one int per pixel) that receive the kernel's per-pixel results.
int * my_difference;
int * my_max_fitness;
/*
 * difference_init: allocate the device and host buffers used by difference().
 *
 * Sizes are derived from WIDTH and HEIGHT: two device image buffers of
 * 4 bytes per pixel, plus one device and one host int-per-pixel array for
 * each of the two kernel outputs.
 *
 * Returns 0 on success. Returns -1 if the dimensions are not positive or if
 * any allocation fails; in that case everything allocated so far is released
 * and all buffer pointers are left NULL.
 */
int difference_init()
{
    if(WIDTH <= 0 || HEIGHT <= 0)
        return -1;

    const size_t pixels = (size_t)WIDTH * (size_t)HEIGHT;
    const size_t image_bytes = sizeof(unsigned char) * 4 * pixels;
    const size_t result_bytes = sizeof(int) * pixels;

    // Start from a known state so the failure path can free unconditionally.
    goal_data_d = NULL;
    test_data_d = NULL;
    difference_d = NULL;
    my_max_fitness_d = NULL;
    my_difference = NULL;
    my_max_fitness = NULL;

    // Short-circuit evaluation stops at the first allocation that fails.
    int ok = cudaMalloc((void **)&goal_data_d, image_bytes) == cudaSuccess
          && cudaMalloc((void **)&test_data_d, image_bytes) == cudaSuccess
          && cudaMalloc((void **)&difference_d, result_bytes) == cudaSuccess
          && cudaMalloc((void **)&my_max_fitness_d, result_bytes) == cudaSuccess
          && (my_difference = (int *)malloc(result_bytes)) != NULL
          && (my_max_fitness = (int *)malloc(result_bytes)) != NULL;

    if(!ok) {
        // cudaFree and free both accept NULL, so no per-pointer checks needed.
        cudaFree(goal_data_d);
        cudaFree(test_data_d);
        cudaFree(difference_d);
        cudaFree(my_max_fitness_d);
        free(my_difference);
        free(my_max_fitness);
        goal_data_d = NULL;
        test_data_d = NULL;
        difference_d = NULL;
        my_max_fitness_d = NULL;
        my_difference = NULL;
        my_max_fitness = NULL;
        return -1;
    }

    return 0;
}
/*
 * difference_clean: release every buffer allocated by difference_init().
 *
 * All pointers are reset to NULL afterwards, so calling this twice is
 * harmless. The cached host goal pointer is cleared as well, so that a later
 * difference_init() + difference() sequence uploads the goal image again
 * instead of comparing against a freshly allocated, unfilled device buffer.
 */
void difference_clean()
{
    cudaFree(goal_data_d);
    goal_data_d = NULL;
    cudaFree(test_data_d);
    test_data_d = NULL;
    cudaFree(difference_d);
    difference_d = NULL;
    cudaFree(my_max_fitness_d);
    my_max_fitness_d = NULL;

    free(my_difference);
    my_difference = NULL;
    free(my_max_fitness);
    my_max_fitness = NULL;

    // goal_data doubles as the "goal already uploaded" flag in difference().
    goal_data = NULL;
}
/*
 * difference: compare the test surface against the goal surface on the GPU.
 *
 * test_surf: surface whose pixels are uploaded and compared on every call.
 * goal_surf: surface whose pixels are uploaded once, on the first successful
 *            call, and reused from device memory afterwards.
 *
 * Both surfaces are read as WIDTH * HEIGHT pixels of 4 bytes with rows packed
 * back to back -- assumes the surfaces really have that layout (e.g. stride
 * == 4 * WIDTH); TODO confirm against the code that creates them.
 *
 * Returns the sum over all pixels of the absolute per-byte differences,
 * clamped to INT_MAX. The first successful call also stores the sum of all
 * goal bytes in MAX_FITNESS. If pixel data is unavailable or any CUDA
 * operation fails, INT_MAX is returned so that a failed evaluation can never
 * look like a good match.
 *
 * difference_init() must have succeeded before this is called.
 */
int difference(cairo_surface_t * test_surf, cairo_surface_t * goal_surf)
{
    const size_t pixels = (size_t)WIDTH * (size_t)HEIGHT;
    const size_t image_bytes = sizeof(unsigned char) * 4 * pixels;
    const size_t result_bytes = sizeof(int) * pixels;

    // cairo requires a flush before the pixel data is read directly.
    cairo_surface_flush(test_surf);
    unsigned char * test_data = cairo_image_surface_get_data(test_surf);
    if(!test_data)
        return INT_MAX;

    if(!goal_data) {
        cairo_surface_flush(goal_surf);
        unsigned char * goal_host = cairo_image_surface_get_data(goal_surf);
        if(!goal_host)
            return INT_MAX;
        if(cudaMemcpy(goal_data_d, goal_host, image_bytes, cudaMemcpyHostToDevice) != cudaSuccess)
            return INT_MAX;
        // Mark the goal as cached only once it is really on the device.
        goal_data = goal_host;
    }

    // One thread per pixel; integer ceil-division rounds the grid up so it
    // covers images whose sides are not multiples of BLOCK_SIZE.
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE, 1);
    dim3 blocks((WIDTH + BLOCK_SIZE - 1) / BLOCK_SIZE, (HEIGHT + BLOCK_SIZE - 1) / BLOCK_SIZE, 1);

    // The test image changes every call, so this upload cannot be avoided
    // here; it is a PCI-E round trip per evaluation.
    if(cudaMemcpy(test_data_d, test_data, image_bytes, cudaMemcpyHostToDevice) != cudaSuccess)
        return INT_MAX;

    differenceKernel<<<blocks, threads>>>(test_data_d, goal_data_d, difference_d, my_max_fitness_d, WIDTH, HEIGHT);
    // Launch-configuration errors are only visible through cudaGetLastError.
    if(cudaGetLastError() != cudaSuccess)
        return INT_MAX;

    // Blocking copy: waits for the kernel and reports any execution error.
    if(cudaMemcpy(my_difference, difference_d, result_bytes, cudaMemcpyDeviceToHost) != cudaSuccess)
        return INT_MAX;

    // The goal never changes, so its channel sum is fetched and reduced only
    // while MAX_FITNESS is still unset.
    if(MAX_FITNESS == -1) {
        if(cudaMemcpy(my_max_fitness, my_max_fitness_d, result_bytes, cudaMemcpyDeviceToHost) != cudaSuccess)
            return INT_MAX;
        long long max_fitness_total = 0;
        for(size_t i = 0; i < pixels; i++)
            max_fitness_total += my_max_fitness[i];
        MAX_FITNESS = max_fitness_total > INT_MAX ? INT_MAX : (int)max_fitness_total;
    }

    // TODO: perform the reduction on the GPU. Probably won't be much of a
    // speedup anyway. Accumulate in 64 bits so large images cannot overflow.
    long long difference_total = 0;
    for(size_t i = 0; i < pixels; i++)
        difference_total += my_difference[i];

    return difference_total > INT_MAX ? INT_MAX : (int)difference_total;
}
/*
 * get_max_fitness: report the cached maximum fitness.
 *
 * Returns the current value of MAX_FITNESS, which is -1 until difference()
 * has stored the goal image's channel sum in it.
 */
int get_max_fitness()
{
    const int cached_max = MAX_FITNESS;
    return cached_max;
}