Skip to content
This repository has been archived by the owner on Feb 6, 2022. It is now read-only.

Commit

Permalink
backup
Browse files Browse the repository at this point in the history
  • Loading branch information
Oscar Zhang committed Aug 7, 2019
1 parent e2d4d84 commit ff3b2c4
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 17 deletions.
Binary file modified gmon.out
Binary file not shown.
35 changes: 21 additions & 14 deletions layers.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,34 +110,40 @@ void conv_forward(conv_layer_t* l, volume_t** inputs, volume_t** outputs, int st
__m256d sum_v = _mm256_set1_pd(0.0);
for (int fd = 0; fd < filter->depth / 16 * 16; fd += 16) {
// loop unrolling + SIMD
__m256d vector1 = _mm256_loadu_pd( filter->weights + (((filter->width * fy) + fx) * filter->depth + fd) );
__m256d vector2 = _mm256_loadu_pd( in->weights + (((in->width * fy) + fx) * in->depth + fd) );
__m256d vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + fd );
__m256d vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + fd );
__m256d temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

vector1 = _mm256_loadu_pd( filter->weights + (((filter->width * fy) + fx) * filter->depth + fd) + 4 );
vector2 = _mm256_loadu_pd( in->weights + (((in->width * fy) + fx) * in->depth + fd) + 4 );
__m256d temp_product = _mm256_mul_pd(vector1, vector2);
vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + (fd + 4) );
vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + (fd + 4) );
temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

vector1 = _mm256_loadu_pd( filter->weights + (((filter->width * fy) + fx) * filter->depth + fd) + 8 );
vector2 = _mm256_loadu_pd( in->weights + (((in->width * fy) + fx) * in->depth + fd) + 8 );
__m256d temp_product = _mm256_mul_pd(vector1, vector2);
vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + (fd + 8) );
vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + (fd + 8) );
temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

vector1 = _mm256_loadu_pd( filter->weights + (((filter->width * fy) + fx) * filter->depth + fd) + 12 );
vector2 = _mm256_loadu_pd( in->weights + (((in->width * fy) + fx) * in->depth + fd) + 12 );
__m256d temp_product = _mm256_mul_pd(vector1, vector2);
vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + (fd + 12) );
vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + (fd + 12) );
temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

}
_mm256_store_pd(res, sum_v); // store the temp result
sum += res[0] + res[1] + res[2] + res[3];

// tail case
for (int t=filter->depth/16*16; t<filter->depth; t++) {
sum += filter->weights[((filter->width * fy) + fx) * filter->depth + t] * in->weights[((in->width * in_y) + in_x) * in->depth + t];
}
}
}
}
sum = sum + l->biases->weights[f];
volume_set(out, out_x, out_y, f, sum);
// volume_set(out, out_x, out_y, f, sum);
out->weights[((out->width * out_y) + out_x) * out->depth + f] = sum;
}
}
}
Expand Down Expand Up @@ -200,7 +206,8 @@ void relu_forward(relu_layer_t* l, volume_t** inputs, volume_t** outputs, int st
for (int x = 0; x < l->input_width; x++) {
for (int y = 0; y < l->input_height; y++) {
for (int d = 0; d < l->input_depth; d++) {
double value = (volume_get(inputs[i], x, y, d) < 0.0) ? 0.0 : volume_get(inputs[i], x, y, d);
double perspective = inputs[i]->weights[((inputs[i]->width * y) + x) * inputs[i]->depth + d];
double value = (perspective < 0.0) ? 0.0 : perspective;
volume_set(outputs[i], x, y, d, value);
}
}
Expand Down Expand Up @@ -257,7 +264,7 @@ void pool_forward(pool_layer_t* l, volume_t** inputs, volume_t** outputs, int st
int in_y = y + fy;
int in_x = x + fx;
if (in_x >= 0 && in_x < in->width && in_y >= 0 && in_y < in->height) {
double v = volume_get(in, in_x, in_y, d);
double v = in->weights[((in->width * in_y) + in_x) * in->depth + d];;
if (v > max) {
max = v;
}
Expand Down
4 changes: 2 additions & 2 deletions network.c
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ void net_classify(network_t* net, volume_t** input, double** likelihoods, int n)
}
}

// tail case
for (int ii = total_threads * num_per_chunck; ii < n; ii++) {
// tail case [make this run on a single thread instead of having all threads run this section]
for (int ii = total_threads * num_per_chunck; ii < n; ii++) {
copy_volume(b[0][0], input[ii]);
net_forward(net, b, 0, 0);
for (int jj=0; jj<NUM_CLASSES; jj++) {
Expand Down
3 changes: 2 additions & 1 deletion volume.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ void copy_volume(volume_t* dest, volume_t* src) {
for (int x = 0; x < dest->width; x++) {
for (int y = 0; y < dest->height; y++) {
for (int d = 0; d < dest->depth; d++) {
volume_set(dest, x, y, d, volume_get(src, x, y, d));
double val = src->weights[((src->width * y) + x) * src->depth + d];
volume_set(dest, x, y, d, val);
}
}
}
Expand Down

0 comments on commit ff3b2c4

Please sign in to comment.