Skip to content
This repository has been archived by the owner on Feb 6, 2022. It is now read-only.

Commit

Permalink
back up
Browse files Browse the repository at this point in the history
  • Loading branch information
Oscar Zhang committed Aug 7, 2019
1 parent ff3b2c4 commit f0fccab
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 34 deletions.
65 changes: 37 additions & 28 deletions layers.c
Original file line number Diff line number Diff line change
Expand Up @@ -108,36 +108,43 @@ void conv_forward(conv_layer_t* l, volume_t** inputs, volume_t** outputs, int st

double res[4];
__m256d sum_v = _mm256_set1_pd(0.0);
for (int fd = 0; fd < filter->depth / 16 * 16; fd += 16) {
// loop unrolling + SIMD
__m256d vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + fd );
__m256d vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + fd );
__m256d temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + (fd + 4) );
vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + (fd + 4) );
temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + (fd + 8) );
vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + (fd + 8) );
temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + (fd + 12) );
vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + (fd + 12) );
temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

if (filter->depth == 3) {
sum += filter->weights[((filter->width * fy) + fx) * filter->depth ] * in->weights[((in->width * in_y) + in_x) * in->depth];
sum += filter->weights[((filter->width * fy) + fx) * filter->depth + 1] * in->weights[((in->width * in_y) + in_x) * in->depth + 1];
sum += filter->weights[((filter->width * fy) + fx) * filter->depth + 2] * in->weights[((in->width * in_y) + in_x) * in->depth + 2];
} else {
for (int fd = 0; fd < filter->depth / 16 * 16; fd += 16) {
// loop unrolling + SIMD
__m256d vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + fd );
__m256d vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + fd );
__m256d temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + (fd + 4) );
vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + (fd + 4) );
temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + (fd + 8) );
vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + (fd + 8) );
temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

vector1 = _mm256_loadu_pd( filter->weights + ((filter->width * fy) + fx) * filter->depth + (fd + 12) );
vector2 = _mm256_loadu_pd( in->weights + ((in->width * in_y) + in_x) * in->depth + (fd + 12) );
temp_product = _mm256_mul_pd(vector1, vector2);
sum_v = _mm256_add_pd(sum_v, temp_product);

}
_mm256_store_pd(res, sum_v); // store the temp result
sum += res[0] + res[1] + res[2] + res[3];

// tail case
for (int t=filter->depth/16*16; t<filter->depth; t++) {
sum += filter->weights[((filter->width * fy) + fx) * filter->depth + t] * in->weights[((in->width * in_y) + in_x) * in->depth + t];
}
}
_mm256_store_pd(res, sum_v); // store the temp result
sum += res[0] + res[1] + res[2] + res[3];

// tail case
for (int t=filter->depth/16*16; t<filter->depth; t++) {
sum += filter->weights[((filter->width * fy) + fx) * filter->depth + t] * in->weights[((in->width * in_y) + in_x) * in->depth + t];
}
}
}
}
Expand Down Expand Up @@ -372,6 +379,7 @@ softmax_layer_t* make_softmax_layer(int input_width, int input_height, int input
void softmax_forward(softmax_layer_t* l, volume_t** inputs, volume_t** outputs, int start, int end) {
double likelihoods[l->output_depth];

#pragma omp parallel for
for (int j = start; j <= end; j++) {
volume_t* in = inputs[j];
volume_t* out = outputs[j];
Expand All @@ -397,5 +405,6 @@ void softmax_forward(softmax_layer_t* l, volume_t** inputs, volume_t** outputs,
out->weights[i] = likelihoods[i] / total;
}
}

}

14 changes: 8 additions & 6 deletions network.c
Original file line number Diff line number Diff line change
Expand Up @@ -157,15 +157,17 @@ void net_classify(network_t* net, volume_t** input, double** likelihoods, int n)
}

// tail case [run this on one single thread instead of having all threads execute this section]
for (int ii = total_threads * num_per_chunck; ii < n; ii++) {
copy_volume(b[0][0], input[ii]);
net_forward(net, b, 0, 0);
for (int jj=0; jj<NUM_CLASSES; jj++) {
likelihoods[ii][jj] = b[11][0]->weights[jj];
if (thread_id == omp_get_num_threads() - 1) {
for (int ii = total_threads * num_per_chunck; ii < n; ii++) {
copy_volume(b[0][0], input[ii]);
net_forward(net, b, 0, 0);
for (int jj=0; jj<NUM_CLASSES; jj++) {
likelihoods[ii][jj] = b[11][0]->weights[jj];
}
}
}

free_batch(b, 1);
}

}

0 comments on commit f0fccab

Please sign in to comment.