diff --git a/README.md b/README.md
index 0e38ddb..90e79fd 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,101 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Yuru Wang
+  * yuruwang@seas.upenn.edu
+* Tested on: Windows 10, i7-7700HQ @ 2.5GHz 128GB, GTX 1050 Ti 8GB (personal computer)
+* Modified CMakeLists.txt: changed sm_20 to sm_61 inside cuda_add_library

-### (TODO: Your README)
+## Project Description ##
+This project implements GPU stream compaction in CUDA. Here, compaction simply removes all zeros from an array of `int`s. To compare and analyze GPU and CPU performance, several versions of the Scan (Prefix Sum) algorithm are implemented and then combined with a scatter step to perform stream compaction.

-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+The features included in this project are:
+* CPU Scan: single for-loop implementation
+* CPU Stream Compaction: single for-loop implementation
+* Naive GPU Scan: GPU version of the naive parallel scan
+* Work-Efficient GPU Scan: GPU work-efficient parallel scan with up-sweep and down-sweep phases
+* Work-Efficient GPU Stream Compaction: the GPU work-efficient scan combined with a scatter kernel
+* Thrust Implementation: uses the built-in thrust::exclusive_scan function
+
+Extra Credit:
+Part 5 (optimize the GPU work-efficient scan) was implemented. The optimized scan performs noticeably better than the original version; a detailed analysis and performance comparison are given in the next section.
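+
+For illustration, the following small host-side sketch (illustrative code only, not part of the project sources) shows how the map, exclusive-scan, and scatter steps fit together to remove zeros from an array:
+
+```
+// Illustrative only: compact {3,0,7,0,0,4,0,1} by removing zeros.
+#include <cstdio>
+
+int main() {
+    const int n = 8;
+    int in[n] = {3, 0, 7, 0, 0, 4, 0, 1};
+    int bools[n], indices[n], out[n];
+
+    // Map: 1 for elements to keep, 0 for zeros.
+    for (int i = 0; i < n; ++i) bools[i] = (in[i] != 0) ? 1 : 0;
+    // bools   = {1, 0, 1, 0, 0, 1, 0, 1}
+
+    // Exclusive scan of bools gives each kept element its output index.
+    indices[0] = 0;
+    for (int i = 1; i < n; ++i) indices[i] = indices[i - 1] + bools[i - 1];
+    // indices = {0, 1, 1, 2, 2, 2, 3, 3}
+
+    // Scatter: each kept element lands at its scanned index.
+    int count = indices[n - 1] + bools[n - 1];   // 4 elements survive
+    for (int i = 0; i < n; ++i) {
+        if (bools[i]) out[indices[i]] = in[i];
+    }
+
+    for (int i = 0; i < count; ++i) printf("%d ", out[i]);   // prints: 3 7 4 1
+    return 0;
+}
+```
+The surviving-element count falls out of the scan for free: it is the last scanned index plus the last boolean.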
+
+## Performance Analysis and Questions ##
+![](img/blockSize.jpg)
+As shown in the diagram above, block size has little effect on performance, so a block size of 512 was chosen for comparing the different implementations.
+
+
+![](img/scan_comparison.jpg)
+The plot above shows a rough ranking of the implementations: Thrust performs best, the GPU work-efficient scan comes next, the GPU naive scan ranks third, and the CPU scan performs worst.
+
+When the array size is small (less than 2^20), there is little difference between the implementations, and the CPU scan is even the fastest. This is most likely because the overhead of launching GPU work outweighs the benefit of parallelism for small arrays. As the array size grows, the curves diverge: the GPU work-efficient scan overtakes the CPU scan once the advantage of parallel computation exceeds its overhead.
+
+The Thrust implementation performs very well even for large arrays, presumably because it uses shared memory, which is much faster to access than global memory.
+
+
+![](img/compact.jpg)
+This graph shows that GPU compaction with scan clearly outperforms the other compaction implementations.
+
+
+![](img/optimization.jpg)
+The graph above shows the performance improvement of the GPU work-efficient scan after optimization. Roughly a 2x improvement can be observed, especially for array sizes beyond 2^26. This is achieved by reducing the number of idle threads in the up-sweep and down-sweep phases. The old implementation is slow because, at each sweep iteration, most of the launched threads do no work, which wastes SM resources. After removing those idle threads and remapping the remaining work onto contiguous thread indices, the SMs stay fully occupied and performance improves.
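+
+The index remapping at the heart of the optimization is sketched below (a simplified sketch with illustrative names, not the exact project kernels): each up-sweep level launches only n / 2^(d+1) threads, and every launched thread is mapped onto the strided element it owns, so no launched thread sits idle.
+
+```
+// Simplified sketch of the optimization (illustrative names).
+// Instead of launching n threads and masking most of them off with
+// `index % (1 << (d + 1)) == 0`, launch only the threads that have work
+// and remap each compact thread index onto its strided element.
+__global__ void kernUpSweepCompact(int numActive, int stride2, int stride, int *data) {
+    int t = threadIdx.x + blockIdx.x * blockDim.x;
+    if (t >= numActive) return;
+    // Thread t handles the t-th node at this level of the reduction tree.
+    data[t * stride2 + stride2 - 1] += data[t * stride2 + stride - 1];
+}
+
+void upSweep(int n, int depth, int *dev_data, int blockSize) {
+    for (int d = 0; d < depth; ++d) {
+        int stride = 1 << d, stride2 = 1 << (d + 1);
+        int numActive = n / stride2;                     // only these threads do work
+        dim3 grid((numActive + blockSize - 1) / blockSize);
+        kernUpSweepCompact<<<grid, blockSize>>>(numActive, stride2, stride, dev_data);
+    }
+}
+```
+The down-sweep phase shrinks its launches and remaps indices in the same way.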
+
+## Output ##
+blockSize = 512, ArraySize = 2^26
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 29 28 49 36 34 5 16 37 28 9 17 18 23 ... 21 0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 343.553ms    (std::chrono Measured)
+    [ 0 29 57 106 142 176 181 197 234 262 271 288 306 ... 1643656586 1643656607 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 170.328ms    (std::chrono Measured)
+    [ 0 29 57 106 142 176 181 197 234 262 271 288 306 ... 1643656538 1643656552 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 229.179ms    (CUDA Measured)
+    [ 0 29 57 106 142 176 181 197 234 262 271 288 306 ... 1643656586 1643656607 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 228.234ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 59.0043ms    (CUDA Measured)
+    [ 0 29 57 106 142 176 181 197 234 262 271 288 306 ... 1643656586 1643656607 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 59.0076ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 12.3873ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 9.8304ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 2 0 2 3 3 0 2 0 2 2 2 0 3 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 191.089ms    (std::chrono Measured)
+    [ 2 2 3 3 2 2 2 2 3 2 2 3 1 ... 3 3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 213.995ms    (std::chrono Measured)
+    [ 2 2 3 3 2 2 2 2 3 2 2 3 1 ... 2 3 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 536.685ms    (std::chrono Measured)
+    [ 2 2 3 3 2 2 2 2 3 2 2 3 1 ... 3 3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 80.1091ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 80.2937ms    (CUDA Measured)
+    passed
+```
diff --git a/img/blockSize.jpg b/img/blockSize.jpg
new file mode 100644
index 0000000..705d616
Binary files /dev/null and b/img/blockSize.jpg differ
diff --git a/img/compact.jpg b/img/compact.jpg
new file mode 100644
index 0000000..850db3c
Binary files /dev/null and b/img/compact.jpg differ
diff --git a/img/optimization.jpg b/img/optimization.jpg
new file mode 100644
index 0000000..652e949
Binary files /dev/null and b/img/optimization.jpg differ
diff --git a/img/scan_comparison.jpg b/img/scan_comparison.jpg
new file mode 100644
index 0000000..2dc02ef
Binary files /dev/null and b/img/scan_comparison.jpg differ
diff --git a/src/main.cpp b/src/main.cpp
index 1850161..0f73bcd 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include
 #include "testing_helpers.hpp"

-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 26; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -51,14 +51,14 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);

-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-    onesArray(SIZE, c);
-    printDesc("1s array for finding bugs");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
+    //For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+    //onesArray(SIZE, c);
+    //printDesc("1s array for finding bugs");
+    //StreamCompaction::Naive::scan(SIZE, c, a);
+    //printArray(SIZE, c, true);

     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
@@ -71,7 +71,7 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);

     zeroArray(SIZE, c);
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..4bb0dc2 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES

 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 8fc0211..56c84d6 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,17 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            if (idata[index] == 0) {
+                bools[index] = 0;
+            }
+            else {
+                bools[index] = 1;
+            }
         }

         /**
@@ -33,6 +44,15 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) {
             // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            if (bools[index] == 1) {
+                int idx = indices[index];
+                odata[idx] = idata[index];
+            }
         }
     }
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 99a1b04..e0e8643 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -13,6 +13,8 @@
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)

+const int blockSize = 512;
+
 /**
  * Check for CUDA errors; print and exit if there was a problem.
  */
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..bb8ec6f 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,15 +1,15 @@
 #include
 #include "cpu.h"

-#include "common.h"
+#include "common.h"

 namespace StreamCompaction {
     namespace CPU {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }

 /**
@@ -20,6 +20,11 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
+
             timer().endCpuTimer();
         }

@@ -31,8 +36,16 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int count = 0;
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    odata[count] = idata[i];
+                    count++;
+                }
+            }
+
             timer().endCpuTimer();
-            return -1;
+            return count;
         }

         /**
@@ -43,8 +56,34 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int *mapping = new int[n];
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] == 0) {
+                    mapping[i] = 0;
+                }
+                else {
+                    mapping[i] = 1;
+                }
+            }
+
+            int *scanned = new int[n];
+            scanned[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                scanned[i] = scanned[i - 1] + mapping[i - 1];
+            }
+
+            int lastIdx = 0;
+            for (int i = 0; i < n; ++i) {
+                if (mapping[i] != 0) {
+                    int idx = scanned[i];
+                    odata[idx] = idata[i];
+                    lastIdx = idx;
+                }
+            }
+
+            timer().endCpuTimer();
-            return -1;
+            return lastIdx + 1;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..bc59a9d 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,24 +3,138 @@
 #include "common.h"
 #include "efficient.h"
+#include <cstring>
+using namespace std;
+
+
 namespace StreamCompaction {
     namespace Efficient {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }

+        __global__ void kernUpSweep(int n, int d, int *data) {
+            // old implementation
+            //int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            //if (index >= n) {
+            //    return;
+            //}
+            //int twoPowerDplus1 = 1 << (d + 1);
+            //int twoPowerD = 1 << d;
+
+
+            //if (index % twoPowerDplus1 == 0) {
+            //    data[index + twoPowerDplus1 - 1] += data[index + twoPowerD - 1];
+            //}
+
+            // optimized implementation
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+            int twoPowerDplus1 = 1 << (d + 1);
+            int twoPowerD = 1 << d;
+
+            data[index * twoPowerDplus1 + twoPowerDplus1 - 1] += data[index * twoPowerDplus1 + twoPowerD - 1];
+
+        }
+
+        __global__ void kernDownSweep(int n, int d, int *data) {
+            // old implementation
+            //int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            //if (index >= n) {
+            //    return;
+            //}
+
+            //int twoPowerDplus1 = 1 << (d + 1);
+            //int twoPowerD = 1 << d;
+
+            //if (index % twoPowerDplus1 == 0) {
+            //    int temp = data[index + twoPowerD - 1];
+            //    data[index + twoPowerD - 1] = data[index + twoPowerDplus1 - 1];
+            //    data[index + twoPowerDplus1 - 1] += temp;
+            //}
+
+            // optimized implementation
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            int twoPowerDplus1 = 1 << (d + 1);
+            int twoPowerD = 1 << d;
+
+            int temp = data[index * twoPowerDplus1 + twoPowerD - 1];
+            data[index * twoPowerDplus1 + twoPowerD - 1] = data[index * twoPowerDplus1 + twoPowerDplus1 - 1];
+            data[index * twoPowerDplus1 + twoPowerDplus1 - 1] += temp;
+        }
+
+        void efficientScan(int n, int depth, int *dev_tempData) {
+            // old implementation
+            //dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            //for (int d = 0; d < depth; ++d) {
+            //    kernUpSweep <<<fullBlocksPerGrid, blockSize>>> (n, d, dev_tempData);
+            //}
+
+            //cudaMemset(dev_tempData + n - 1, 0, sizeof(int));
+            //for (int d = depth - 1; d >= 0; --d) {
+            //    kernDownSweep <<<fullBlocksPerGrid, blockSize>>> (n, d, dev_tempData);
+            //}
+
+            // optimized implementation
+            for (int d = 0; d < depth; ++d) {
+                int twoPowerDPluse1 = 1 << (d + 1);
+                dim3 fullBlocksPerGrid((n / twoPowerDPluse1 + blockSize - 1) / blockSize);
+                kernUpSweep <<<fullBlocksPerGrid, blockSize>>> (n / twoPowerDPluse1, d, dev_tempData);
+            }
+
+            cudaMemset(dev_tempData + n - 1, 0, sizeof(int));
+            for (int d = depth - 1; d >= 0; --d) {
+                int twoPowerDPluse1 = 1 << (d + 1);
+                dim3 fullBlocksPerGrid((n / twoPowerDPluse1 + blockSize - 1) / blockSize);
+                kernDownSweep <<<fullBlocksPerGrid, blockSize>>> (n / twoPowerDPluse1, d, dev_tempData);
+            }
+
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int depth = ilog2ceil(n);
+            int totalLength = 1 << depth;
+
+            int *dev_tempData;
+            int *dataWithPadding = new int[totalLength];
+            std::memcpy(dataWithPadding, idata, n * sizeof(int));
+
+            for (int i = n; i < totalLength; ++i) {
+                dataWithPadding[i] = 0;
+            }
+
+            cudaMalloc((void**)&dev_tempData, totalLength * sizeof(int));
+            cudaMemcpy(dev_tempData, dataWithPadding, totalLength * sizeof(int), cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
             // TODO
+            efficientScan(totalLength, depth, dev_tempData);
+
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_tempData, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_tempData);
+
+            delete[] dataWithPadding;
         }
+
+        __global__ void kernCompact(int n, int *odata, int *mapping, int *scan) {
+
+        }
+
         /**
          * Performs stream compaction on idata, storing the result into odata.
          * All zeroes are discarded.
@@ -31,10 +145,63 @@ namespace StreamCompaction {
          * @returns The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            // array size rounded up to a power of two
+            int depth = ilog2ceil(n);
+            int totalLength = 1 << depth;
+            int *initIndices = new int[totalLength];
+
+            for (int i = 0; i < totalLength; ++i) {
+                initIndices[i] = 0;
+            }
+
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            int *dev_bools;
+            int *dev_indices;
+            int *dev_idata;
+            int *dev_odata;
+
+
+            cudaMalloc((void**)&dev_bools, n * sizeof(int));
+            cudaMalloc((void**)&dev_indices, totalLength * sizeof(int));
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaMemcpy(dev_indices, initIndices, totalLength * sizeof(int), cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
             // TODO
+            // mapping
+            StreamCompaction::Common::kernMapToBoolean <<<fullBlocksPerGrid, blockSize>>> (n, dev_bools, dev_idata);
+
+            // scan
+            cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            efficientScan(totalLength, depth, dev_indices);
+
+            // scatter
+            StreamCompaction::Common::kernScatter <<<fullBlocksPerGrid, blockSize>>> (n, dev_odata, dev_idata, dev_bools, dev_indices);
+
             timer().endGpuTimer();
-            return -1;
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (odata[i]) {
+                    count++;
+                } else {
+                    break;
+                }
+            }
+
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+
+            delete[] initIndices;
+            return count;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..43ae7e2 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,23 +3,59 @@
 #include "common.h"
 #include "naive.h"
+
 namespace StreamCompaction {
     namespace Naive {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }
         // TODO: __global__
+        __global__ void kernNaiveScan(int n, int d, int* odata, const int* idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            int dist = 1 << (d - 1);
+            if (index >= dist) {
+                odata[index] = idata[index - dist] + idata[index];
+            } else {
+                odata[index] = idata[index];
+            }
+
+
+        }
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int* dev_idata;
+            int* dev_odata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+            int depth = ilog2ceil(n);
             timer().startGpuTimer();
             // TODO
+            for (int d = 1; d <= depth; ++d) {
+                kernNaiveScan <<<fullBlocksPerGrid, blockSize>>> (n, d, dev_odata, dev_idata);
+                std::swap(dev_idata, dev_odata);
+            }
+            std::swap(dev_idata, dev_odata);
+
+            cudaMemcpy(odata + 1, dev_odata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+            odata[0] = 0;
             timer().endGpuTimer();
+
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
         }
+
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..3316f07 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -8,21 +8,25 @@ namespace StreamCompaction {
     namespace Thrust {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
         }

         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            thrust::device_vector<int> dev_in(idata, idata + n);
+            thrust::device_vector<int> dev_out(odata, odata + n);
             timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin());
             timer().endGpuTimer();
+            thrust::copy(dev_out.begin(), dev_out.end(), odata);
         }
     }
 }