CIS565-Fall-2018 · Uchanka · Sep 30, 2018 · Sep 30, 2018 · Sep 30, 2018 · Sep 30, 2018
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,95 +1,101 @@
-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.1)
 
 project(cis565_path_tracer)
 
-set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
-
-# Set up include and lib paths
-set(EXTERNAL "external")
-include_directories("${EXTERNAL}")
-include_directories("${EXTERNAL}/include")
-if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/osx")
-elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
-    set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/linux" "/usr/lib64")
-elseif(WIN32)
-    set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win")
-endif()
-link_directories(${EXTERNAL_LIB_PATH})
-list(APPEND CMAKE_LIBRARY_PATH "${EXTERNAL_LIB_PATH}")
-
-
-# Find up and set up core dependency libs
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
-set(GLFW_INCLUDE_DIR "${EXTERNAL}/include")
-set(GLFW_LIBRARY_DIR "${CMAKE_LIBRARY_PATH}")
-find_library(GLFW_LIBRARY "glfw3" HINTS "${GLFW_LIBRARY_DIR}")
-
-set(GLEW_INCLUDE_DIR "${EXTERNAL}/include")
-set(GLEW_LIBRARY_DIR "${CMAKE_LIBRARY_PATH}")
-add_definitions(-DGLEW_STATIC)
-find_package(GLEW)
+set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
 
-find_package(OpenGL)
-
-set(CORELIBS
-    "${GLFW_LIBRARY}"
-    "${OPENGL_LIBRARY}"
-    "${GLEW_LIBRARY}"
-    )
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
 # Enable C++11 for host code
 set(CMAKE_CXX_STANDARD 11)
 
-# Enable CUDA debug info in debug mode builds
-list(APPEND CUDA_NVCC_FLAGS_DEBUG -G -g)
-
-# OSX-specific hacks/fixes
-if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    list(APPEND CORELIBS "-framework IOKit")
-    list(APPEND CORELIBS "-framework Cocoa")
-    list(APPEND CORELIBS "-framework CoreVideo")
+# Set a default build type if none was specified
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+    SET(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
+    # Set the possible values of build type for cmake-gui
+    SET_PROPERTY(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
-# Linux-specific hacks/fixes
-if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
-    list(APPEND CMAKE_EXE_LINKER_FLAGS "-lX11 -lXxf86vm -lXrandr -lXi")
-endif()
-
-if (WIN32)
-	list(APPEND CORELIBS legacy_stdio_definitions.lib)
-endif()
+########################################
+# CUDA Setup (FindCUDA): locate the toolkit, load the CUDAComputesList module, then extend the NVCC flags
+########################################
+find_package(CUDA 10 REQUIRED)
+include(CUDAComputesList) # looked up through CMAKE_MODULE_PATH, so it still works when that list has several entries
+
+list(APPEND CUDA_NVCC_FLAGS ${CUDA_GENERATE_CODE}) # presumably defined by CUDAComputesList.cmake - TODO confirm
+list(APPEND CUDA_NVCC_FLAGS_DEBUG "-g -G")
+set(CUDA_VERBOSE_BUILD ON)
+
+if(WIN32)
+    # Force NVCC to use the C++ compiler CMake selected as its host compiler
+    set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE FILEPATH "Host side compiler used by NVCC" FORCE)
+endif(WIN32)
+########################################
+
+find_package(OpenGL REQUIRED)
+
+if(UNIX)
+    find_package(glfw3 REQUIRED)
+    find_package(GLEW REQUIRED)
+    set(LIBRARIES glfw ${GLEW_LIBRARIES} ${OPENGL_gl_LIBRARY})
+else(UNIX)
+    set(EXTERNAL "external")
+
+    set(GLFW_ROOT_DIR ${EXTERNAL})
+    set(GLFW_USE_STATIC_LIBS ON)
+    find_package(GLFW REQUIRED)
+
+    set(GLEW_ROOT_DIR ${EXTERNAL})
+    set(GLEW_USE_STATIC_LIBS ON)
+    find_package(GLEW REQUIRED)
+
+    add_definitions(${GLEW_DEFINITIONS})
+    include_directories(${GLEW_INCLUDE_DIR} ${GLFW_INCLUDE_DIR})
+    set(LIBRARIES ${GLEW_LIBRARY} ${GLFW_LIBRARY} ${OPENGL_LIBRARY})
+endif(UNIX)
+
+set(GLM_ROOT_DIR "external")
+find_package(GLM REQUIRED)
+include_directories(${GLM_INCLUDE_DIRS})
+
+set(headers
+    src/main.h
+    src/image.h
+    src/interactions.h
+    src/intersections.h
+    src/glslUtility.hpp
+    src/pathtrace.h
+    src/scene.h
+    src/sceneStructs.h
+    src/preview.h
+    src/utilities.h
+    )
 
-# Crucial magic for CUDA linking
-find_package(Threads REQUIRED)
-find_package(CUDA 8.0 REQUIRED)
+set(sources
+    src/main.cpp
+    src/stb.cpp
+    src/image.cpp
+    src/glslUtility.cpp
+    src/pathtrace.cu
+    src/scene.cpp
+    src/preview.cpp
+    src/utilities.cpp
+    )
 
-set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
-set(CUDA_SEPARABLE_COMPILATION ON)
+list(SORT headers)
+list(SORT sources)
 
-if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-endif()
+source_group(Headers FILES ${headers})
+source_group(Sources FILES ${sources})
 
-include_directories(.)
 #add_subdirectory(stream_compaction)  # TODO: uncomment if using your stream compaction
-add_subdirectory(src)
-
-cuda_add_executable(${CMAKE_PROJECT_NAME}
-    "src/main.h"
-    "src/main.cpp"
-    )
 
+cuda_add_executable(${CMAKE_PROJECT_NAME} ${sources} ${headers})
 target_link_libraries(${CMAKE_PROJECT_NAME}
-    src
+    ${LIBRARIES}
     #stream_compaction  # TODO: uncomment if using your stream compaction
-    ${CORELIBS}
-    )
-
-add_custom_command(
-    TARGET ${CMAKE_PROJECT_NAME}
-    POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_directory
-        ${CMAKE_SOURCE_DIR}/shaders
-        ${CMAKE_BINARY_DIR}/shaders
     )
diff --git a/README.md b/README.md
@@ -3,11 +3,154 @@ CUDA Path Tracer
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Name: Bowen Yang
+  * [LinkedIn](https://www.linkedin.com/in/%E5%8D%9A%E6%96%87-%E6%9D%A8-83bba6148)
+  * [GitHub](https://github.com/Grillnov)
+  * [Facebook](https://www.facebook.com/yang.bowen.7399)
+  * [Steam](https://steamcommunity.com/id/grillnov)
+* Tested on: Windows 10 x64, i7-6800K @ 3.40GHz 32GB, GTX 1080 8GB (Personal computer at home)
 
-### (TODO: Your README)
+# Description
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+Implement a path tracer with CUDA acceleration enabled, capable of rendering globally-illuminated, photo-realistic images at high speeds. The basecode already has the auxiliary services like I/O, keyboard interrupts, and OpenGL calls. What we're supposed to do is to implement the core of it.
 
+# Part 1: Basics
+
+![](img/Plain.png)
+The picture pretty much says it all.
+
+## Performance analysis so far
+We choose this default configuration as the benchmark of our performance analysis:
+*Anti-aliasing: OFF*
+*Depth of Field: OFF*
+*1st Cache: OFF*
+*Material sort: OFF*
+
+### Performance analysis phase 1: The cached rays
+
+![](img/Cache.png)
+As we can see here, caching the 1st rays created slightly more overhead and consequently had a negative effect on performance.
+
+### Performance analysis phase 2: The sorting
+
+Theoretically, we can expect some performance boost from sorting all the intersections by their material keys so that intersections sharing a material are contiguous in memory, which improves spatial locality in the succeeding phases that calculate the scatters, etc.
+
+However many of our classmates are suffering from severe performance loss due to this sorting operation. Some even faced a 90x execution time increase.
+
+After carefully consulting the thrust library, I found out that, unless sorting primitive integers, thrust::sort() or thrust::sort_by_key() will call its merge-sort subprocedure, not the radix-sort one, although for our purposes radix sort is the optimal solution. So I went ahead and implemented one on my own, since it's way too hard to inherit the boolean functors in the thrust library.
+
+![](img/Sort.png)
+
+Still some performance loss, but a lot better than using sort in thrust library directly.
+
+# Part 2: Detail of the radix sort implementation
+
+To perform radix sort or bucket sort in our case, first we need to allocate the buckets, one for each material ID. Each bucket is sized $num_paths for the worst case.
+Then we perform 2 passes to complete the radix sort:
+
+## 1. Collect
+Fill the buckets with corresponding intersections.
+```
+__global__ void fillBuckets( // collect pass: one thread per intersection, scattered into the bucket of its material
+	int num_paths // number of intersections to process; threads with idx >= num_paths do nothing
+	, const ShadeableIntersection * shaderableIntersections // input intersections (read-only)
+	, ShadeableIntersection ** shadeableIntersectionBucketsPtr // bucket storage, indexed [materialId][slot]
+	, int * bucketSizes // per-material element counters; must be zeroed before the launch
+)
+{
+	int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	if (idx < num_paths)
+	{
+		int materialIndex = shaderableIntersections[idx].materialId;
+		// Reserve a unique slot atomically: a plain read followed by ++ races when several threads
+		// share a material, so slots get overwritten and counts are lost. Order inside a bucket is arbitrary.
+		int indexWithinBucket = atomicAdd(&bucketSizes[materialIndex], 1);
+		shadeableIntersectionBucketsPtr[materialIndex][indexWithinBucket] = shaderableIntersections[idx];
+	}
+}
+```
+## 2. Expand
+Recover the new sequence of the original array by expanding each bucket back into the array.
+```
+// Expand pass for one bucket: copy the bucket back into its contiguous slice of the intersection array
+__global__ void expandBuckets(
+	const int materialId // index of the bucket handled by this launch
+	, ShadeableIntersection * shaderableIntersections // destination array, rewritten grouped by material
+	, const ShadeableIntersection * shadeableIntersectionBucketI // storage of the bucket being expanded
+	, const int * bucketSizes) // element count of every bucket
+{
+	int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	int bucketISize = bucketSizes[materialId];
+	if (idx < bucketISize)
+	{
+		int offset = 0; // start of this bucket's slice = combined size of all lower-numbered buckets
+		for (int i = 0; i < materialId; ++i)
+		{
+			offset += bucketSizes[i];
+		}
+		shaderableIntersections[offset + idx] = shadeableIntersectionBucketI[idx]; // element idx of the bucket, not element materialId
+	}
+}
+```
+
+## Calling kernels from host side
+```
+// Empty the buckets: reset every per-bucket element counter to zero
+cudaMemset(reinterpret_cast<void*>(validElementNumbers), 0, numOfBuckets * sizeof(int));
+// Pass 1 (collect): distribute the intersections into their material buckets
+fillBuckets <<<numblocksPathSegmentTracing, blockSize1d >>> (
+		num_paths
+		, dev_intersections
+		, dev_intersectionBucketsPtrs
+		, validElementNumbers
+		);
+for (int i = 0; i < numOfBuckets; ++i) // Pass 2 (expand): one kernel launch per bucket
+{
+	expandBuckets <<<numblocksPathSegmentTracing, blockSize1d >>> (
+		i
+		, dev_intersections
+		, dev_intersectionBuckets[i]
+		, validElementNumbers
+		);
+}
+```
+
+## Discussions
+As we can see here, my somewhat primitive radix sort is not optimized well enough, especially when it comes to expanding the buckets back into the original array. To remedy this, we can first move the per-bucket loop from the host into the kernel, since calling one kernel for each bucket is a significant waste of threads. Secondly, we can cache the bucket offsets (the running sums of the bucket sizes) somewhere else on the device so that we don't have to calculate the offset in every thread.
+
+# Part 3: Anti-Aliasing
+
+Jitter the rays with an amplitude taken from thrust::uniform_real_distribution, with a range of (-JITTER_RANGE, JITTER_RANGE). Since we're doing multiple iterations of sampling anyway, the multisampling happens automatically.
+
+![](img/AA.png)
+With jitter = 0.008f.
+
+![](img/AAmessed.png)
+It's a little tricky to tune the jitter amplitude; get it wrong and you'll end up like this.
+
+Some comparison between AA switched on/off.
+
+With AA on:
+
+![](img/AACloseup.png)
+
+With AA off:
+
+![](img/PlainCloseup.png)
+
+# Part 4: Motion Blur
+
+I added a translational velocity to the sphere, and added an explicit time integration with iteration increasing, so that the sphere translates with time elapsing.
+
+![](img/MotionBlur.png)
+# Part 5: Depth of Field
+
+To implement DOF with a physically-based lens, I referred to [this page](https://pub.dartlang.org/documentation/dartray/0.0.1/core/ConcentricSampleDisk.html) to get a better sampling throughout the disk. This sampled amplitude is then applied to the rays to create the effect of looking through the aperture of a camera, with a certain focus and lens thickness.
+
+Depth of Field render.
+
+![](img/DOF.png)
+
+Close-up scene render.
+
+![](img/CloseUpDOF.png)