Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 81 additions & 75 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,95 +1,101 @@
cmake_minimum_required(VERSION 3.0)
cmake_minimum_required(VERSION 3.1)

project(cis565_path_tracer)

set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})

# Set up include and lib paths
set(EXTERNAL "external")
include_directories("${EXTERNAL}")
include_directories("${EXTERNAL}/include")
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/osx")
elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/linux" "/usr/lib64")
elseif(WIN32)
set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win")
endif()
link_directories(${EXTERNAL_LIB_PATH})
list(APPEND CMAKE_LIBRARY_PATH "${EXTERNAL_LIB_PATH}")


# Find up and set up core dependency libs
set_property(GLOBAL PROPERTY USE_FOLDERS ON)

set(GLFW_INCLUDE_DIR "${EXTERNAL}/include")
set(GLFW_LIBRARY_DIR "${CMAKE_LIBRARY_PATH}")
find_library(GLFW_LIBRARY "glfw3" HINTS "${GLFW_LIBRARY_DIR}")

set(GLEW_INCLUDE_DIR "${EXTERNAL}/include")
set(GLEW_LIBRARY_DIR "${CMAKE_LIBRARY_PATH}")
add_definitions(-DGLEW_STATIC)
find_package(GLEW)
set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})

find_package(OpenGL)

set(CORELIBS
"${GLFW_LIBRARY}"
"${OPENGL_LIBRARY}"
"${GLEW_LIBRARY}"
)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

# Enable C++11 for host code
set(CMAKE_CXX_STANDARD 11)

# Enable CUDA debug info in debug mode builds
list(APPEND CUDA_NVCC_FLAGS_DEBUG -G -g)

# OSX-specific hacks/fixes
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
list(APPEND CORELIBS "-framework IOKit")
list(APPEND CORELIBS "-framework Cocoa")
list(APPEND CORELIBS "-framework CoreVideo")
# Set a default build type if none was specified
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
SET(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
# Set the possible values of build type for cmake-gui
SET_PROPERTY(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

# Linux-specific hacks/fixes
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
list(APPEND CMAKE_EXE_LINKER_FLAGS "-lX11 -lXxf86vm -lXrandr -lXi")
endif()

if (WIN32)
list(APPEND CORELIBS legacy_stdio_definitions.lib)
endif()
########################################
# CUDA Setup
########################################
find_package(CUDA 10 REQUIRED)
include(${CMAKE_MODULE_PATH}/CUDAComputesList.cmake)

list(APPEND CUDA_NVCC_FLAGS ${CUDA_GENERATE_CODE})
list(APPEND CUDA_NVCC_FLAGS_DEBUG "-g -G")
set(CUDA_VERBOSE_BUILD ON)

if(WIN32)
# Set up include and lib paths
set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE FILEPATH "Host side compiler used by NVCC" FORCE)
endif(WIN32)
########################################

find_package(OpenGL REQUIRED)

if(UNIX)
find_package(glfw3 REQUIRED)
find_package(GLEW REQUIRED)
set(LIBRARIES glfw ${GLEW_LIBRARIES} ${OPENGL_gl_LIBRARY})
else(UNIX)
set(EXTERNAL "external")

set(GLFW_ROOT_DIR ${EXTERNAL})
set(GLFW_USE_STATIC_LIBS ON)
find_package(GLFW REQUIRED)

set(GLEW_ROOT_DIR ${EXTERNAL})
set(GLEW_USE_STATIC_LIBS ON)
find_package(GLEW REQUIRED)

add_definitions(${GLEW_DEFINITIONS})
include_directories(${GLEW_INCLUDE_DIR} ${GLFW_INCLUDE_DIR})
set(LIBRARIES ${GLEW_LIBRARY} ${GLFW_LIBRARY} ${OPENGL_LIBRARY})
endif(UNIX)

set(GLM_ROOT_DIR "external")
find_package(GLM REQUIRED)
include_directories(${GLM_INCLUDE_DIRS})

set(headers
src/main.h
src/image.h
src/interactions.h
src/intersections.h
src/glslUtility.hpp
src/pathtrace.h
src/scene.h
src/sceneStructs.h
src/preview.h
src/utilities.h
)

# Crucial magic for CUDA linking
find_package(Threads REQUIRED)
find_package(CUDA 8.0 REQUIRED)
set(sources
src/main.cpp
src/stb.cpp
src/image.cpp
src/glslUtility.cpp
src/pathtrace.cu
src/scene.cpp
src/preview.cpp
src/utilities.cpp
)

set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
set(CUDA_SEPARABLE_COMPILATION ON)
list(SORT headers)
list(SORT sources)

if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
endif()
source_group(Headers FILES ${headers})
source_group(Sources FILES ${sources})

include_directories(.)
#add_subdirectory(stream_compaction) # TODO: uncomment if using your stream compaction
add_subdirectory(src)

cuda_add_executable(${CMAKE_PROJECT_NAME}
"src/main.h"
"src/main.cpp"
)

cuda_add_executable(${CMAKE_PROJECT_NAME} ${sources} ${headers})
target_link_libraries(${CMAKE_PROJECT_NAME}
src
${LIBRARIES}
#stream_compaction # TODO: uncomment if using your stream compaction
${CORELIBS}
)

add_custom_command(
TARGET ${CMAKE_PROJECT_NAME}
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_SOURCE_DIR}/shaders
${CMAKE_BINARY_DIR}/shaders
)
153 changes: 148 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,154 @@ CUDA Path Tracer

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**

* (TODO) YOUR NAME HERE
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Name: Bowen Yang
* [LinkedIn](https://www.linkedin.com/in/%E5%8D%9A%E6%96%87-%E6%9D%A8-83bba6148)
* [GitHub](https://github.com/Grillnov)
* [Facebook](https://www.facebook.com/yang.bowen.7399)
* [Steam](https://steamcommunity.com/id/grillnov)
* Tested on: Windows 10 x64, i7-6800K @ 3.40GHz 32GB, GTX 1080 8GB (Personal computer at home)

### (TODO: Your README)
# Description

*DO NOT* leave the README to the last minute! It is a crucial part of the
project, and we will not be able to grade you without a good README.
Implement a path tracer with CUDA acceleration enabled, capable of rendering globally-illuminated, photo-realistic images at high speeds. The base code already provides auxiliary services such as I/O, keyboard interrupts, and OpenGL calls. What we're supposed to do is implement the core of it.

# Part 1: Basics

![](img/Plain.png)
The picture pretty much says it all.

## Performance analysis so far
We choose this default configuration as the benchmark of our performance analysis:
*Anti-aliasing: OFF*
*Depth of Field: OFF*
*1st Cache: OFF*
*Material sort: OFF*

### Performance analysis phase 1: The cached rays

![](img/Cache.png)
As we can see here, caching the 1st rays created slightly more overhead and consequently had a negative effect on performance.

### Performance analysis phase 2: The sorting

Theoretically we can expect some performance boost from sorting all the intersections by their material keys so that intersections sharing a material are contiguous in memory, which improves spatial locality in the succeeding phases that calculate the scatters, etc.

However many of our classmates are suffering from severe performance loss due to this sorting operation. Some even faced a 90x execution time increase.

After carefully consulting the thrust library I found out that, unless it is sorting primitive integers, thrust::sort() or thrust::sort_by_key() will call its merge-sort subprocedure, not the radix-sort one, although for our purposes radix sort is the optimal solution. So I went ahead and implemented one on my own, since it's way too hard to inherit from the boolean functors in the thrust library.

![](img/Sort.png)

Still some performance loss, but a lot better than using sort in thrust library directly.

# Part 2: Detail of the radix sort implementation

To perform radix sort or bucket sort in our case, first we need to allocate the buckets, one for each material ID. Each bucket is sized $num_paths for the worst case.
Then we perform 2 passes to complete the radix sort:

## 1. Collect
Fill the buckets with corresponding intersections.
```
// Collect pass: copy every intersection into the bucket of its material ID.
//
// num_paths                        number of entries in shaderableIntersections
// shaderableIntersections          input intersections, read-only
// shadeableIntersectionBucketsPtr  one destination array per material ID
// bucketSizes                      per-material element counters; the host
//                                  snippet zeroes them before this launch,
//                                  and each ends up holding its bucket's size
//
// One thread handles one intersection. The order of elements inside a bucket
// is unspecified because it depends on the order in which threads reserve
// their slots.
__global__ void fillBuckets(
    int num_paths
    , const ShadeableIntersection * shaderableIntersections
    , ShadeableIntersection ** shadeableIntersectionBucketsPtr
    , int * bucketSizes
    )
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_paths)
    {
        int materialIndex = shaderableIntersections[idx].materialId;
        // Many threads may target the same bucket concurrently. A plain
        // read-then-increment of the counter would let several of them pick
        // the same slot and lose increments, so reserve the slot atomically:
        // atomicAdd returns the counter's previous value, which is unique
        // for every caller.
        int indexWithinBucket = atomicAdd(&bucketSizes[materialIndex], 1);

        shadeableIntersectionBucketsPtr[materialIndex][indexWithinBucket] = shaderableIntersections[idx];
    }
}
```
## 2. Expand
Recover the new sequence of the original array by expanding each bucket back into the array.
```
// Expand pass, launched once per bucket: copy bucket `materialId` back into
// the main intersection array, right after the contents of all lower-numbered
// buckets.
//
// materialId                    index of the bucket handled by this launch
// shaderableIntersections       destination array, overwritten
// shadeableIntersectionBucketI  the bucket belonging to materialId, read-only
// bucketSizes                   number of valid elements in every bucket
//
// One thread copies one element of the bucket.
__global__ void expandBuckets(
    const int materialId
    , ShadeableIntersection * shaderableIntersections
    , const ShadeableIntersection * shadeableIntersectionBucketI
    , const int * bucketSizes)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int bucketISize = bucketSizes[materialId];
    if (idx < bucketISize)
    {
        // The bucket's first output position is the total size of the
        // buckets that precede it.
        int offset = 0;
        for (int i = 0; i < materialId; ++i)
        {
            offset += bucketSizes[i];
        }
        // Index the bucket by this thread's position within it. Indexing by
        // materialId would make every thread copy the same single element.
        shaderableIntersections[offset + idx] = shadeableIntersectionBucketI[idx];
    }
}
```

## Calling kernels from host side
```
//Set the buckets to empty: zero the element counter of every bucket
cudaMemset(reinterpret_cast<void*>(validElementNumbers), 0, numOfBuckets * sizeof(int));

//Pass 1 (collect): distribute the intersections into their per-material buckets
fillBuckets <<<numblocksPathSegmentTracing, blockSize1d >>> (
num_paths
, dev_intersections
, dev_intersectionBucketsPtrs
, validElementNumbers
);
//Pass 2 (expand): one kernel launch per bucket writes that bucket back into dev_intersections
for (int i = 0; i < numOfBuckets; ++i)
{
expandBuckets <<<numblocksPathSegmentTracing, blockSize1d >>> (
i
, dev_intersections
, dev_intersectionBuckets[i]
, validElementNumbers
);
}
```

## Discussions
As we can see here, my somewhat primitive radix sort is not optimized well enough, especially when it comes to expanding the buckets back into the original array. To remedy this, we can first move the per-bucket loop from the host into the kernel, since we're currently launching one kernel for each bucket, which is a significant waste of threads. Secondly, we can cache the sizes of the buckets somewhere else on the device so that we don't have to calculate the offset for each thread.

# Part 3: Anti-Aliasing

Jitter the rays with an amplitude taken from thrust::uniform_real_distribution, with a range of (-JITTER_RANGE, JITTER_RANGE). Since we're doing multiple iterations of sampling anyway, the multisampling happens automatically.

![](img/AA.png)
With jitter = 0.008f.

![](img/AAmessed.png)
It's a little tricky to tune the jitter amplitude; get it wrong and you'll end up like this.

Some comparison between AA switched on/off.

With AA on:

![](img/AACloseup.png)

With AA off:

![](img/PlainCloseup.png)

# Part 4: Motion Blur

I added a translational velocity to the sphere, and added an explicit time integration with iteration increasing, so that the sphere translates with time elapsing.

![](img/MotionBlur.png)
# Part 5: Depth of Field

To implement DOF with a physically-based lens, I referred to [this page](https://pub.dartlang.org/documentation/dartray/0.0.1/core/ConcentricSampleDisk.html) to get a better sampling throughout the disk. This sampled amplitude is then applied to the rays to create the effect of looking through the aperture of a camera, with a certain focus and lens thickness.

Depth of Field render.

![](img/DOF.png)

Close-up scene render.

![](img/CloseUpDOF.png)
Loading