From 66f2e539b2a48903b3a5fa9243c48e0dd5b28082 Mon Sep 17 00:00:00 2001 From: Erik Zenker Date: Mon, 5 Oct 2015 19:05:02 +0200 Subject: [PATCH] Squashed 'include/alpaka/' changes from 955fa97..9dfdb96 9dfdb96 fix missing OpenMP link flag b9f099c fix foldrAll ICC bug 83ddac5 disable the OpenMP 4 back-end by default 8644064 fix Vec for Intel 819e5d9 fix boost 1.56 missing const bug f9cd663 really fix Intel cpuid 330d983 remove incorrect docu 9f1b692 fix Intel compiler cpuid 1aa4c86 fix missing OMP_NUM_THREADS reset in getMaxOmpThreads 328e866 fix CUDA compilation 33c7888 remove ICC from the readme (untested / not compiling) 40a8465 always interpret all source files as .cu files for nvcc 25f4670 allow vectorize to be called without the element type 882c0a9 enhance documentation 05454a6 fix ambiguous template specialization for GetWorkDiv 5b70326 remove call to std::ref in BlockSharedAllocCudaBuiltIn e15c40a fix fix AtomicOmpCritSec afffe2f fix wrong atomic implementation for AccCpuOmp2Blocks 2a60bbb fix BufCudaRt destruction 062378d add ALPAKA_ADD_EXECUTABLE to alpakaConfig.cmake b9a4125 use DimInt more consistently 919dc26 move ElemType from mem::view to elem 2807fc8 add initial ALPAKA_ADD_EXECUTABLE f019e70 fix BufPlainPtrWrapper pitch 1ca1923 fix missing OpenMP linker flag 9ee231d fix getFreeGlobalMemSizeBytes 7e853c6 Merge pull request #54 from psychocoderHPC/fix-cudaSet 6796eff Merge pull request #55 from psychocoderHPC/fix-callingHostFunctionFromDevice 9f3d8e6 fix warning calling host function from device 000a250 fix wrong usage of `getPitchBytes<>()` 8be955d Merge pull request #53 from psychocoderHPC/topic-suppressHostDeviceWarning b7c877d Merge pull request #52 from psychocoderHPC/tpoic-updateGitIgnore 0b94251 suppress host device warning 33a59be update `.gitignore` 237898f refactoring d0ad945 implement getFreeGlobalMemSizeBytes f85e233 allow accelerators to inherit from rand implementation d96e8b5 fix CUDA set implemenentation git-subtree-dir: include/alpaka git-subtree-split: 9dfdb96b0cb2fc32a1f2e447de755905f7538bf4 --- .gitignore | 13 ++ .travis.yml | 56 ++--- README.md | 33 ++- alpakaConfig.cmake | 21 +- cmake/addExecutable.cmake | 39 ++++ doc/Abstraction.md | 127 ----------- doc/Distinction.md | 125 ----------- doc/Threading.md | 57 ----- examples/mandelbrot/CMakeLists.txt | 42 +--- examples/mandelbrot/src/main.cpp | 6 +- examples/mandelbrot/src/main.cu | 22 -- examples/matMul/CMakeLists.txt | 42 +--- examples/matMul/src/main.cpp | 6 +- examples/matMul/src/main.cu | 22 -- examples/sharedMem/CMakeLists.txt | 42 +--- examples/sharedMem/src/main.cu | 22 -- examples/vectorAdd/CMakeLists.txt | 42 +--- examples/vectorAdd/src/main.cpp | 2 +- examples/vectorAdd/src/main.cu | 22 -- include/alpaka/acc/AccCpuFibers.hpp | 5 +- include/alpaka/acc/AccCpuOmp2Blocks.hpp | 11 +- include/alpaka/acc/AccCpuOmp2Threads.hpp | 5 +- include/alpaka/acc/AccCpuOmp4.hpp | 5 +- include/alpaka/acc/AccCpuSerial.hpp | 5 +- include/alpaka/acc/AccCpuThreads.hpp | 7 +- include/alpaka/acc/AccGpuCudaRt.hpp | 7 +- include/alpaka/atomic/Traits.hpp | 17 +- .../shared/BlockSharedAllocCudaBuiltIn.hpp | 2 +- include/alpaka/block/shared/Traits.hpp | 24 +- .../block/sync/BlockSyncCudaBuiltIn.hpp | 2 +- .../block/sync/BlockSyncFiberIdMapBarrier.hpp | 4 +- include/alpaka/block/sync/BlockSyncNoOp.hpp | 3 +- .../sync/BlockSyncThreadIdMapBarrier.hpp | 4 +- include/alpaka/core/ConcurrentExecPool.hpp | 5 +- include/alpaka/core/Cuda.hpp | 24 +- include/alpaka/core/Fold.hpp | 8 +- include/alpaka/core/NdLoop.hpp | 2 +- include/alpaka/core/OpenMp.hpp | 15 +- include/alpaka/core/Vectorize.hpp | 73 +++--- include/alpaka/dev/DevCpu.hpp | 7 +- include/alpaka/dev/Traits.hpp | 36 +-- include/alpaka/dev/cpu/SysInfo.hpp | 75 +++++-- include/alpaka/dim/DimArithmetic.hpp | 2 +- include/alpaka/elem/Traits.hpp | 72 ++++++ include/alpaka/event/EventCpu.hpp | 6 +- include/alpaka/event/Traits.hpp | 9 +- include/alpaka/exec/ExecCpuOmp2Blocks.hpp | 4 +- include/alpaka/exec/ExecCpuOmp4.hpp | 4 +- include/alpaka/exec/ExecGpuCudaRt.hpp | 6 +- include/alpaka/extent/Traits.hpp | 8 +- include/alpaka/idx/Traits.hpp | 30 +-- include/alpaka/idx/bt/IdxBtOmp.hpp | 2 +- include/alpaka/math/abs/Traits.hpp | 13 +- include/alpaka/math/acos/Traits.hpp | 13 +- include/alpaka/math/asin/Traits.hpp | 13 +- include/alpaka/math/atan/Traits.hpp | 13 +- include/alpaka/math/atan2/Traits.hpp | 17 +- include/alpaka/math/cbrt/Traits.hpp | 13 +- include/alpaka/math/ceil/Traits.hpp | 13 +- include/alpaka/math/cos/Traits.hpp | 13 +- include/alpaka/math/erf/Traits.hpp | 13 +- include/alpaka/math/exp/Traits.hpp | 13 +- include/alpaka/math/floor/Traits.hpp | 13 +- include/alpaka/math/fmod/Traits.hpp | 17 +- include/alpaka/math/log/Traits.hpp | 13 +- include/alpaka/math/max/Traits.hpp | 17 +- include/alpaka/math/min/Traits.hpp | 17 +- include/alpaka/math/pow/Traits.hpp | 17 +- include/alpaka/math/remainder/Traits.hpp | 17 +- include/alpaka/math/round/Traits.hpp | 39 ++-- include/alpaka/math/rsqrt/Traits.hpp | 13 +- include/alpaka/math/sin/Traits.hpp | 13 +- include/alpaka/math/sqrt/Traits.hpp | 13 +- include/alpaka/math/tan/Traits.hpp | 13 +- include/alpaka/math/trunc/Traits.hpp | 13 +- .../alpaka/math/trunc/TruncCudaBuiltIn.hpp | 5 +- include/alpaka/mem/buf/BufCpu.hpp | 36 +-- include/alpaka/mem/buf/BufCudaRt.hpp | 44 ++-- include/alpaka/mem/buf/BufPlainPtrWrapper.hpp | 55 ++--- include/alpaka/mem/buf/BufStdContainers.hpp | 99 ++++---- include/alpaka/mem/buf/Traits.hpp | 70 +++--- include/alpaka/mem/buf/cpu/Copy.hpp | 6 +- include/alpaka/mem/buf/cpu/Set.hpp | 6 +- include/alpaka/mem/buf/cuda/Copy.hpp | 18 +- include/alpaka/mem/buf/cuda/Set.hpp | 22 +- include/alpaka/mem/view/Traits.hpp | 86 ++++--- include/alpaka/mem/view/ViewBasic.hpp | 51 +++-- include/alpaka/offset/Traits.hpp | 8 +- include/alpaka/rand/RandCuRand.hpp | 50 ++--- include/alpaka/rand/RandStl.hpp | 56 +++-- include/alpaka/rand/Traits.hpp | 212 ++++++++++++++---- include/alpaka/size/Traits.hpp | 2 +- include/alpaka/stream/StreamCpuAsync.hpp | 1 + include/alpaka/stream/StreamCpuSync.hpp | 1 + include/alpaka/vec/Vec.hpp | 97 ++++---- include/alpaka/workdiv/Traits.hpp | 41 +++- 96 files changed, 1237 insertions(+), 1308 deletions(-) create mode 100644 cmake/addExecutable.cmake delete mode 100644 doc/Abstraction.md delete mode 100644 doc/Distinction.md delete mode 100644 doc/Threading.md delete mode 100644 examples/mandelbrot/src/main.cu delete mode 100644 examples/matMul/src/main.cu delete mode 100644 examples/sharedMem/src/main.cu delete mode 100644 examples/vectorAdd/src/main.cu create mode 100644 include/alpaka/elem/Traits.hpp diff --git a/.gitignore b/.gitignore index d943d9a..921e1af 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,16 @@ !/doc/doxygen/Doxyfile /doc/latex/* **/build + +# tmp files +*~ + +# netbeans project files +/nbproject/ + +# Code::Blocks project files +/*.cbp +/*.layout + +# original backup files +*.orig diff --git a/.travis.yml b/.travis.yml index d7d4bb3..3601c49 100644 --- a/.travis.yml +++ b/.travis.yml @@ -45,7 +45,7 @@ compiler: # [clang++] ALPAKA_CLANG_VER : {3.5, 3.6, 3.7} # [g++] ALPAKA_GCC_VER : {4.9, 5} # ALPAKA_BOOST_BRANCH : {boost-1.56.0, boost-1.57.0, boost-1.58.0, boost-1.59.0, develop} -# ALPAKA_CMAKE_VER : {2.8.12.2, 3.0.2, 3.1.3, 3.2.2, 3.3.1} +# ALPAKA_CMAKE_VER : {3.3.0, 3.3.1, 3.3.2} # CMAKE_BUILD_TYPE : {Debug, Release} # ALPAKA_DEBUG : {0, 1, 2} # ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE : {ON, OFF} @@ -73,28 +73,28 @@ env: - ALPAKA_CLANG_LIBSTDCPP_VERSION=4.9 matrix: - - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=4 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.56.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON - - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=3.2.2 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=3 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.57.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON - - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.2.2 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=2 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.58.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF - - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.1.3 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.59.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF - - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF - - - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.58.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON - - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=3.2.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=3 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.59.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON - - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.0.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=2 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.56.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF - - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=2.8.12.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=1 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.57.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF - - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF + - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=4 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.56.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON + - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=3 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.57.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON + - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.0 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=2 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.58.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF + - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.59.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF + - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF + + - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.0 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.58.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON + - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=3 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.59.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON + - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=2 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.56.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF + - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=3.3.0 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=1 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.57.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF + - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF matrix: allow_failures: - compiler: gcc - env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF + env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF - compiler: gcc - env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF + env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF - compiler: clang - env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF + env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Debug OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF - compiler: clang - env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF + env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5 ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop ALPAKA_ACC_GPU_CUDA_ENABLE=OFF - os: osx branches: @@ -314,25 +314,11 @@ install: - echo ${ALPAKA_CMAKE_VER_MAJOR} - ALPAKA_CMAKE_VER_MINOR=${ALPAKA_CMAKE_VER:2:1} - echo ${ALPAKA_CMAKE_VER_MINOR} - # NOTE: Older versions have to be built from source because CMake does not provide older x64 binaries. - - if (( (( ${ALPAKA_CMAKE_VER_MAJOR} < 3 )) || ( (( ${ALPAKA_CMAKE_VER_MAJOR} == 3 )) && (( ${ALPAKA_CMAKE_VER_MINOR} < 1 )) ) )) - ;then - ALPAKA_CMAKE_PKG_FILE_NAME=cmake-${ALPAKA_CMAKE_VER} - && wget http://www.cmake.org/files/v${ALPAKA_CMAKE_VER_MAJOR}.${ALPAKA_CMAKE_VER_MINOR}/${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz - && tar -xzf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz - && cd cmake-${ALPAKA_CMAKE_VER} - && ./bootstrap --prefix=/usr - && make - && sudo make install - && cd ../ - && sudo rm -rf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz ${ALPAKA_CMAKE_PKG_FILE_NAME} - ;else - ALPAKA_CMAKE_PKG_FILE_NAME=cmake-${ALPAKA_CMAKE_VER}-Linux-x86_64 - && wget http://www.cmake.org/files/v${ALPAKA_CMAKE_VER_MAJOR}.${ALPAKA_CMAKE_VER_MINOR}/${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz - && tar -xzf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz - && sudo cp -fR cmake-${ALPAKA_CMAKE_VER}-Linux-x86_64/* /usr - && sudo rm -rf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz ${ALPAKA_CMAKE_PKG_FILE_NAME} - ;fi + - ALPAKA_CMAKE_PKG_FILE_NAME=cmake-${ALPAKA_CMAKE_VER}-Linux-x86_64 + - wget http://www.cmake.org/files/v${ALPAKA_CMAKE_VER_MAJOR}.${ALPAKA_CMAKE_VER_MINOR}/${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz + && tar -xzf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz + && sudo cp -fR cmake-${ALPAKA_CMAKE_VER}-Linux-x86_64/* /usr + && sudo rm -rf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz ${ALPAKA_CMAKE_PKG_FILE_NAME} #------------------------------------------------------------------------------- # Clone boost. diff --git a/README.md b/README.md index 55658ea..e5421a2 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Accelerator Back-ends |Serial|n/a|Host CPU (single core)|sequential|sequential (only 1 thread per block)| |OpenMP 2.0 blocks|OpenMP 2.0|Host CPU (multi core)|parallel (preemptive multitasking)|sequential (only 1 thread per block)| |OpenMP 2.0 threads|OpenMP 2.0|Host CPU (multi core)|sequential|parallel (preemptive multitasking)| -|OpenMP 4.0|OpenMP 4.0|Host CPU (multi core)|parallel (undefined)|parallel (preemptive multitasking)| +|OpenMP 4.0 (CPU)|OpenMP 4.0|Host CPU (multi core)|parallel (undefined)|parallel (preemptive multitasking)| | std::thread | std::thread |Host CPU (multi core)|sequential|parallel (preemptive multitasking)| | Boost.Fiber | boost::fibers::fiber |Host CPU (single core)|sequential|parallel (cooperative multitasking)| |CUDA 7.0|CUDA 7.0|NVIDIA GPUs SM 2.0+|parallel (undefined)|parallel (lock-step within warps)| @@ -55,15 +55,15 @@ Supported Compilers This library uses C++11 (or newer when available). -|Accelerator Back-end|gcc 4.9.2|gcc 5.2|clang 3.5/3.6|clang 3.7|MSVC 2015|icc 15.0+ (untested)| -|---|---|---|---|---|---|---| -|Serial|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| -|OpenMP 2.0 blocks|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:white_check_mark:| -|OpenMP 2.0 threads|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:white_check_mark:| -|OpenMP 4.0|:white_check_mark:|:white_check_mark:|:x:|:x:|:x:|:white_check_mark:| -| std::thread |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| -| Boost.Fiber |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| -|CUDA 7.0|:white_check_mark:|:x:|:x:|:x:|:x:|:white_check_mark:| +|Accelerator Back-end|gcc 4.9.2|gcc 5.2|clang 3.5/3.6|clang 3.7|MSVC 2015| +|---|---|---|---|---|---| +|Serial|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| +|OpenMP 2.0 blocks|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:| +|OpenMP 2.0 threads|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:| +|OpenMP 4.0 (CPU)|:white_check_mark:|:white_check_mark:|:x:|:x:|:x:| +| std::thread |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| +| Boost.Fiber |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:| +|CUDA 7.0|:white_check_mark:|:x:|:x:|:x:|:x:| **NOTE**: :bangbang: Currently the *CUDA accelerator back-end* can not be enabled together with the *std::thread accelerator back-end* or the *Boost.Fiber accelerator back-end* due to bugs in the NVIDIA nvcc compiler :bangbang: @@ -91,17 +91,14 @@ Usage ----- The library is header only so nothing has to be build. -Only the include path (`-I` or `export CPLUS_INCLUDE_PATH=`) has to be set to `/include/`. -This allows to include the whole alpaka library with: `#include ` +CMake 3.3.0+ is required to provide the correct defines and include paths. +Just call `ALPAKA_ADD_EXECUTABLE` instead of `CUDA_ADD_EXECUTABLE` or `ADD_EXECUTABLE` and the difficulties of the CUDA nvcc compier in handling `.cu` and `.cpp` files is automatically taken care of. +Examples how to utilize alpaka within CMake can be found in the `examples` folder. +Source files do not need any special file ending. +The whole alpaka library can be included with: `#include ` Code not intended to be utilized by users is hidden in the `detail` namespace. -If you are building with the *CUDA accelerator back-end* enabled, your source files are required to have the ending `.cu` to comply with the nvcc (NVIDIA CUDA C++ compiler) rules for code files using CUDA. -When the *CUDA accelerator back-end* is disabled, this is not required and a `.cpp` extension is enough. -To allow both use-cases, it is desirable to have both, a `.cpp` file with the implementation and a `.cu` file containing only `#include ` to forward to the implementation. -The build system then has to use the `.cu` files when the *CUDA accelerator back-end* is enabled and the `.cpp` files else. -Examples how to do this with CMake can be found in the `examples` folder. - Authors ------- diff --git a/alpakaConfig.cmake b/alpakaConfig.cmake index f6f59b1..b97af32 100644 --- a/alpakaConfig.cmake +++ b/alpakaConfig.cmake @@ -44,6 +44,7 @@ UNSET(_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC) UNSET(_ALPAKA_LINK_LIBRARIES_PUBLIC) UNSET(_ALPAKA_LINK_FLAGS_PUBLIC) UNSET(_ALPAKA_COMMON_FILE) +UNSET(_ALPAKA_ADD_EXECUTABLE_FILE) UNSET(_ALPAKA_FILES_HEADER) UNSET(_ALPAKA_FILES_SOURCE) UNSET(_ALPAKA_FILES_OTHER) @@ -75,6 +76,10 @@ LIST(APPEND CMAKE_MODULE_PATH "${_ALPAKA_ROOT_DIR}/cmake/modules") SET(_ALPAKA_COMMON_FILE "${_ALPAKA_ROOT_DIR}/cmake/common.cmake") INCLUDE("${_ALPAKA_COMMON_FILE}") +# Add ALPAKA_ADD_EXECUTABLE function. +SET(_ALPAKA_ADD_EXECUTABLE_FILE "${_ALPAKA_ROOT_DIR}/cmake/addExecutable.cmake") +INCLUDE("${_ALPAKA_ADD_EXECUTABLE_FILE}") + #------------------------------------------------------------------------------- # Options. #------------------------------------------------------------------------------- @@ -83,7 +88,7 @@ OPTION(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE "Enable the threads CPU block threa OPTION(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE "Enable the fibers CPU block thread accelerator" ON) OPTION(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE "Enable the OpenMP 2.0 CPU grid block accelerator" ON) OPTION(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE "Enable the OpenMP 2.0 CPU block thread accelerator" ON) -OPTION(ALPAKA_ACC_CPU_BT_OMP4_ENABLE "Enable the OpenMP 4.0 CPU block and block thread accelerator" ON) +OPTION(ALPAKA_ACC_CPU_BT_OMP4_ENABLE "Enable the OpenMP 4.0 CPU block and block thread accelerator" OFF) OPTION(ALPAKA_ACC_GPU_CUDA_ENABLE "Enable the CUDA GPU accelerator" ON) # Drop-down combo box in cmake-gui. @@ -185,13 +190,12 @@ IF(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OR A ELSE() SET(_ALPAKA_COMPILE_OPTIONS_PUBLIC ${OpenMP_CXX_FLAGS}) + IF(NOT MSVC) + SET(_ALPAKA_LINK_FLAGS_PUBLIC ${OpenMP_CXX_FLAGS}) + ENDIF() # CUDA requires some special handling IF(ALPAKA_ACC_GPU_CUDA_ENABLE) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - ELSE() - IF(NOT MSVC) - SET(_ALPAKA_LINK_FLAGS_PUBLIC ${OpenMP_CXX_FLAGS}) - ENDIF() ENDIF() ENDIF() ENDIF() @@ -356,7 +360,7 @@ SET(_ALPAKA_SUFFIXED_INCLUDE_DIR "${_ALPAKA_INCLUDE_DIRECTORY}/alpaka") SET(_ALPAKA_LINK_LIBRARY) LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "${_ALPAKA_LINK_LIBRARY}") -SET(_ALPAKA_FILES_OTHER "${_ALPAKA_ROOT_DIR}/alpakaConfig.cmake" "${_ALPAKA_COMMON_FILE}" "${_ALPAKA_ROOT_DIR}/.travis.yml" "${_ALPAKA_ROOT_DIR}/README.md") +SET(_ALPAKA_FILES_OTHER "${_ALPAKA_ROOT_DIR}/alpakaConfig.cmake" "${_ALPAKA_ADD_EXECUTABLE_FILE}" "${_ALPAKA_COMMON_FILE}" "${_ALPAKA_ROOT_DIR}/.travis.yml" "${_ALPAKA_ROOT_DIR}/README.md") # Add all the source and include files in all recursive subdirectories and group them accordingly. append_recursive_files_add_to_src_group("${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "hpp" "_ALPAKA_FILES_HEADER") @@ -453,7 +457,8 @@ LIST(APPEND alpaka_DEFINITIONS ${_ALPAKA_COMPILE_OPTIONS_PUBLIC}) SET(alpaka_INCLUDE_DIR ${_ALPAKA_INCLUDE_DIRECTORY}) SET(alpaka_INCLUDE_DIRS ${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC}) SET(alpaka_LIBRARY ${_ALPAKA_LINK_LIBRARY}) -SET(alpaka_LIBRARIES ${_ALPAKA_LINK_LIBRARIES_PUBLIC}) +SET(alpaka_LIBRARIES ${_ALPAKA_LINK_FLAGS_PUBLIC}) +LIST(APPEND alpaka_LIBRARIES ${_ALPAKA_LINK_LIBRARIES_PUBLIC}) #------------------------------------------------------------------------------- # Print the return values. @@ -491,6 +496,7 @@ IF(NOT _ALPAKA_FOUND) UNSET(_ALPAKA_LINK_LIBRARIES_PUBLIC) UNSET(_ALPAKA_LINK_FLAGS_PUBLIC) UNSET(_ALPAKA_COMMON_FILE) + UNSET(_ALPAKA_ADD_EXECUTABLE_FILE) UNSET(_ALPAKA_FILES_HEADER) UNSET(_ALPAKA_FILES_SOURCE) UNSET(_ALPAKA_FILES_OTHER) @@ -512,6 +518,7 @@ ELSE() _ALPAKA_LINK_LIBRARIES_PUBLIC _ALPAKA_LINK_FLAGS_PUBLIC _ALPAKA_COMMON_FILE + _ALPAKA_ADD_EXECUTABLE_FILE _ALPAKA_FILES_HEADER _ALPAKA_FILES_SOURCE _ALPAKA_FILES_OTHER diff --git a/cmake/addExecutable.cmake b/cmake/addExecutable.cmake new file mode 100644 index 0000000..90a5fa4 --- /dev/null +++ b/cmake/addExecutable.cmake @@ -0,0 +1,39 @@ +################################################################################ +# Copyright 2015 Benjamin Worpitz +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER +# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE +# USE OR PERFORMANCE OF THIS SOFTWARE. +################################################################################ + +# CUDA_SOURCE_PROPERTY_FORMAT is only supported starting from 3.3.0. +CMAKE_MINIMUM_REQUIRED(VERSION 3.3.0) + +#------------------------------------------------------------------------------ +# Calls CUDA_ADD_EXECUTABLE or ADD_EXECUTABLE depending on the enabled alpaka accelerators. +#------------------------------------------------------------------------------ +FUNCTION(ALPAKA_ADD_EXECUTABLE In_Name) + IF(ALPAKA_ACC_GPU_CUDA_ENABLE) + FOREACH(_file ${ARGN}) + IF((${_file} MATCHES "\\.cpp$") OR (${_file} MATCHES "\\.cxx$")) + SET_SOURCE_FILES_PROPERTIES(${_file} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) + ENDIF() + ENDFOREACH() + CMAKE_POLICY(SET CMP0023 OLD) # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords. + CUDA_ADD_EXECUTABLE( + ${In_Name} + ${ARGN}) + ELSE() + ADD_EXECUTABLE( + ${In_Name} + ${ARGN}) + ENDIF() +ENDFUNCTION() diff --git a/doc/Abstraction.md b/doc/Abstraction.md deleted file mode 100644 index 770b021..0000000 --- a/doc/Abstraction.md +++ /dev/null @@ -1,127 +0,0 @@ -Abstraction -=========== - -Parallelism and memory hierarchies at all levels need to be exploited in order to achieve performance portability across various types of accelerators. - -On the hardware side we have nodes with multiple sockets/processors extended by accelerators like GPUs or Intel Xeon Phi each with their own processing units. -Within a CPU or a Intel Xeon Phi there are cores with hyper-threads and vector units, within a GPU there are many small cores. -Each of those entities has access to different memories in the hierarchy. -For example each socket/processor manages its RAM and the cores additionally have access to L3, L2 and L1 caches. -On a GPU there is global, constant and shared memory. -**alpaka** is designed to abstract from these differences without sacrificing speed by defining a domain decomposition for the computation domain. -This domain decomposition is abstract enough to map optimally to all (currently known) accelerators. -The **alpaka** library hides the mapping to the underlying hardware together with the execution primitives like kernel-threads and fibers allowing accelerator independent performance and portability. - -A process running on a multi-socket node is the largest entity within **alpaka**. -The library itself only abstracts the task and data parallel execution on the process/node level and down. -It does not provide any primitives for inter-node communication but such libraries can build upon **alpaka**. -The process always has a main thread and is by definition running on the host. -It can access the host memory and various accelerator devices. -Such accelerator devices can be GPUs, Intel Xeon Phi, the host itself or other hardware. -Thus the host not necessarily has to be different from the accelerator device used to compute on. -For example a Intel Xeon Phi simultaneously can be the host and the accelerator device. -**alpaka** also allows the parallel execution on nodes without any accelerator hardware. -The node the process is running on can itself be used as an accelerator device. - -**alpaka** can be used to offload the parallel execution of task and data parallel work simultaneously onto different accelerator devices. - - -Task Parallelism ----------------- - -With the concept of streams known from CUDA, where each stream is a queue of sequential tasks, but streams can be processed in parallel, **alpaka** provides an implementation of task parallelism. -Events that can be enqueued to the streams enhance this basic task parallelism by allowing synchronization between different streams, devices or host threads. - - -Data Parallelism ----------------- - -The main part of **alpaka** is the way it abstracts data parallelism. -The work is divided into a 1 to 3 dimensional grid of uniform threads appropriate to the problem at hand. -The uniform function executed by each of the threads is called a kernel. - -The threads are organized hierarchically and can access local memory at each of the hierarchy levels. -All these higher levels are hidden in the internals of the accelerator implementations and their execution order can not be controlled. - -The abstraction used extends the CUDA grid-blocks-threads division strategy explained below by further allowing to facilitate vectorization. -This extended *redundant hierarchical parallelism* scheme is discussed in the paper [The Future of Accelerator Programming: Abstraction, Performance or Can We Have Both?](http://dx.doi.org/10.1109/ICPADS.2013.76) ([PDF](http://olab.is.s.u-tokyo.ac.jp/~kamil.rocki/rocki_burtscher_sac14.pdf)). - -### Thread Hierarchy - -#### Grid - -The whole grid consists of uniform threads each executing the same kernel. -By default threads do not have a cheap way to communicate safely within the whole grid. -This forces decoupling of threads and avoids global interaction and global dependencies. -This independence allows scattering of work blocks within the grid of worker threads and their independent execution on separate processing units. -Utilizing this property on the higher level allows applications to scale very well. -All threads within the grid can access a global memory. - -#### Block - -A block is a group of threads. -The whole grid is subdivided into equal sized blocks. -Threads within a block can synchronize and have a fast but small shared memory. -This allows for fast interaction on a local scale. - -**TODO**: Why blocks? - -#### Thread - -Each thread executes the same kernel. -The only difference is the index into the grid which allows each thread to compute a different part of the solution. - -**TODO**: more - -#### Vectorization (SIMD) - -To use the maximum available computing power of a x86 core the computation has to exploit the vector registers. - -Because of the x86 SIMD intrinsics (``) not being portable, we have to rely on the loop vectorization capabilities of the compiler. - -The best solution to vectorization would be one, where the user does not have to do anything. -This is not possible because the smallest unit the user supplies is a kernel which is executed in threads which can synchronize. -It is not possible to hide the vectorization by starting a kernel-thread for e.g. each 4th thread in a block and then looping over the 4 entries. -This would prohibit the synchronization between these threads. -By executing 4 fibers inside such a vectorization kernel-thread we would allow synchronization again but prevent the loop vectorizer from working. - -The only possible usage of vectorization is one where we create a kernel-thread for e.g. each 4th thread in a block but do not loop over the 4 threads ourself but rely on the user to implement loops that can be vectorized safely. - -### Memory Hierarchy - -#### Global Memory - -The global memory can be accessed from every thread executing on an accelerator. -This is typically the largest but also the slowest memory available. - -#### Shared Memory - -Each block has its own shared memory. -This memory can only be accessed by threads within the same block and gets discarded after the complete block finished its calculation. -This memory is typically very fast but also very small. -Sharing has to be done explicitly. -No variables are shared between kernels by default. - -#### Registers - -This memory is local to each thread. -All variables with default scope defined inside a kernel are automatically saved in registers and not shared automatically. - -**TODO**: Constant Memory, Texture Memory? - -Mapping *Redundant Hierarchical Parallelism* onto Hardware --------------------------------------------------------- - -By providing an accelerator independent interface for kernels, their execution and memory access at different hierarchy levels **alpaka** allows the user to write accelerator independent code that does not neglect performance. - -The hard part, the mapping of the decomposition to the execution environment is handled by the **alpaka** library. -The decomposition of the computation in use can not be mapped one to one to any existing hardware. -GPUs do not have vector registers where multiple values of type `int` or `float` can be manipulated by one instruction. -Newer versions of CUDA only implement basic SIMD instructions "on pairs of 16-bit values and quads of 8-bit values". -They are described in the documentation of the [NVIDIA GPU instruction set architecture](http://docs.nvidia.com/cuda/pdf/ptx_isa_3.2.pdf) chapter 8.7.13 but are only of any use in very special problem domains. -So the vector level is omitted on the CUDA accelerator. -CPUs in turn are not (currently) capable of running thousands of threads concurrently. -Furthermore CPUs do not have an equivalently fast inter-thread synchronization and shared memory access as GPUs do. - -A major point of the *redundant hierarchical parallelism* abstraction is to ignore specific unsupported levels and utilize only the ones supported on a specific accelerator. -Furthermore the hierarchy allows a mapping to various current and future accelerators in a variety of ways enabling optimal usage of the underlying compute and memory capabilities. diff --git a/doc/Distinction.md b/doc/Distinction.md deleted file mode 100644 index 0cb85f7..0000000 --- a/doc/Distinction.md +++ /dev/null @@ -1,125 +0,0 @@ -Distinction -=========== - -There are multiple other projects which pretend to achieve full performance portability. -Many do not satisfy the requirement for full c++ support that is needed because of the usage of template meta-programming for method specialization to avoid runtime-polymorphism. - - -CUDA ------- - -### Positive -* Full control over memory, caches and execution. - -### Negative -* Not platform independent: NVIDIA GPUs are required. No execution on other manufacturers GPUs or even standard CPUs. -* Language extension: The nvcc compiler driver is required to understand the language constructs that are used inside the kernel methods. -* Not compiler independent: The back-end compiler has to be one of the supported ones. - - -OpenMP --------- - -### Negative -* No good way to control memory hierarchies and thread interaction (shared memory). -* Only compiler hints, no direct control over result. - - -OpenACC ---------- - -### Positive -* Can generate x86 and CUDA code from C++ code. - -### Negative -* Compiler dependent (currently not supported by many compilers and the PGI compiler is not actively enough developed and does not have a good C++ support). -* Only compiler hints, no direct control over result. - - -OpenCl --------- - -### Positive -* Hardware independent (CPUs and GPUs of nearly all vendors). - -### Negative -* No full C++ support. -* Runtime compilation -> No direct inclusion into the source (syntax highlighting?, static analysis?, debugging?, templated kernels?) - - -C++ AMP ---------- - -### Positive -* Open specification -* Annotated C++ code can run on multiple accelerators. - -### Negative -* Language extension -* Compiler dependent (currently not supported by many compilers) - - -PGI CUDA-X86 -------------- -When run on x86-based systems without a GPU, PGI CUDA C applications use multiple cores and the streaming SIMD (Single Instruction Multiple Data) capabilities of Intel and AMD CPUs for parallelvectorized execution. -At run-time, CUDA C programs compiled for x86 executes each CUDA thread block using a single host core, eliminating synchronization where possible. - -### Positive -* Lets you write standard CUDA code and execute it on x86. - -### Negative -* Not actively developed. - - -LLVM backends (PTX, R600) ---------------------------- - -### Negative -* Those back-ends never got really mature and up-to-date. - - -KOKKOS -------------- -See [here](https://www.xsede.org/documents/271087/586927/Edwards-2013-XSCALE13-Kokkos.pdf) -[here](http://trilinos.org/oldsite/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf) -[here](http://on-demand.gputechconf.com/supercomputing/2013/presentation/SC3103_Towards-Performance-Portable-Applications-Kokkos.pdf) -and [here](http://dx.doi.org/10.3233/SPR-2012-0343). -Source is available [here](https://github.com/trilinos/trilinos/tree/master/packages/kokkos). -The project is similar to *alpaka* in the way it abstracts the kernel as templated function object. -It provides parallel_for, parallel_reduce, etc. similar to thrust. - -### Positive -* Offers buffer views with a neutral indexing scheme that maps to the underlying hardware (row/col-major, blocking, ...). - -### Negative -* License. -* The parameters are required to be given to the function object constructor coupling algorithm and data together. -* The implementation of accelerator methods (atomics, ...) is selected via macros defined by the nvcc compiler. So there is no way to select between different x86 implementations for different x86 accelerators. - - -Phalanx ------------ -See [here](http://www.mgarland.org/files/papers/phalanx-sc12-preprint.pdf). -It is very similar to *alpaka* in the way it abstracts the accelerators. - -### Positive -* C++ Interface provides CUDA, OpenMP, and GASNet back-ends - -### Negative -* License. -* No official source repository available? - - -thrust (bulk) -------------- -*... - - -Intel TBB ---------- -*... - - -Intel Cilk Plus ---------------- -*... diff --git a/doc/Threading.md b/doc/Threading.md deleted file mode 100644 index 9063f01..0000000 --- a/doc/Threading.md +++ /dev/null @@ -1,57 +0,0 @@ -Threading -========= - -There are multiple possible ways to implement threading on non CUDA accelerators. - - -Serial ------- - -### Positive -* Easy implementation (no sync). - -### Negative -* Restricts block size to 1*1*1. Due to the threads in a block being able to synchronize they have to be executed in parallel. This can not be faked. - - -OpenMP ------- - -### Positive -* Lightweight threads. - -### Negative -* Interaction with std::threads/pthreads unspecified. -* Thread count limit platform dependent (some runtimes allow oversubscription). No correct way to read the maximum number of threads supported. -* Hard to control thread affinity. -* Non-deterministical thread change. - - -Kernel-Threads --------------- - -std::thread, pthread, ... - -### Positive -* Thread affinity is controllable (platform dependent implementation). - -### Negative -* High cost of thread creation and thread change. -* Non-deterministical thread change. - - -Fibers ------- - -A fiber is a user-space thread with cooperative context-switch. -They are implemented on top of coroutines. A coroutine is a function that can be suspended and resumed but has not necessarily a stack. -boost::fiber = stackful coroutine + scheduler + sync (no wait → next fiber in thread) -C++17: N3858, N3985, (N4134 stackless coroutines 'await') - -### Positive -* Less cost of creation. -* Less cost of thread change. -* Deterministic thread change. -* No locks at all (Always only one active fiber per kernel-thread). -* Prevents false sharing because all fibers working on nearby values are in the same block and can be executed by the same kernel-thread on the same core. -* Prevents cache thrashing (threads on the same core compete for the same cache line) by using a user-level scheduler for the fibers that can invocate them in order of access of the memory. diff --git a/examples/mandelbrot/CMakeLists.txt b/examples/mandelbrot/CMakeLists.txt index 1cfc4db..2e98a95 100644 --- a/examples/mandelbrot/CMakeLists.txt +++ b/examples/mandelbrot/CMakeLists.txt @@ -63,41 +63,15 @@ append_recursive_files_add_to_src_group("${_SUFFIXED_INCLUDE_DIR}" "" "hpp" _FIL # Add all the source files in all recursive subdirectories and group them accordingly. append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX) -# Add all the CUDA source files in all recursive subdirectories and group them accordingly. -append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cu" _FILES_SOURCE_CU) - +INCLUDE_DIRECTORIES( + ${_INCLUDE_DIRECTORIES_PRIVATE} + ${alpaka_INCLUDE_DIRS}) +ADD_DEFINITIONS( + ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS}) # Always add all files to the target executable build call to add them to the build project. -IF(ALPAKA_ACC_GPU_CUDA_ENABLE) - # Force the .cpp files to be recognized as headers and not be compiled so there wont be a second entry point. - SET_SOURCE_FILES_PROPERTIES( - ${_FILES_SOURCE_CXX} - PROPERTIES HEADER_FILE_ONLY TRUE) - # CUDA does not work well with the much better target dependent TARGET_XXX commands but requires the settings to be available globally: https://www.cmake.org/Bug/view.php?id=14201&nbn=1 - INCLUDE_DIRECTORIES( - ${_INCLUDE_DIRECTORIES_PRIVATE} - ${alpaka_INCLUDE_DIRS}) - ADD_DEFINITIONS( - ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS}) - CMAKE_POLICY(SET CMP0023 OLD) # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords. - CUDA_ADD_EXECUTABLE( - "mandelbrot" - ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU}) -ELSE() - # Force the .cu files to be recognized as headers and not be compiled so there wont be a second entry point. - SET_SOURCE_FILES_PROPERTIES( - ${_FILES_SOURCE_CU} - PROPERTIES HEADER_FILE_ONLY TRUE) - ADD_EXECUTABLE( - "mandelbrot" - ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU}) - TARGET_INCLUDE_DIRECTORIES( - "mandelbrot" - PRIVATE ${_INCLUDE_DIRECTORIES_PRIVATE}) - TARGET_COMPILE_OPTIONS( - "mandelbrot" - PRIVATE ${ALPAKA_DEV_COMPILE_OPTIONS}) -ENDIF() - +ALPAKA_ADD_EXECUTABLE( + "mandelbrot" + ${_FILES_HEADER} ${_FILES_SOURCE_CXX}) # Set the link libraries for this library (adds libs, include directories, defines and compile options). TARGET_LINK_LIBRARIES( "mandelbrot" diff --git a/examples/mandelbrot/src/main.cpp b/examples/mandelbrot/src/main.cpp index 2986fbd..b3e117d 100644 --- a/examples/mandelbrot/src/main.cpp +++ b/examples/mandelbrot/src/main.cpp @@ -277,12 +277,12 @@ auto writeTgaColorImage( alpaka::dim::Dim::value == 2, "The buffer has to be 2 dimensional!"); static_assert( - std::is_integral>::value, + std::is_integral>::value, "The buffer element type has to be integral!"); // The width of the input buffer is in input elements. auto const bufWidthElems(alpaka::extent::getWidth(bufRgba)); - auto const bufWidthBytes(bufWidthElems * sizeof(alpaka::mem::view::Elem)); + auto const bufWidthBytes(bufWidthElems * sizeof(alpaka::elem::Elem)); // The row width in bytes has to be dividable by 4 Bytes (RGBA). assert(bufWidthBytes % sizeof(std::uint32_t) == 0); // The number of colors in a row. @@ -378,7 +378,7 @@ struct MandelbrotKernelTester // Get a stream on this device. alpaka::examples::Stream> stream(devAcc); - alpaka::Vec2 const v2uiExtents( + alpaka::Vec, TSize> const v2uiExtents( static_cast(numRows), static_cast(numCols)); diff --git a/examples/mandelbrot/src/main.cu b/examples/mandelbrot/src/main.cu deleted file mode 100644 index 428b7f4..0000000 --- a/examples/mandelbrot/src/main.cu +++ /dev/null @@ -1,22 +0,0 @@ -/** -* \file -* Copyright 2014-2015 Benjamin Worpitz -* -* This file is part of alpaka. -* -* alpaka is free software: you can redistribute it and/or modify -* it under the terms of the GNU Lesser General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. -* -* alpaka is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Lesser General Public License for more details. -* -* You should have received a copy of the GNU Lesser General Public License -* along with alpaka. -* If not, see . -*/ - -#include "main.cpp" diff --git a/examples/matMul/CMakeLists.txt b/examples/matMul/CMakeLists.txt index e952292..d87e3e1 100644 --- a/examples/matMul/CMakeLists.txt +++ b/examples/matMul/CMakeLists.txt @@ -63,41 +63,15 @@ append_recursive_files_add_to_src_group("${_SUFFIXED_INCLUDE_DIR}" "" "hpp" _FIL # Add all the source files in all recursive subdirectories and group them accordingly. append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX) -# Add all the CUDA source files in all recursive subdirectories and group them accordingly. -append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cu" _FILES_SOURCE_CU) - +INCLUDE_DIRECTORIES( + ${_INCLUDE_DIRECTORIES_PRIVATE} + ${alpaka_INCLUDE_DIRS}) +ADD_DEFINITIONS( + ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS}) # Always add all files to the target executable build call to add them to the build project. -IF(ALPAKA_ACC_GPU_CUDA_ENABLE) - # Force the .cpp files to be recognized as headers and not be compiled so there wont be a second entry point. - SET_SOURCE_FILES_PROPERTIES( - ${_FILES_SOURCE_CXX} - PROPERTIES HEADER_FILE_ONLY TRUE) - # CUDA does not work well with the much better target dependent TARGET_XXX commands but requires the settings to be available globally: https://www.cmake.org/Bug/view.php?id=14201&nbn=1 - INCLUDE_DIRECTORIES( - ${_INCLUDE_DIRECTORIES_PRIVATE} - ${alpaka_INCLUDE_DIRS}) - ADD_DEFINITIONS( - ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS}) - CMAKE_POLICY(SET CMP0023 OLD) # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords. - CUDA_ADD_EXECUTABLE( - "matMul" - ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU}) -ELSE() - # Force the .cu files to be recognized as headers and not be compiled so there wont be a second entry point. - SET_SOURCE_FILES_PROPERTIES( - ${_FILES_SOURCE_CU} - PROPERTIES HEADER_FILE_ONLY TRUE) - ADD_EXECUTABLE( - "matMul" - ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU}) - TARGET_INCLUDE_DIRECTORIES( - "matMul" - PRIVATE ${_INCLUDE_DIRECTORIES_PRIVATE}) - TARGET_COMPILE_OPTIONS( - "matMul" - PRIVATE ${ALPAKA_DEV_COMPILE_OPTIONS}) -ENDIF() - +ALPAKA_ADD_EXECUTABLE( + "matMul" + ${_FILES_HEADER} ${_FILES_SOURCE_CXX}) # Set the link libraries for this library (adds libs, include directories, defines and compile options). TARGET_LINK_LIBRARIES( "matMul" diff --git a/examples/matMul/src/main.cpp b/examples/matMul/src/main.cpp index 7d1191c..ed99e8e 100644 --- a/examples/matMul/src/main.cpp +++ b/examples/matMul/src/main.cpp @@ -251,16 +251,16 @@ struct MatMulTester // Get a stream on the accelerator device. alpaka::examples::Stream> streamAcc(devAcc); - alpaka::Vec2 const v2uiExtentsA( + alpaka::Vec, TSize> const v2uiExtentsA( static_cast(m), static_cast(k)); - alpaka::Vec2 const v2uiExtentsB( + alpaka::Vec, TSize> const v2uiExtentsB( static_cast(k), static_cast(n)); // Result matrix is MxN. We create one worker per result matrix cell. - alpaka::Vec2 const v2uiExtentsC( + alpaka::Vec, TSize> const v2uiExtentsC( static_cast(m), static_cast(n)); diff --git a/examples/matMul/src/main.cu b/examples/matMul/src/main.cu deleted file mode 100644 index 428b7f4..0000000 --- a/examples/matMul/src/main.cu +++ /dev/null @@ -1,22 +0,0 @@ -/** -* \file -* Copyright 2014-2015 Benjamin Worpitz -* -* This file is part of alpaka. -* -* alpaka is free software: you can redistribute it and/or modify -* it under the terms of the GNU Lesser General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. -* -* alpaka is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Lesser General Public License for more details. -* -* You should have received a copy of the GNU Lesser General Public License -* along with alpaka. -* If not, see . -*/ - -#include "main.cpp" diff --git a/examples/sharedMem/CMakeLists.txt b/examples/sharedMem/CMakeLists.txt index 9cf17c1..9250f01 100644 --- a/examples/sharedMem/CMakeLists.txt +++ b/examples/sharedMem/CMakeLists.txt @@ -63,41 +63,15 @@ append_recursive_files_add_to_src_group("${_SUFFIXED_INCLUDE_DIR}" "" "hpp" _FIL # Add all the source files in all recursive subdirectories and group them accordingly. append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX) -# Add all the CUDA source files in all recursive subdirectories and group them accordingly. -append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cu" _FILES_SOURCE_CU) - +INCLUDE_DIRECTORIES( + ${_INCLUDE_DIRECTORIES_PRIVATE} + ${alpaka_INCLUDE_DIRS}) +ADD_DEFINITIONS( + ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS}) # Always add all files to the target executable build call to add them to the build project. -IF(ALPAKA_ACC_GPU_CUDA_ENABLE) - # Force the .cpp files to be recognized as headers and not be compiled so there wont be a second entry point. - SET_SOURCE_FILES_PROPERTIES( - ${_FILES_SOURCE_CXX} - PROPERTIES HEADER_FILE_ONLY TRUE) - # CUDA does not work well with the much better target dependent TARGET_XXX commands but requires the settings to be available globally: https://www.cmake.org/Bug/view.php?id=14201&nbn=1 - INCLUDE_DIRECTORIES( - ${_INCLUDE_DIRECTORIES_PRIVATE} - ${alpaka_INCLUDE_DIRS}) - ADD_DEFINITIONS( - ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS}) - CMAKE_POLICY(SET CMP0023 OLD) # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords. - CUDA_ADD_EXECUTABLE( - "sharedMem" - ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU}) -ELSE() - # Force the .cu files to be recognized as headers and not be compiled so there wont be a second entry point. - SET_SOURCE_FILES_PROPERTIES( - ${_FILES_SOURCE_CU} - PROPERTIES HEADER_FILE_ONLY TRUE) - ADD_EXECUTABLE( - "sharedMem" - ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU}) - TARGET_INCLUDE_DIRECTORIES( - "sharedMem" - PRIVATE ${_INCLUDE_DIRECTORIES_PRIVATE}) - TARGET_COMPILE_OPTIONS( - "sharedMem" - PRIVATE ${ALPAKA_DEV_COMPILE_OPTIONS}) -ENDIF() - +ALPAKA_ADD_EXECUTABLE( + "sharedMem" + ${_FILES_HEADER} ${_FILES_SOURCE_CXX}) # Set the link libraries for this library (adds libs, include directories, defines and compile options). TARGET_LINK_LIBRARIES( "sharedMem" diff --git a/examples/sharedMem/src/main.cu b/examples/sharedMem/src/main.cu deleted file mode 100644 index 428b7f4..0000000 --- a/examples/sharedMem/src/main.cu +++ /dev/null @@ -1,22 +0,0 @@ -/** -* \file -* Copyright 2014-2015 Benjamin Worpitz -* -* This file is part of alpaka. -* -* alpaka is free software: you can redistribute it and/or modify -* it under the terms of the GNU Lesser General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. -* -* alpaka is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Lesser General Public License for more details. -* -* You should have received a copy of the GNU Lesser General Public License -* along with alpaka. -* If not, see . -*/ - -#include "main.cpp" diff --git a/examples/vectorAdd/CMakeLists.txt b/examples/vectorAdd/CMakeLists.txt index 49a3a6e..74a50b2 100644 --- a/examples/vectorAdd/CMakeLists.txt +++ b/examples/vectorAdd/CMakeLists.txt @@ -63,41 +63,15 @@ append_recursive_files_add_to_src_group("${_SUFFIXED_INCLUDE_DIR}" "" "hpp" _FIL # Add all the source files in all recursive subdirectories and group them accordingly. append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX) -# Add all the CUDA source files in all recursive subdirectories and group them accordingly. -append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cu" _FILES_SOURCE_CU) - +INCLUDE_DIRECTORIES( + ${_INCLUDE_DIRECTORIES_PRIVATE} + ${alpaka_INCLUDE_DIRS}) +ADD_DEFINITIONS( + ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS}) # Always add all files to the target executable build call to add them to the build project. -IF(ALPAKA_ACC_GPU_CUDA_ENABLE) - # Force the .cpp files to be recognized as headers and not be compiled so there wont be a second entry point. - SET_SOURCE_FILES_PROPERTIES( - ${_FILES_SOURCE_CXX} - PROPERTIES HEADER_FILE_ONLY TRUE) - # CUDA does not work well with the much better target dependent TARGET_XXX commands but requires the settings to be available globally: https://www.cmake.org/Bug/view.php?id=14201&nbn=1 - INCLUDE_DIRECTORIES( - ${_INCLUDE_DIRECTORIES_PRIVATE} - ${alpaka_INCLUDE_DIRS}) - ADD_DEFINITIONS( - ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS}) - CMAKE_POLICY(SET CMP0023 OLD) # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords. - CUDA_ADD_EXECUTABLE( - "vectorAdd" - ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU}) -ELSE() - # Force the .cu files to be recognized as headers and not be compiled so there wont be a second entry point. - SET_SOURCE_FILES_PROPERTIES( - ${_FILES_SOURCE_CU} - PROPERTIES HEADER_FILE_ONLY TRUE) - ADD_EXECUTABLE( - "vectorAdd" - ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU}) - TARGET_INCLUDE_DIRECTORIES( - "vectorAdd" - PRIVATE ${_INCLUDE_DIRECTORIES_PRIVATE}) - TARGET_COMPILE_OPTIONS( - "vectorAdd" - PRIVATE ${ALPAKA_DEV_COMPILE_OPTIONS}) -ENDIF() - +ALPAKA_ADD_EXECUTABLE( + "vectorAdd" + ${_FILES_HEADER} ${_FILES_SOURCE_CXX}) # Set the link libraries for this library (adds libs, include directories, defines and compile options). TARGET_LINK_LIBRARIES( "vectorAdd" diff --git a/examples/vectorAdd/src/main.cpp b/examples/vectorAdd/src/main.cpp index 673c466..03abb21 100644 --- a/examples/vectorAdd/src/main.cpp +++ b/examples/vectorAdd/src/main.cpp @@ -102,7 +102,7 @@ struct VectorAddKernelTester // Get a stream on this device. alpaka::examples::Stream> stream(devAcc); - alpaka::Vec1 const v1uiExtents( + alpaka::Vec, TSize> const v1uiExtents( numElements); // Let alpaka calculate good block and grid sizes given our full problem extents. diff --git a/examples/vectorAdd/src/main.cu b/examples/vectorAdd/src/main.cu deleted file mode 100644 index 428b7f4..0000000 --- a/examples/vectorAdd/src/main.cu +++ /dev/null @@ -1,22 +0,0 @@ -/** -* \file -* Copyright 2014-2015 Benjamin Worpitz -* -* This file is part of alpaka. -* -* alpaka is free software: you can redistribute it and/or modify -* it under the terms of the GNU Lesser General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. -* -* alpaka is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Lesser General Public License for more details. -* -* You should have received a copy of the GNU Lesser General Public License -* along with alpaka. -* If not, see . -*/ - -#include "main.cpp" diff --git a/include/alpaka/acc/AccCpuFibers.hpp b/include/alpaka/acc/AccCpuFibers.hpp index 2b27e0e..51494be 100644 --- a/include/alpaka/acc/AccCpuFibers.hpp +++ b/include/alpaka/acc/AccCpuFibers.hpp @@ -29,6 +29,7 @@ #include // MathStl #include // BlockSharedAllocMasterSync #include // BlockSyncFiberIdMapBarrier +#include // RandStl // Specialized traits. #include // acc::traits::AccType @@ -79,7 +80,8 @@ namespace alpaka public atomic::AtomicNoOp, public math::MathStl, public block::shared::BlockSharedAllocMasterSync, - public block::sync::BlockSyncFiberIdMapBarrier + public block::sync::BlockSyncFiberIdMapBarrier, + public rand::RandStl { public: // Partial specialization with the correct TDim and TSize is not allowed. @@ -109,6 +111,7 @@ namespace alpaka block::sync::BlockSyncFiberIdMapBarrier( m_threadsPerBlockCount, m_fibersToBarrier), + rand::RandStl(), m_gridBlockIdx(Vec::zeros()), m_threadsPerBlockCount(workdiv::getWorkDiv(workDiv).prod()) {} diff --git a/include/alpaka/acc/AccCpuOmp2Blocks.hpp b/include/alpaka/acc/AccCpuOmp2Blocks.hpp index 2d0ac1e..4fadb77 100644 --- a/include/alpaka/acc/AccCpuOmp2Blocks.hpp +++ b/include/alpaka/acc/AccCpuOmp2Blocks.hpp @@ -25,10 +25,11 @@ #include // workdiv::WorkDivMembers #include // IdxGbRef #include // IdxBtZero -#include // AtomicNoOp +#include // AtomicOmpCritSec #include // MathStl #include // BlockSharedAllocNoSync #include // BlockSyncNoOp +#include // RandStl // Specialized traits. #include // acc::traits::AccType @@ -73,10 +74,11 @@ namespace alpaka public workdiv::WorkDivMembers, public idx::gb::IdxGbRef, public idx::bt::IdxBtZero, - public atomic::AtomicNoOp, + public atomic::AtomicOmpCritSec, public math::MathStl, public block::shared::BlockSharedAllocNoSync, - public block::sync::BlockSyncNoOp + public block::sync::BlockSyncNoOp, + public rand::RandStl { public: // Partial specialization with the correct TDim and TSize is not allowed. @@ -98,10 +100,11 @@ namespace alpaka workdiv::WorkDivMembers(workDiv), idx::gb::IdxGbRef(m_gridBlockIdx), idx::bt::IdxBtZero(), - atomic::AtomicNoOp(), + atomic::AtomicOmpCritSec(), math::MathStl(), block::shared::BlockSharedAllocNoSync(), block::sync::BlockSyncNoOp(), + rand::RandStl(), m_gridBlockIdx(Vec::zeros()) {} diff --git a/include/alpaka/acc/AccCpuOmp2Threads.hpp b/include/alpaka/acc/AccCpuOmp2Threads.hpp index 68ecb68..1baba83 100644 --- a/include/alpaka/acc/AccCpuOmp2Threads.hpp +++ b/include/alpaka/acc/AccCpuOmp2Threads.hpp @@ -29,6 +29,7 @@ #include // MathStl #include // BlockSharedAllocMasterSync #include // BlockSyncOmpBarrier +#include // RandStl // Specialized traits. #include // acc::traits::AccType @@ -75,7 +76,8 @@ namespace alpaka public atomic::AtomicOmpCritSec, public math::MathStl, public block::shared::BlockSharedAllocMasterSync, - public block::sync::BlockSyncOmpBarrier + public block::sync::BlockSyncOmpBarrier, + public rand::RandStl { public: // Partial specialization with the correct TDim and TSize is not allowed. @@ -103,6 +105,7 @@ namespace alpaka [this](){block::sync::syncBlockThreads(*this);}, [](){return (::omp_get_thread_num() == 0);}), block::sync::BlockSyncOmpBarrier(), + rand::RandStl(), m_gridBlockIdx(Vec::zeros()) {} diff --git a/include/alpaka/acc/AccCpuOmp4.hpp b/include/alpaka/acc/AccCpuOmp4.hpp index 8f50de7..d39ec42 100644 --- a/include/alpaka/acc/AccCpuOmp4.hpp +++ b/include/alpaka/acc/AccCpuOmp4.hpp @@ -29,6 +29,7 @@ #include // MathStl #include // BlockSharedAllocMasterSync #include // BlockSyncOmpBarrier +#include // RandStl // Specialized traits. #include // acc::traits::AccType @@ -75,7 +76,8 @@ namespace alpaka public atomic::AtomicOmpCritSec, public math::MathStl, public block::shared::BlockSharedAllocMasterSync, - public block::sync::BlockSyncOmpBarrier + public block::sync::BlockSyncOmpBarrier, + public rand::RandStl { public: // Partial specialization with the correct TDim and TSize is not allowed. @@ -103,6 +105,7 @@ namespace alpaka [this](){block::sync::syncBlockThreads(*this);}, [](){return (::omp_get_thread_num() == 0);}), block::sync::BlockSyncOmpBarrier(), + rand::RandStl(), m_gridBlockIdx(Vec::zeros()) {} diff --git a/include/alpaka/acc/AccCpuSerial.hpp b/include/alpaka/acc/AccCpuSerial.hpp index d0543e0..26eac76 100644 --- a/include/alpaka/acc/AccCpuSerial.hpp +++ b/include/alpaka/acc/AccCpuSerial.hpp @@ -29,6 +29,7 @@ #include // MathStl #include // BlockSharedAllocNoSync #include // BlockSyncNoOp +#include // RandStl // Specialized traits. #include // acc::traits::AccType @@ -73,7 +74,8 @@ namespace alpaka public atomic::AtomicNoOp, public math::MathStl, public block::shared::BlockSharedAllocNoSync, - public block::sync::BlockSyncNoOp + public block::sync::BlockSyncNoOp, + public rand::RandStl { public: // Partial specialization with the correct TDim and TSize is not allowed. @@ -99,6 +101,7 @@ namespace alpaka math::MathStl(), block::shared::BlockSharedAllocNoSync(), block::sync::BlockSyncNoOp(), + rand::RandStl(), m_gridBlockIdx(Vec::zeros()) {} diff --git a/include/alpaka/acc/AccCpuThreads.hpp b/include/alpaka/acc/AccCpuThreads.hpp index 8b8b535..8f64a2b 100644 --- a/include/alpaka/acc/AccCpuThreads.hpp +++ b/include/alpaka/acc/AccCpuThreads.hpp @@ -29,6 +29,7 @@ #include // MathStl #include // BlockSharedAllocMasterSync #include // BlockSyncThreadIdMapBarrier +#include // RandStl // Specialized traits. #include // acc::traits::AccType @@ -76,7 +77,8 @@ namespace alpaka public atomic::AtomicStlLock, public math::MathStl, public block::shared::BlockSharedAllocMasterSync, - public block::sync::BlockSyncThreadIdMapBarrier + public block::sync::BlockSyncThreadIdMapBarrier, + public rand::RandStl { public: // Partial specialization with the correct TDim and TSize is not allowed. @@ -106,6 +108,7 @@ namespace alpaka block::sync::BlockSyncThreadIdMapBarrier( m_threadsPerBlockCount, m_mThreadsToBarrier), + rand::RandStl(), m_gridBlockIdx(Vec::zeros()), m_threadsPerBlockCount(workdiv::getWorkDiv(workDiv).prod()) {} @@ -147,7 +150,7 @@ namespace alpaka // getIdx std::mutex mutable m_mtxMapInsert; //!< The mutex used to secure insertion into the ThreadIdToIdxMap. typename idx::bt::IdxBtRefThreadIdMap::ThreadIdToIdxMap mutable m_threadsToIndices; //!< The mapping of thread id's to indices. - alignas(16u) Vec mutable m_gridBlockIdx; //!< The index of the currently executed block. + alignas(16u) Vec mutable m_gridBlockIdx; //!< The index of the currently executed block. // syncBlockThreads TSize const m_threadsPerBlockCount; //!< The number of threads per block the barrier has to wait for. diff --git a/include/alpaka/acc/AccGpuCudaRt.hpp b/include/alpaka/acc/AccGpuCudaRt.hpp index d340a11..3f5098e 100644 --- a/include/alpaka/acc/AccGpuCudaRt.hpp +++ b/include/alpaka/acc/AccGpuCudaRt.hpp @@ -29,6 +29,7 @@ #include // MathCudaBuiltIn #include // BlockSharedAllocCudaBuiltIn #include // BlockSyncCudaBuiltIn +#include // RandCuRand // Specialized traits. #include // acc::traits::AccType @@ -73,7 +74,8 @@ namespace alpaka public atomic::AtomicCudaBuiltIn, public math::MathCudaBuiltIn, public block::shared::BlockSharedAllocCudaBuiltIn, - public block::sync::BlockSyncCudaBuiltIn + public block::sync::BlockSyncCudaBuiltIn, + public rand::RandCuRand { public: //----------------------------------------------------------------------------- @@ -86,7 +88,8 @@ namespace alpaka atomic::AtomicCudaBuiltIn(), math::MathCudaBuiltIn(), block::shared::BlockSharedAllocCudaBuiltIn(), - block::sync::BlockSyncCudaBuiltIn() + block::sync::BlockSyncCudaBuiltIn(), + rand::RandCuRand() {} public: diff --git a/include/alpaka/atomic/Traits.hpp b/include/alpaka/atomic/Traits.hpp index 24a6408..514ce5f 100644 --- a/include/alpaka/atomic/Traits.hpp +++ b/include/alpaka/atomic/Traits.hpp @@ -69,14 +69,15 @@ namespace alpaka T const & value) -> T { - return traits::AtomicOp< - TOp, - TAtomic, - T> - ::atomicOp( - atomic, - addr, - value); + return + traits::AtomicOp< + TOp, + TAtomic, + T> + ::atomicOp( + atomic, + addr, + value); } namespace traits diff --git a/include/alpaka/block/shared/BlockSharedAllocCudaBuiltIn.hpp b/include/alpaka/block/shared/BlockSharedAllocCudaBuiltIn.hpp index 053b9a9..2ce57f4 100644 --- a/include/alpaka/block/shared/BlockSharedAllocCudaBuiltIn.hpp +++ b/include/alpaka/block/shared/BlockSharedAllocCudaBuiltIn.hpp @@ -95,7 +95,7 @@ namespace alpaka -> T & { __shared__ T shMem; - return std::ref(shMem); + return shMem; } }; //############################################################################# diff --git a/include/alpaka/block/shared/Traits.hpp b/include/alpaka/block/shared/Traits.hpp index 7b1c56b..aaf88bd 100644 --- a/include/alpaka/block/shared/Traits.hpp +++ b/include/alpaka/block/shared/Traits.hpp @@ -83,11 +83,12 @@ namespace alpaka TBlockSharedAlloc const & blockSharedAlloc) -> T & { - return traits::AllocVar< - T, - TBlockSharedAlloc> - ::allocVar( - blockSharedAlloc); + return + traits::AllocVar< + T, + TBlockSharedAlloc> + ::allocVar( + blockSharedAlloc); } //----------------------------------------------------------------------------- @@ -111,12 +112,13 @@ namespace alpaka TnumElements > 0, "The number of elements to allocate in block shared memory must not be zero!"); - return traits::AllocArr< - T, - TnumElements, - TBlockSharedAlloc> - ::allocArr( - blockSharedAlloc); + return + traits::AllocArr< + T, + TnumElements, + TBlockSharedAlloc> + ::allocArr( + blockSharedAlloc); } //----------------------------------------------------------------------------- diff --git a/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp b/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp index c95fa86..e2d7821 100644 --- a/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp +++ b/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp @@ -78,7 +78,7 @@ namespace alpaka // //----------------------------------------------------------------------------- ALPAKA_FN_ACC_CUDA_ONLY static auto syncBlockThreads( - block::sync::BlockSyncCudaBuiltIn const &) + block::sync::BlockSyncCudaBuiltIn const & /*blockSync*/) -> void { __syncthreads(); diff --git a/include/alpaka/block/sync/BlockSyncFiberIdMapBarrier.hpp b/include/alpaka/block/sync/BlockSyncFiberIdMapBarrier.hpp index 0cba118..0da54fd 100644 --- a/include/alpaka/block/sync/BlockSyncFiberIdMapBarrier.hpp +++ b/include/alpaka/block/sync/BlockSyncFiberIdMapBarrier.hpp @@ -92,7 +92,7 @@ namespace alpaka auto & barrierIdx(itFind->second); TSize const modBarrierIdx(barrierIdx % 2); - auto & bar(m_abarSyncThreads[modBarrierIdx]); + auto & bar(m_barriers[modBarrierIdx]); // (Re)initialize a barrier if this is the first thread to reach it. // DCLP: Double checked locking pattern for better performance. @@ -111,7 +111,7 @@ namespace alpaka ThreadIdToBarrierIdxMap & m_threadIdToBarrierIdxMap; //!< We have to keep the current and the last barrier because one of the threads can reach the next barrier before a other thread was wakeup from the last one and has checked if it can run. - Barrier mutable m_abarSyncThreads[2]; //!< The barriers for the synchronization of threads. + Barrier mutable m_barriers[2]; //!< The barriers for the synchronization of threads. }; namespace traits diff --git a/include/alpaka/block/sync/BlockSyncNoOp.hpp b/include/alpaka/block/sync/BlockSyncNoOp.hpp index 58b60ed..91893d2 100644 --- a/include/alpaka/block/sync/BlockSyncNoOp.hpp +++ b/include/alpaka/block/sync/BlockSyncNoOp.hpp @@ -1,6 +1,6 @@ /** * \file -* Copyright 2014-2015 Benjamin Worpitz +* Copyright 2014-2015 Benjamin Worpitz, Rene Widera * * This file is part of alpaka. * @@ -79,6 +79,7 @@ namespace alpaka //----------------------------------------------------------------------------- // //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_ACC static auto syncBlockThreads( block::sync::BlockSyncNoOp const & /*blockSync*/) -> void diff --git a/include/alpaka/block/sync/BlockSyncThreadIdMapBarrier.hpp b/include/alpaka/block/sync/BlockSyncThreadIdMapBarrier.hpp index f33991e..6a63ca7 100644 --- a/include/alpaka/block/sync/BlockSyncThreadIdMapBarrier.hpp +++ b/include/alpaka/block/sync/BlockSyncThreadIdMapBarrier.hpp @@ -92,7 +92,7 @@ namespace alpaka auto & barrierIdx(itFind->second); TSize const modBarrierIdx(barrierIdx % 2); - auto & bar(m_abarSyncThreads[modBarrierIdx]); + auto & bar(m_barriers[modBarrierIdx]); // (Re)initialize a barrier if this is the first thread to reach it. // DCLP: Double checked locking pattern for better performance. @@ -114,7 +114,7 @@ namespace alpaka ThreadIdToBarrierIdxMap & m_threadIdToBarrierIdxMap; //!< We have to keep the current and the last barrier because one of the threads can reach the next barrier before a other thread was wakeup from the last one and has checked if it can run. - Barrier mutable m_abarSyncThreads[2]; //!< The barriers for the synchronization of threads. + Barrier mutable m_barriers[2]; //!< The barriers for the synchronization of threads. std::mutex mutable m_mtxBarrier; }; diff --git a/include/alpaka/core/ConcurrentExecPool.hpp b/include/alpaka/core/ConcurrentExecPool.hpp index 36cc99c..6f4254f 100644 --- a/include/alpaka/core/ConcurrentExecPool.hpp +++ b/include/alpaka/core/ConcurrentExecPool.hpp @@ -22,6 +22,7 @@ #pragma once #include // workarounds +#include // workarounds // nvcc does not currently support boost correctly. // boost/utility/detail/result_of_iterate.hpp:148:75: error: invalid use of qualified-name 'std::allocator_traits<_Alloc>::propagate_on_container_swap' @@ -398,7 +399,7 @@ namespace alpaka auto isQueueEmpty() const -> bool { -#if BOOST_COMP_GNUC +#if (BOOST_VERSION < 105700) return const_cast &>(m_qTasks).empty(); #else return m_qTasks.empty(); @@ -615,7 +616,7 @@ namespace alpaka auto isQueueEmpty() const -> bool { -#if BOOST_COMP_GNUC +#if (BOOST_VERSION < 105700) return const_cast &>(m_qTasks).empty(); #else return m_qTasks.empty(); diff --git a/include/alpaka/core/Cuda.hpp b/include/alpaka/core/Cuda.hpp index fbad1b3..4d28558 100644 --- a/include/alpaka/core/Cuda.hpp +++ b/include/alpaka/core/Cuda.hpp @@ -24,6 +24,10 @@ #include // ALPAKA_FN_HOST #include // Vec #include // integer_sequence +#include // ElemType +#include // GetOffset/SetOffset +#include // GetExtent/SetExtent +#include // SizeType #include // boost::mpl::apply #include // boost::mpl::and_ @@ -331,6 +335,24 @@ namespace alpaka }; } } + namespace elem + { + namespace traits + { + //############################################################################# + //! The CUDA vectors elem type trait specialization. + //############################################################################# + template< + typename TSize> + struct ElemType< + TSize, + typename std::enable_if< + cuda::traits::IsCudaBuiltInType::value>::type> + { + using type = decltype(TSize().x); + }; + } + } namespace extent { namespace traits @@ -701,7 +723,7 @@ namespace alpaka typename std::enable_if< cuda::traits::IsCudaBuiltInType::value>::type> { - using type = decltype(TSize().x); + using type = std::size_t; }; } } diff --git a/include/alpaka/core/Fold.hpp b/include/alpaka/core/Fold.hpp index a931cf7..5a4077d 100644 --- a/include/alpaka/core/Fold.hpp +++ b/include/alpaka/core/Fold.hpp @@ -21,13 +21,15 @@ #pragma once -#include // ALPAKA_FN_HOST_ACC +#include // ALPAKA_FN_HOST_ACC #if !defined(__CUDA_ARCH__) #include // boost::ignore_unused #endif -#include // std::result_of +#if !(__cplusplus >= 201402L) + #include // std::result_of +#endif namespace alpaka { @@ -122,7 +124,7 @@ namespace alpaka T0 const & t0, T1 const & t1, Ts const & ... ts) - // NOTE: This is not allowed because the point of function declaration is after the trailing return type. + // NOTE: The following line is not allowed because the point of function declaration is after the trailing return type. // Thus the function itself is not available inside its return type declaration. // http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_closed.html#1433 // http://stackoverflow.com/questions/3744400/trailing-return-type-using-decltype-with-a-variadic-template-function diff --git a/include/alpaka/core/NdLoop.hpp b/include/alpaka/core/NdLoop.hpp index 2b45467..78ed6d8 100644 --- a/include/alpaka/core/NdLoop.hpp +++ b/include/alpaka/core/NdLoop.hpp @@ -161,7 +161,7 @@ namespace alpaka typename TFnObj, std::size_t... Tdims> ALPAKA_FN_HOST_ACC auto ndLoop( - core::detail::index_sequence const & indexSequence, + core::detail::index_sequence const & /*indexSequence*/, TExtentsVec const & extents, TFnObj const & f) -> void diff --git a/include/alpaka/core/OpenMp.hpp b/include/alpaka/core/OpenMp.hpp index 801da11..be39b81 100644 --- a/include/alpaka/core/OpenMp.hpp +++ b/include/alpaka/core/OpenMp.hpp @@ -28,17 +28,26 @@ namespace alpaka namespace omp { //----------------------------------------------------------------------------- - //! \return The device this object is bound to. + //! \return The maximum number of threads the OpenMP 2.0 runtime is capable of. //----------------------------------------------------------------------------- ALPAKA_FN_HOST auto getMaxOmpThreads() -> int { - // HACK: ::omp_get_max_threads() does not return the real limit of the underlying OpenMP 2.0 runtime: + // NOTE: ::omp_get_max_threads() does not return the real limit of the underlying OpenMP 2.0 runtime at any time: // 'The omp_get_max_threads routine returns the value of the internal control variable, which is used to determine the number of threads that would form the new team, // if an active parallel region without a num_threads clause were to be encountered at that point in the program.' // How to do this correctly? Is there even a way to get the hard limit apart from omp_set_num_threads(high_value) -> omp_get_max_threads()? + + // Get the current thread number. This is OMP_NUM_THREADS if it has not been changed up to here. + auto const maxThreadsOld(::omp_get_max_threads()); + ::omp_set_num_threads(1024); - return ::omp_get_max_threads(); + auto const maxThreadsReal(::omp_get_max_threads()); + + // Reset the max threads. + ::omp_set_num_threads(maxThreadsOld); + + return maxThreadsReal; } } } diff --git a/include/alpaka/core/Vectorize.hpp b/include/alpaka/core/Vectorize.hpp index 9b12c61..726a7d9 100644 --- a/include/alpaka/core/Vectorize.hpp +++ b/include/alpaka/core/Vectorize.hpp @@ -59,7 +59,7 @@ namespace alpaka //----------------------------------------------------------------------------- template< typename TElem> - struct GetCurrentThreadVectorRegisterSizeElems + struct GetVectorizationSizeElems { static constexpr std::size_t value = 1u; }; @@ -68,7 +68,7 @@ namespace alpaka // Number of elements of the given type that can be processed in parallel in a vector register. //----------------------------------------------------------------------------- template<> - struct GetCurrentThreadVectorRegisterSizeElems< + struct GetVectorizationSizeElems< double> { static constexpr std::size_t value = @@ -100,7 +100,7 @@ namespace alpaka // Number of elements of the given type that can be processed in parallel in a vector register. //----------------------------------------------------------------------------- template<> - struct GetCurrentThreadVectorRegisterSizeElems< + struct GetVectorizationSizeElems< float> { static constexpr std::size_t value = @@ -131,7 +131,7 @@ namespace alpaka // Number of elements of the given type that can be processed in parallel in a vector register. //----------------------------------------------------------------------------- template<> - struct GetCurrentThreadVectorRegisterSizeElems< + struct GetVectorizationSizeElems< std::int8_t> { static constexpr std::size_t value = @@ -167,7 +167,7 @@ namespace alpaka // Number of elements of the given type that can be processed in parallel in a vector register. //----------------------------------------------------------------------------- template<> - struct GetCurrentThreadVectorRegisterSizeElems< + struct GetVectorizationSizeElems< std::uint8_t> { static constexpr std::size_t value = @@ -203,7 +203,7 @@ namespace alpaka // Number of elements of the given type that can be processed in parallel in a vector register. //----------------------------------------------------------------------------- template<> - struct GetCurrentThreadVectorRegisterSizeElems< + struct GetVectorizationSizeElems< std::int16_t> { static constexpr std::size_t value = @@ -239,7 +239,7 @@ namespace alpaka // Number of elements of the given type that can be processed in parallel in a vector register. //----------------------------------------------------------------------------- template<> - struct GetCurrentThreadVectorRegisterSizeElems< + struct GetVectorizationSizeElems< std::uint16_t> { static constexpr std::size_t value = @@ -275,7 +275,7 @@ namespace alpaka // Number of elements of the given type that can be processed in parallel in a vector register. //----------------------------------------------------------------------------- template<> - struct GetCurrentThreadVectorRegisterSizeElems< + struct GetVectorizationSizeElems< std::int32_t> { static constexpr std::size_t value = @@ -306,7 +306,7 @@ namespace alpaka // Number of elements of the given type that can be processed in parallel in a vector register. //----------------------------------------------------------------------------- template<> - struct GetCurrentThreadVectorRegisterSizeElems< + struct GetVectorizationSizeElems< std::uint32_t> { static constexpr std::size_t value = @@ -337,7 +337,7 @@ namespace alpaka // Number of elements of the given type that can be processed in parallel in a vector register. //----------------------------------------------------------------------------- template<> - struct GetCurrentThreadVectorRegisterSizeElems< + struct GetVectorizationSizeElems< std::int64_t> { static constexpr std::size_t value = @@ -366,7 +366,7 @@ namespace alpaka // Number of elements of the given type that can be processed in parallel in a vector register. //----------------------------------------------------------------------------- template<> - struct GetCurrentThreadVectorRegisterSizeElems< + struct GetVectorizationSizeElems< std::uint64_t> { static constexpr std::size_t value = @@ -395,10 +395,10 @@ namespace alpaka namespace detail { //############################################################################# - //! + //! The general vectorization implementation. //############################################################################# template< - std::size_t TvecRegElemCount> + std::size_t TvectorizationCount> struct Vectorize { //----------------------------------------------------------------------------- @@ -414,29 +414,33 @@ namespace alpaka { #ifdef __CUDACC__ // nvcc seems not to support constexpr correctly static_assert( - (TvecRegElemCount > 0), - "The number of elements per vector register has to be greater zero!"); + (static_cast(TvectorizationCount) > 0), + "The number of loop invocations to vectorize (TvectorizationCount) has to be greater zero!"); TSize i(0); - while((count-i) >= TvecRegElemCount) + while((count-i) >= static_cast(TvectorizationCount)) { - for(TSize j(0); j(TvectorizationCount); ++j) { fnObj(i+j); } - i += TvecRegElemCount; + i += static_cast(TvectorizationCount); } #else + // Cast it to the user defined type. constexpr TSize vecRegElems( - static_cast(TvecRegElemCount)); + static_cast(TvectorizationCount)); static_assert( (vecRegElems > 0), - "The number of elements per vector register has to be greater zero!"); + "The number of loop invocations to vectorize (TvectorizationCount) has to be greater zero!"); + // While we have more then TvectorizationCount elements left ... TSize i(0); while((count-i) >= vecRegElems) { + // ... execute exactly TvectorizationCount invocations of the loop. + // This enables the compiler to optimize much better because of the compile-time loop bounds. for(TSize j(0); j struct Vectorize< @@ -484,41 +489,29 @@ namespace alpaka } //----------------------------------------------------------------------------- + //! Calls the given function object fnObj count times trying to utilize vectorization with TVectorizationCount elements. //! + //! The supported number of elements per vector of a given type on the current architecture can be queried with registerGetVectorizationSizeElems::value. + //! For optimal vectorization, the fnObj should be only consist of one line. //----------------------------------------------------------------------------- template< + std::size_t TvectorizationCount, typename TSize, - typename TElem, typename TFnObj> ALPAKA_FN_HOST auto vectorize( TSize const & count, TFnObj const & fnObj) -> void { -#ifdef __CUDACC__ // nvcc seems not to support constexpr correctly - static_assert( - (GetCurrentThreadVectorRegisterSizeElems::value > 0), - "The number of elements per vector register has to be greater zero!"); - - detail::Vectorize< - GetCurrentThreadVectorRegisterSizeElems::value> - ::vectorize( - count, - fnObj); -#else - constexpr auto vecRegElems( - GetCurrentThreadVectorRegisterSizeElems::value); - static_assert( - (vecRegElems > 0), - "The number of elements per vector register has to be greater zero!"); + (TvectorizationCount > 0), + "The number of loop invocations to vectorize (TvectorizationCount) has to be greater zero!"); detail::Vectorize< - vecRegElems> + TvectorizationCount> ::vectorize( count, fnObj); -#endif } } } diff --git a/include/alpaka/dev/DevCpu.hpp b/include/alpaka/dev/DevCpu.hpp index 89c7d10..89690ad 100644 --- a/include/alpaka/dev/DevCpu.hpp +++ b/include/alpaka/dev/DevCpu.hpp @@ -28,7 +28,7 @@ #include // mem::view::traits::ViewType #include // stream::enqueue -#include // getCpuName, getGlobalMemSizeBytes +#include // getCpuName, getTotalGlobalMemSizeBytes, getFreeGlobalMemSizeBytes #include // boost::ignore_unused @@ -361,7 +361,7 @@ namespace alpaka { boost::ignore_unused(dev); - return dev::cpu::detail::getGlobalMemSizeBytes(); + return dev::cpu::detail::getTotalGlobalMemSizeBytes(); } }; @@ -381,8 +381,7 @@ namespace alpaka { boost::ignore_unused(dev); - // \FIXME: Get correct free memory size! - return dev::cpu::detail::getGlobalMemSizeBytes(); + return dev::cpu::detail::getFreeGlobalMemSizeBytes(); } }; diff --git a/include/alpaka/dev/Traits.hpp b/include/alpaka/dev/Traits.hpp index a010b63..e11f0c7 100644 --- a/include/alpaka/dev/Traits.hpp +++ b/include/alpaka/dev/Traits.hpp @@ -115,10 +115,11 @@ namespace alpaka T const & t) -> decltype(traits::GetDev::getDev(t)) { - return traits::GetDev< - T> - ::getDev( - t); + return + traits::GetDev< + T> + ::getDev( + t); } //----------------------------------------------------------------------------- @@ -149,10 +150,11 @@ namespace alpaka TDev const & dev) -> std::string { - return traits::GetName< - TDev> - ::getName( - dev); + return + traits::GetName< + TDev> + ::getName( + dev); } //----------------------------------------------------------------------------- @@ -164,10 +166,11 @@ namespace alpaka TDev const & dev) -> std::size_t { - return traits::GetMemBytes< - TDev> - ::getMemBytes( - dev); + return + traits::GetMemBytes< + TDev> + ::getMemBytes( + dev); } //----------------------------------------------------------------------------- @@ -179,10 +182,11 @@ namespace alpaka TDev const & dev) -> std::size_t { - return traits::GetFreeMemBytes< - TDev> - ::getFreeMemBytes( - dev); + return + traits::GetFreeMemBytes< + TDev> + ::getFreeMemBytes( + dev); } //----------------------------------------------------------------------------- diff --git a/include/alpaka/dev/cpu/SysInfo.hpp b/include/alpaka/dev/cpu/SysInfo.hpp index b05da74..182641f 100644 --- a/include/alpaka/dev/cpu/SysInfo.hpp +++ b/include/alpaka/dev/cpu/SysInfo.hpp @@ -21,6 +21,8 @@ #pragma once +#include // BOOST_XXX + #if BOOST_OS_WINDOWS || BOOST_OS_CYGWIN #ifndef NOMINMAX #define NOMINMAX @@ -40,7 +42,9 @@ #endif #endif -#include // BOOST_XXX +#if BOOST_OS_LINUX + #include +#endif #include // std::memcpy #include // std::string @@ -54,7 +58,7 @@ namespace alpaka #if BOOST_ARCH_X86 namespace detail { - #if BOOST_COMP_GNUC || BOOST_COMP_CLANG + #if BOOST_COMP_GNUC || BOOST_COMP_CLANG || __INTEL_COMPILER #include //----------------------------------------------------------------------------- //! @@ -65,7 +69,7 @@ namespace alpaka __cpuid_count(level, subfunction, ex[0], ex[1], ex[2], ex[3]); } - #elif BOOST_COMP_MSVC || BOOST_COMP_INTEL + #elif BOOST_COMP_MSVC #include //----------------------------------------------------------------------------- //! @@ -90,7 +94,7 @@ namespace alpaka std::uint32_t const nExIds(ex[0]); // Get the information associated with each extended ID. - char pCpuBrandString[0x40] = {0}; + char cpuBrandString[0x40] = {0}; for(std::uint32_t i(0x80000000); i<=nExIds; ++i) { cpuid(i, 0, ex); @@ -98,18 +102,18 @@ namespace alpaka // Interpret CPU brand string and cache information. if(i == 0x80000002) { - std::memcpy(pCpuBrandString, ex, sizeof(ex)); + std::memcpy(cpuBrandString, ex, sizeof(ex)); } else if(i == 0x80000003) { - std::memcpy(pCpuBrandString + 16, ex, sizeof(ex)); + std::memcpy(cpuBrandString + 16, ex, sizeof(ex)); } else if(i == 0x80000004) { - std::memcpy(pCpuBrandString + 32, ex, sizeof(ex)); + std::memcpy(cpuBrandString + 32, ex, sizeof(ex)); } } - return std::string(pCpuBrandString); + return std::string(cpuBrandString); #else return ""; #endif @@ -124,10 +128,10 @@ namespace alpaka return 0; }*/ //----------------------------------------------------------------------------- - //! \return The number of bytes of global memory. + //! \return The total number of bytes of global memory. //! Adapted from David Robert Nadeau: http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system //----------------------------------------------------------------------------- - inline auto getGlobalMemSizeBytes() + inline auto getTotalGlobalMemSizeBytes() -> std::size_t { #if BOOST_OS_WINDOWS @@ -157,7 +161,7 @@ namespace alpaka std::size_t const sizeLen{sizeof(size)}; if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0) { - throw std::logic_error("getGlobalMemSizeBytes failed calling sysctl!"); + throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!"); } return static_cast(size); @@ -182,13 +186,58 @@ namespace alpaka std::size_t const sizeLen{sizeof(size)}; if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0) { - throw std::logic_error("getGlobalMemSizeBytes failed calling sysctl!"); + throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!"); } return static_cast(size); #endif #else - throw std::logic_error("getGlobalMemSizeBytes not implemented for this system!"); + #error "getTotalGlobalMemSizeBytes not implemented for this system!" +#endif + } + //----------------------------------------------------------------------------- + //! \return The free number of bytes of global memory. + //! \throws std::logic_error if not implemented on the system and std::runtime_error on other errors. + //----------------------------------------------------------------------------- + inline auto getFreeGlobalMemSizeBytes() + -> std::size_t + { +#if BOOST_OS_WINDOWS + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + GlobalMemoryStatusEx(&status); + return static_cast(status.ullAvailPhys); + +#elif BOOST_OS_LINUX + std::string token; + std::ifstream file("/proc/meminfo"); + if(file) + { + while(file >> token) + { + if(token == "MemFree:") + { + std::size_t freeGlobalMemSizeBytes(0); + if(file >> freeGlobalMemSizeBytes) + { + return freeGlobalMemSizeBytes; + } + else + { + throw std::runtime_error("Unable to read MemFree value!"); + } + } + } + throw std::runtime_error("Unable to find MemFree in '/proc/meminfo'!"); + } + else + { + throw std::runtime_error("Unable to open '/proc/meminfo'!"); + } +#elif BOOST_OS_MACOS + #error "getFreeGlobalMemSizeBytes not implemented for __APPLE__!" +#else + #error "getFreeGlobalMemSizeBytes not implemented for this system!" #endif } } diff --git a/include/alpaka/dim/DimArithmetic.hpp b/include/alpaka/dim/DimArithmetic.hpp index c36f9a9..e48b091 100644 --- a/include/alpaka/dim/DimArithmetic.hpp +++ b/include/alpaka/dim/DimArithmetic.hpp @@ -43,7 +43,7 @@ namespace alpaka T, typename std::enable_if::value>::type> { - using type = std::integral_constant; + using type = dim::DimInt<1u>; }; } } diff --git a/include/alpaka/elem/Traits.hpp b/include/alpaka/elem/Traits.hpp new file mode 100644 index 0000000..884a3de --- /dev/null +++ b/include/alpaka/elem/Traits.hpp @@ -0,0 +1,72 @@ +/** +* \file +* Copyright 2014-2015 Benjamin Worpitz +* +* This file is part of alpaka. +* +* alpaka is free software: you can redistribute it and/or modify +* it under the terms of the GNU Lesser General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* alpaka is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Lesser General Public License for more details. +* +* You should have received a copy of the GNU Lesser General Public License +* along with alpaka. +* If not, see . +*/ + +#pragma once + +#include // std::enable_if, std::is_fundamental + +namespace alpaka +{ + //----------------------------------------------------------------------------- + //! The element specifics. + //----------------------------------------------------------------------------- + namespace elem + { + //----------------------------------------------------------------------------- + //! The element traits. + //----------------------------------------------------------------------------- + namespace traits + { + //############################################################################# + //! The element type trait. + //############################################################################# + template< + typename TView, + typename TSfinae = void> + struct ElemType; + } + + //############################################################################# + //! The element type trait alias template to remove the ::type. + //############################################################################# + template< + typename TView> + using Elem = typename std::remove_volatile::type>::type; + + //----------------------------------------------------------------------------- + // Trait specializations for unsigned integral types. + //----------------------------------------------------------------------------- + namespace traits + { + //############################################################################# + //! The fundamental type elem type trait specialization. + //############################################################################# + template< + typename T> + struct ElemType< + T, + typename std::enable_if::value>::type> + { + using type = T; + }; + } + } +} diff --git a/include/alpaka/event/EventCpu.hpp b/include/alpaka/event/EventCpu.hpp index 0ec411b..3ce682f 100644 --- a/include/alpaka/event/EventCpu.hpp +++ b/include/alpaka/event/EventCpu.hpp @@ -261,7 +261,7 @@ namespace alpaka #endif return; } - // ... and was enqueued before, increment the cancel counter. + // ... and noby is waiting for it, increment the cancel counter. else { ++spEventCpuImpl->m_canceledEnqueueCount; @@ -353,11 +353,11 @@ namespace alpaka if(spEventCpuImpl->m_bIsWaitedFor) { #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL - std::cout << BOOST_CURRENT_FUNCTION << "WARNING: The event to enqueue is already enqueued AND waited on. It can NOT be re-enqueued!" << std::endl; + std::cout << BOOST_CURRENT_FUNCTION << "WARNING: The event to enqueue is already enqueued AND waited on. It can NOT be re-enqueued!" << std::endl; #endif return; } - // ... and was enqueued before, increment the cancel counter. + // ... and noby is waiting for it, increment the cancel counter. else { ++spEventCpuImpl->m_canceledEnqueueCount; diff --git a/include/alpaka/event/Traits.hpp b/include/alpaka/event/Traits.hpp index a67d177..0270da8 100644 --- a/include/alpaka/event/Traits.hpp +++ b/include/alpaka/event/Traits.hpp @@ -80,10 +80,11 @@ namespace alpaka TEvent const & event) -> bool { - return traits::EventTest< - TEvent> - ::eventTest( - event); + return + traits::EventTest< + TEvent> + ::eventTest( + event); } } } diff --git a/include/alpaka/exec/ExecCpuOmp2Blocks.hpp b/include/alpaka/exec/ExecCpuOmp2Blocks.hpp index 036ab38..0f358ff 100644 --- a/include/alpaka/exec/ExecCpuOmp2Blocks.hpp +++ b/include/alpaka/exec/ExecCpuOmp2Blocks.hpp @@ -192,9 +192,9 @@ namespace alpaka acc.m_gridBlockIdx = core::mapIdx( #if _OPENMP < 200805 - Vec1(static_cast(i)), + Vec, TSize>(static_cast(i)), #else - Vec1(i), + Vec, TSize>(i), #endif gridBlockExtents); diff --git a/include/alpaka/exec/ExecCpuOmp4.hpp b/include/alpaka/exec/ExecCpuOmp4.hpp index 4d749d3..0c77bca 100644 --- a/include/alpaka/exec/ExecCpuOmp4.hpp +++ b/include/alpaka/exec/ExecCpuOmp4.hpp @@ -182,7 +182,7 @@ namespace alpaka #pragma omp distribute for(TSize b = 0u; b const gridBlockIdx(b); + Vec, TSize> const gridBlockIdx(b); // When this is not repeated here: // error: ‘gridBlockExtents’ referenced in target region does not have a mappable type auto const gridBlockExtents2( @@ -210,7 +210,7 @@ namespace alpaka int const numThreads(::omp_get_num_threads()); // NOTE: No std::cout in omp target! printf("%s omp_get_num_threads: %d\n", BOOST_CURRENT_FUNCTION, numThreads); - if(numThreads != numThreadsInBlock) + if(numThreads != static_cast(numThreadsInBlock)) { throw std::runtime_error("The CPU OpenMP4 runtime did not use the number of threads that had been required!"); } diff --git a/include/alpaka/exec/ExecGpuCudaRt.hpp b/include/alpaka/exec/ExecGpuCudaRt.hpp index 7d389ac..705905c 100644 --- a/include/alpaka/exec/ExecGpuCudaRt.hpp +++ b/include/alpaka/exec/ExecGpuCudaRt.hpp @@ -326,8 +326,10 @@ namespace alpaka } #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL - std::cout << BOOST_CURRENT_FUNCTION << "gridDim: " << gridDim.z << " " << gridDim.y << " " << gridDim.x << std::endl; - std::cout << BOOST_CURRENT_FUNCTION << "blockDim: " << blockDim.z << " " << blockDim.y << " " << blockDim.x << std::endl; + std::cout << BOOST_CURRENT_FUNCTION + << " gridDim: " << gridDim.z << " " << gridDim.y << " " << gridDim.x + << " blockDim: " << blockDim.z << " " << blockDim.y << " " << blockDim.x + << std::endl; #endif #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL diff --git a/include/alpaka/extent/Traits.hpp b/include/alpaka/extent/Traits.hpp index cb8366b..2478e3e 100644 --- a/include/alpaka/extent/Traits.hpp +++ b/include/alpaka/extent/Traits.hpp @@ -89,7 +89,7 @@ namespace alpaka { return traits::GetExtent< - std::integral_constant, + dim::DimInt, TExtents> ::getExtent( extents); @@ -186,7 +186,7 @@ namespace alpaka -> void { traits::SetExtent< - std::integral_constant, + dim::DimInt, TExtents, TExtent> ::setExtent( @@ -247,7 +247,7 @@ namespace alpaka template< typename TExtents> struct GetExtent< - std::integral_constant, + dim::DimInt<0u>, TExtents, typename std::enable_if< std::is_integral::value>::type> @@ -267,7 +267,7 @@ namespace alpaka typename TExtents, typename TExtent> struct SetExtent< - std::integral_constant, + dim::DimInt<0u>, TExtents, TExtent, typename std::enable_if< diff --git a/include/alpaka/idx/Traits.hpp b/include/alpaka/idx/Traits.hpp index 244b8cb..5393c7a 100644 --- a/include/alpaka/idx/Traits.hpp +++ b/include/alpaka/idx/Traits.hpp @@ -69,13 +69,14 @@ namespace alpaka TWorkDiv const & workDiv) -> Vec, size::Size> { - return traits::GetIdx< - TIdx, - TOrigin, - TUnit> - ::getIdx( - idx, - workDiv); + return + traits::GetIdx< + TIdx, + TOrigin, + TUnit> + ::getIdx( + idx, + workDiv); } //----------------------------------------------------------------------------- //! Get the indices requested. @@ -89,13 +90,14 @@ namespace alpaka TIdxWorkDiv const & idxWorkDiv) -> Vec, size::Size> { - return traits::GetIdx< - TIdxWorkDiv, - TOrigin, - TUnit> - ::getIdx( - idxWorkDiv, - idxWorkDiv); + return + traits::GetIdx< + TIdxWorkDiv, + TOrigin, + TUnit> + ::getIdx( + idxWorkDiv, + idxWorkDiv); } namespace traits diff --git a/include/alpaka/idx/bt/IdxBtOmp.hpp b/include/alpaka/idx/bt/IdxBtOmp.hpp index 0c08871..6e752c8 100644 --- a/include/alpaka/idx/bt/IdxBtOmp.hpp +++ b/include/alpaka/idx/bt/IdxBtOmp.hpp @@ -120,7 +120,7 @@ namespace alpaka assert(::omp_get_thread_num()>=0); // \TODO: Would it be faster to precompute the index and cache it inside an array? return core::mapIdx( - Vec1(static_cast(::omp_get_thread_num())), + Vec, TSize>(static_cast(::omp_get_thread_num())), workdiv::getWorkDiv(workDiv)); } }; diff --git a/include/alpaka/math/abs/Traits.hpp b/include/alpaka/math/abs/Traits.hpp index 44ffbcf..838b849 100644 --- a/include/alpaka/math/abs/Traits.hpp +++ b/include/alpaka/math/abs/Traits.hpp @@ -67,12 +67,13 @@ namespace alpaka abs, arg)) { - return traits::Abs< - T, - TArg> - ::abs( - abs, - arg); + return + traits::Abs< + T, + TArg> + ::abs( + abs, + arg); } namespace traits diff --git a/include/alpaka/math/acos/Traits.hpp b/include/alpaka/math/acos/Traits.hpp index b795145..1fb240e 100644 --- a/include/alpaka/math/acos/Traits.hpp +++ b/include/alpaka/math/acos/Traits.hpp @@ -63,12 +63,13 @@ namespace alpaka acos, arg)) { - return traits::Acos< - T, - TArg> - ::acos( - acos, - arg); + return + traits::Acos< + T, + TArg> + ::acos( + acos, + arg); } namespace traits diff --git a/include/alpaka/math/asin/Traits.hpp b/include/alpaka/math/asin/Traits.hpp index f02f31b..f831eee 100644 --- a/include/alpaka/math/asin/Traits.hpp +++ b/include/alpaka/math/asin/Traits.hpp @@ -63,12 +63,13 @@ namespace alpaka asin, arg)) { - return traits::Asin< - T, - TArg> - ::asin( - asin, - arg); + return + traits::Asin< + T, + TArg> + ::asin( + asin, + arg); } namespace traits diff --git a/include/alpaka/math/atan/Traits.hpp b/include/alpaka/math/atan/Traits.hpp index 640a532..27ab455 100644 --- a/include/alpaka/math/atan/Traits.hpp +++ b/include/alpaka/math/atan/Traits.hpp @@ -63,12 +63,13 @@ namespace alpaka atan, arg)) { - return traits::Atan< - T, - TArg> - ::atan( - atan, - arg); + return + traits::Atan< + T, + TArg> + ::atan( + atan, + arg); } namespace traits diff --git a/include/alpaka/math/atan2/Traits.hpp b/include/alpaka/math/atan2/Traits.hpp index e1c341b..5b8958a 100644 --- a/include/alpaka/math/atan2/Traits.hpp +++ b/include/alpaka/math/atan2/Traits.hpp @@ -71,14 +71,15 @@ namespace alpaka y, x)) { - return traits::Atan2< - T, - Ty, - Tx> - ::atan2( - atan2, - y, - x); + return + traits::Atan2< + T, + Ty, + Tx> + ::atan2( + atan2, + y, + x); } namespace traits diff --git a/include/alpaka/math/cbrt/Traits.hpp b/include/alpaka/math/cbrt/Traits.hpp index 5f4b454..d24959b 100644 --- a/include/alpaka/math/cbrt/Traits.hpp +++ b/include/alpaka/math/cbrt/Traits.hpp @@ -64,12 +64,13 @@ namespace alpaka cbrt, arg)) { - return traits::Cbrt< - T, - TArg> - ::cbrt( - cbrt, - arg); + return + traits::Cbrt< + T, + TArg> + ::cbrt( + cbrt, + arg); } namespace traits diff --git a/include/alpaka/math/ceil/Traits.hpp b/include/alpaka/math/ceil/Traits.hpp index 8fcef22..1aab94e 100644 --- a/include/alpaka/math/ceil/Traits.hpp +++ b/include/alpaka/math/ceil/Traits.hpp @@ -64,12 +64,13 @@ namespace alpaka ceil, arg)) { - return traits::Ceil< - T, - TArg> - ::ceil( - ceil, - arg); + return + traits::Ceil< + T, + TArg> + ::ceil( + ceil, + arg); } namespace traits diff --git a/include/alpaka/math/cos/Traits.hpp b/include/alpaka/math/cos/Traits.hpp index 26ae1a9..f74e41a 100644 --- a/include/alpaka/math/cos/Traits.hpp +++ b/include/alpaka/math/cos/Traits.hpp @@ -64,12 +64,13 @@ namespace alpaka cos, arg)) { - return traits::Cos< - T, - TArg> - ::cos( - cos, - arg); + return + traits::Cos< + T, + TArg> + ::cos( + cos, + arg); } namespace traits diff --git a/include/alpaka/math/erf/Traits.hpp b/include/alpaka/math/erf/Traits.hpp index 372cc52..2b1d453 100644 --- a/include/alpaka/math/erf/Traits.hpp +++ b/include/alpaka/math/erf/Traits.hpp @@ -64,12 +64,13 @@ namespace alpaka erf, arg)) { - return traits::Erf< - T, - TArg> - ::erf( - erf, - arg); + return + traits::Erf< + T, + TArg> + ::erf( + erf, + arg); } namespace traits diff --git a/include/alpaka/math/exp/Traits.hpp b/include/alpaka/math/exp/Traits.hpp index 45a21b4..99544d7 100644 --- a/include/alpaka/math/exp/Traits.hpp +++ b/include/alpaka/math/exp/Traits.hpp @@ -63,12 +63,13 @@ namespace alpaka exp, arg)) { - return traits::Exp< - T, - TArg> - ::exp( - exp, - arg); + return + traits::Exp< + T, + TArg> + ::exp( + exp, + arg); } namespace traits diff --git a/include/alpaka/math/floor/Traits.hpp b/include/alpaka/math/floor/Traits.hpp index 926e1e4..60fd5a2 100644 --- a/include/alpaka/math/floor/Traits.hpp +++ b/include/alpaka/math/floor/Traits.hpp @@ -64,12 +64,13 @@ namespace alpaka floor, arg)) { - return traits::Floor< - T, - TArg> - ::floor( - floor, - arg); + return + traits::Floor< + T, + TArg> + ::floor( + floor, + arg); } namespace traits diff --git a/include/alpaka/math/fmod/Traits.hpp b/include/alpaka/math/fmod/Traits.hpp index 9726a7e..e172ca2 100644 --- a/include/alpaka/math/fmod/Traits.hpp +++ b/include/alpaka/math/fmod/Traits.hpp @@ -71,14 +71,15 @@ namespace alpaka x, y)) { - return traits::Fmod< - T, - Tx, - Ty> - ::fmod( - fmod, - x, - y); + return + traits::Fmod< + T, + Tx, + Ty> + ::fmod( + fmod, + x, + y); } namespace traits diff --git a/include/alpaka/math/log/Traits.hpp b/include/alpaka/math/log/Traits.hpp index 7ce49c0..e8d9b6d 100644 --- a/include/alpaka/math/log/Traits.hpp +++ b/include/alpaka/math/log/Traits.hpp @@ -64,12 +64,13 @@ namespace alpaka log, arg)) { - return traits::Log< - T, - TArg> - ::log( - log, - arg); + return + traits::Log< + T, + TArg> + ::log( + log, + arg); } namespace traits diff --git a/include/alpaka/math/max/Traits.hpp b/include/alpaka/math/max/Traits.hpp index e882898..7d9f25a 100644 --- a/include/alpaka/math/max/Traits.hpp +++ b/include/alpaka/math/max/Traits.hpp @@ -72,14 +72,15 @@ namespace alpaka x, y)) { - return traits::Max< - T, - Tx, - Ty> - ::max( - max, - x, - y); + return + traits::Max< + T, + Tx, + Ty> + ::max( + max, + x, + y); } namespace traits diff --git a/include/alpaka/math/min/Traits.hpp b/include/alpaka/math/min/Traits.hpp index ea90616..f3cca44 100644 --- a/include/alpaka/math/min/Traits.hpp +++ b/include/alpaka/math/min/Traits.hpp @@ -72,14 +72,15 @@ namespace alpaka x, y)) { - return traits::Min< - T, - Tx, - Ty> - ::min( - min, - x, - y); + return + traits::Min< + T, + Tx, + Ty> + ::min( + min, + x, + y); } namespace traits diff --git a/include/alpaka/math/pow/Traits.hpp b/include/alpaka/math/pow/Traits.hpp index 61e752e..e77bb93 100644 --- a/include/alpaka/math/pow/Traits.hpp +++ b/include/alpaka/math/pow/Traits.hpp @@ -71,14 +71,15 @@ namespace alpaka base, exp)) { - return traits::Pow< - T, - TBase, - TExp> - ::pow( - pow, - base, - exp); + return + traits::Pow< + T, + TBase, + TExp> + ::pow( + pow, + base, + exp); } namespace traits diff --git a/include/alpaka/math/remainder/Traits.hpp b/include/alpaka/math/remainder/Traits.hpp index 8e7e8ba..54db37a 100644 --- a/include/alpaka/math/remainder/Traits.hpp +++ b/include/alpaka/math/remainder/Traits.hpp @@ -71,14 +71,15 @@ namespace alpaka x, y)) { - return traits::Remainder< - T, - Tx, - Ty> - ::remainder( - remainder, - x, - y); + return + traits::Remainder< + T, + Tx, + Ty> + ::remainder( + remainder, + x, + y); } namespace traits diff --git a/include/alpaka/math/round/Traits.hpp b/include/alpaka/math/round/Traits.hpp index f087490..248156d 100644 --- a/include/alpaka/math/round/Traits.hpp +++ b/include/alpaka/math/round/Traits.hpp @@ -82,12 +82,13 @@ namespace alpaka round, arg)) { - return traits::Round< - T, - TArg> - ::round( - round, - arg); + return + traits::Round< + T, + TArg> + ::round( + round, + arg); } //----------------------------------------------------------------------------- //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero, regardless of the current rounding mode. @@ -106,12 +107,13 @@ namespace alpaka TArg const & arg) -> long int { - return traits::Lround< - T, - TArg> - ::lround( - lround, - arg); + return + traits::Lround< + T, + TArg> + ::lround( + lround, + arg); } //----------------------------------------------------------------------------- //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero, regardless of the current rounding mode. @@ -130,12 +132,13 @@ namespace alpaka TArg const & arg) -> long long int { - return traits::Llround< - T, - TArg> - ::llround( - llround, - arg); + return + traits::Llround< + T, + TArg> + ::llround( + llround, + arg); } namespace traits diff --git a/include/alpaka/math/rsqrt/Traits.hpp b/include/alpaka/math/rsqrt/Traits.hpp index 5f550aa..6f62c6b 100644 --- a/include/alpaka/math/rsqrt/Traits.hpp +++ b/include/alpaka/math/rsqrt/Traits.hpp @@ -64,12 +64,13 @@ namespace alpaka rsqrt, arg)) { - return traits::Rsqrt< - T, - TArg> - ::rsqrt( - rsqrt, - arg); + return + traits::Rsqrt< + T, + TArg> + ::rsqrt( + rsqrt, + arg); } namespace traits diff --git a/include/alpaka/math/sin/Traits.hpp b/include/alpaka/math/sin/Traits.hpp index f495040..4bd88fc 100644 --- a/include/alpaka/math/sin/Traits.hpp +++ b/include/alpaka/math/sin/Traits.hpp @@ -64,12 +64,13 @@ namespace alpaka sin, arg)) { - return traits::Sin< - T, - TArg> - ::sin( - sin, - arg); + return + traits::Sin< + T, + TArg> + ::sin( + sin, + arg); } namespace traits diff --git a/include/alpaka/math/sqrt/Traits.hpp b/include/alpaka/math/sqrt/Traits.hpp index 62bef93..1d8dc26 100644 --- a/include/alpaka/math/sqrt/Traits.hpp +++ b/include/alpaka/math/sqrt/Traits.hpp @@ -64,12 +64,13 @@ namespace alpaka sqrt, arg)) { - return traits::Sqrt< - T, - TArg> - ::sqrt( - sqrt, - arg); + return + traits::Sqrt< + T, + TArg> + ::sqrt( + sqrt, + arg); } namespace traits diff --git a/include/alpaka/math/tan/Traits.hpp b/include/alpaka/math/tan/Traits.hpp index 1dfb74d..2874be1 100644 --- a/include/alpaka/math/tan/Traits.hpp +++ b/include/alpaka/math/tan/Traits.hpp @@ -64,12 +64,13 @@ namespace alpaka tan, arg)) { - return traits::Tan< - T, - TArg> - ::tan( - tan, - arg); + return + traits::Tan< + T, + TArg> + ::tan( + tan, + arg); } namespace traits diff --git a/include/alpaka/math/trunc/Traits.hpp b/include/alpaka/math/trunc/Traits.hpp index f3e4015..0cbe5ec 100644 --- a/include/alpaka/math/trunc/Traits.hpp +++ b/include/alpaka/math/trunc/Traits.hpp @@ -64,12 +64,13 @@ namespace alpaka trunc, arg)) { - return traits::Trunc< - T, - TArg> - ::trunc( - trunc, - arg); + return + traits::Trunc< + T, + TArg> + ::trunc( + trunc, + arg); } namespace traits diff --git a/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp b/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp index bfcca80..631a707 100644 --- a/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp +++ b/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp @@ -1,6 +1,6 @@ /** * \file -* Copyright 2014-2015 Benjarint Worpitz +* Copyright 2014-2015 Benjarint Worpitz, Rene Widera * * This file is part of alpaka. * @@ -55,11 +55,10 @@ namespace alpaka std::is_floating_point::value>::type> { ALPAKA_FN_ACC_CUDA_ONLY static auto trunc( - TruncCudaBuiltIn const & trunc, + TruncCudaBuiltIn const & /*trunc*/, TArg const & arg) -> decltype(::trunc(arg)) { - boost::ignore_unused(trunc); return ::trunc(arg); } }; diff --git a/include/alpaka/mem/buf/BufCpu.hpp b/include/alpaka/mem/buf/BufCpu.hpp index d3a6002..8878c88 100644 --- a/include/alpaka/mem/buf/BufCpu.hpp +++ b/include/alpaka/mem/buf/BufCpu.hpp @@ -24,7 +24,7 @@ #include // dev::traits::DevType #include // mem::buf::Alloc, ... -#include // Vec +#include // Vec // \TODO: Remove CUDA inclusion for BufCpu by replacing pinning with non CUDA code! #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDACC__) @@ -252,6 +252,24 @@ namespace alpaka }; } } + namespace elem + { + namespace traits + { + //############################################################################# + //! The BufCpu memory element type get trait specialization. + //############################################################################# + template< + typename TElem, + typename TDim, + typename TSize> + struct ElemType< + mem::buf::BufCpu> + { + using type = TElem; + }; + } + } namespace extent { namespace traits @@ -287,18 +305,6 @@ namespace alpaka { namespace traits { - //############################################################################# - //! The BufCpu memory element type get trait specialization. - //############################################################################# - template< - typename TElem, - typename TDim, - typename TSize> - struct ElemType< - mem::buf::BufCpu> - { - using type = TElem; - }; //############################################################################# //! The BufCpu buf trait specialization. //############################################################################# @@ -411,7 +417,7 @@ namespace alpaka typename TDim, typename TSize> struct GetPitchBytes< - std::integral_constant, + dim::DimInt, mem::buf::BufCpu> { //----------------------------------------------------------------------------- @@ -548,7 +554,7 @@ namespace alpaka ALPAKA_CUDA_RT_CHECK_IGNORE( cudaHostRegister( const_cast(reinterpret_cast(mem::view::getPtrNative(buf))), - extent::getProductOfExtents(buf) * sizeof(mem::view::Elem>), + extent::getProductOfExtents(buf) * sizeof(elem::Elem>), cudaHostRegisterDefault), cudaErrorHostMemoryAlreadyRegistered); diff --git a/include/alpaka/mem/buf/BufCudaRt.hpp b/include/alpaka/mem/buf/BufCudaRt.hpp index ebf1ee5..04d5666 100644 --- a/include/alpaka/mem/buf/BufCudaRt.hpp +++ b/include/alpaka/mem/buf/BufCudaRt.hpp @@ -23,9 +23,9 @@ #include // dev::traits::DevType #include // dim::DimInt -#include // view::Copy, ... +#include // mem::view::Copy, ... -#include // Vec +#include // Vec #include // cudaMalloc, ... #include // assert @@ -80,7 +80,9 @@ namespace alpaka m_extentsElements(extent::getExtentsVecEnd(extents)), m_spMem( pMem, - std::bind(&BufCudaRt::freeBuffer, std::placeholders::_1, std::ref(m_dev))), + // NOTE: Because the BufCudaRt object can be copied and the original object could have been destroyed, + // a std::ref(m_dev) or a this pointer can not be bound to the callback because they are not always valid at time of destruction. + std::bind(&BufCudaRt::freeBuffer, std::placeholders::_1, m_dev)), m_pitchBytes(pitchBytes) { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; @@ -115,7 +117,7 @@ namespace alpaka } public: - dev::DevCudaRt m_dev; + dev::DevCudaRt m_dev; // NOTE: The device has to be destructed after the memory pointer because it is required for destruction. Vec m_extentsElements; std::shared_ptr m_spMem; TSize m_pitchBytes; @@ -179,6 +181,24 @@ namespace alpaka }; } } + namespace elem + { + namespace traits + { + //############################################################################# + //! The BufCudaRt memory element type get trait specialization. + //############################################################################# + template< + typename TElem, + typename TDim, + typename TSize> + struct ElemType< + mem::buf::BufCudaRt> + { + using type = TElem; + }; + } + } namespace extent { namespace traits @@ -214,18 +234,6 @@ namespace alpaka { namespace traits { - //############################################################################# - //! The BufCudaRt memory element type get trait specialization. - //############################################################################# - template< - typename TElem, - typename TDim, - typename TSize> - struct ElemType< - mem::buf::BufCudaRt> - { - using type = TElem; - }; //############################################################################# //! The BufCudaRt buf trait specialization. //############################################################################# @@ -338,7 +346,7 @@ namespace alpaka typename TDim, typename TSize> struct GetPitchBytes< - std::integral_constant, + dim::DimInt, mem::buf::BufCudaRt> { //----------------------------------------------------------------------------- @@ -753,7 +761,7 @@ namespace alpaka ALPAKA_CUDA_RT_CHECK( cudaHostRegister( const_cast(reinterpret_cast(mem::view::getPtrNative(buf))), - extent::getProductOfExtents(buf) * sizeof(mem::view::Elem>), + extent::getProductOfExtents(buf) * sizeof(elem::Elem>), cudaHostRegisterMapped)); } // If it is already the same device, nothing has to be mapped. diff --git a/include/alpaka/mem/buf/BufPlainPtrWrapper.hpp b/include/alpaka/mem/buf/BufPlainPtrWrapper.hpp index 3bc2ea1..d61f319 100644 --- a/include/alpaka/mem/buf/BufPlainPtrWrapper.hpp +++ b/include/alpaka/mem/buf/BufPlainPtrWrapper.hpp @@ -176,6 +176,25 @@ namespace alpaka }; } } + namespace elem + { + namespace traits + { + //############################################################################# + //! The BufPlainPtrWrapper memory element type get trait specialization. + //############################################################################# + template< + typename TDev, + typename TElem, + typename TDim, + typename TSize> + struct ElemType< + mem::buf::BufPlainPtrWrapper> + { + using type = TElem; + }; + } + } namespace extent { namespace traits @@ -210,20 +229,6 @@ namespace alpaka { namespace traits { - //############################################################################# - //! The BufPlainPtrWrapper memory element type get trait specialization. - //############################################################################# - template< - typename TDev, - typename TElem, - typename TDim, - typename TSize> - struct ElemType< - buf::BufPlainPtrWrapper> - { - using type = TElem; - }; - //############################################################################# //! The BufPlainPtrWrapper buf trait specialization. //############################################################################# @@ -233,15 +238,15 @@ namespace alpaka typename TDim, typename TSize> struct GetBuf< - buf::BufPlainPtrWrapper> + mem::buf::BufPlainPtrWrapper> { //----------------------------------------------------------------------------- //! //----------------------------------------------------------------------------- ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static auto getBuf( - buf::BufPlainPtrWrapper const & buf) - -> buf::BufPlainPtrWrapper const & + mem::buf::BufPlainPtrWrapper const & buf) + -> mem::buf::BufPlainPtrWrapper const & { return buf; } @@ -250,8 +255,8 @@ namespace alpaka //----------------------------------------------------------------------------- ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static auto getBuf( - buf::BufPlainPtrWrapper & buf) - -> buf::BufPlainPtrWrapper & + mem::buf::BufPlainPtrWrapper & buf) + -> mem::buf::BufPlainPtrWrapper & { return buf; } @@ -266,18 +271,18 @@ namespace alpaka typename TDim, typename TSize> struct GetPtrNative< - buf::BufPlainPtrWrapper> + mem::buf::BufPlainPtrWrapper> { ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static auto getPtrNative( - buf::BufPlainPtrWrapper const & buf) + mem::buf::BufPlainPtrWrapper const & buf) -> TElem const * { return buf.m_pMem; } ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static auto getPtrNative( - buf::BufPlainPtrWrapper & buf) + mem::buf::BufPlainPtrWrapper & buf) -> TElem * { return buf.m_pMem; @@ -293,12 +298,12 @@ namespace alpaka typename TDim, typename TSize> struct GetPitchBytes< - std::integral_constant, - buf::BufPlainPtrWrapper> + dim::DimInt, + mem::buf::BufPlainPtrWrapper> { ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static auto getPitchBytes( - buf::BufPlainPtrWrapper const & buf) + mem::buf::BufPlainPtrWrapper const & buf) -> TSize { return buf.m_pitchBytes; diff --git a/include/alpaka/mem/buf/BufStdContainers.hpp b/include/alpaka/mem/buf/BufStdContainers.hpp index 267970d..69885d9 100644 --- a/include/alpaka/mem/buf/BufStdContainers.hpp +++ b/include/alpaka/mem/buf/BufStdContainers.hpp @@ -97,6 +97,24 @@ namespace alpaka }; } } + namespace elem + { + namespace traits + { + //############################################################################# + //! The fixed size array memory element type get trait specialization. + //############################################################################# + template< + typename TFixedSizeArray> + struct ElemType< + TFixedSizeArray, + typename std::enable_if< + std::is_array::value>::type> + { + using type = typename std::remove_all_extents::type; + }; + } + } namespace extent { namespace traits @@ -136,19 +154,6 @@ namespace alpaka { namespace traits { - //############################################################################# - //! The fixed size array memory element type get trait specialization. - //############################################################################# - template< - typename TFixedSizeArray> - struct ElemType< - TFixedSizeArray, - typename std::enable_if< - std::is_array::value>::type> - { - using type = typename std::remove_all_extents::type; - }; - //############################################################################# //! The fixed size array buf trait specialization. //############################################################################# @@ -219,7 +224,7 @@ namespace alpaka template< typename TFixedSizeArray> struct GetPitchBytes< - std::integral_constant::value - 1u>, + dim::DimInt::value - 1u>, TFixedSizeArray, typename std::enable_if< std::is_array::value @@ -344,6 +349,23 @@ namespace alpaka }; } } + namespace elem + { + namespace traits + { + //############################################################################# + //! The std::array memory element type get trait specialization. + //############################################################################# + template< + typename TElem, + std::size_t Tsize> + struct ElemType< + std::array> + { + using type = TElem; + }; + } + } namespace extent { namespace traits @@ -355,7 +377,7 @@ namespace alpaka typename TElem, std::size_t Tsize> struct GetExtent< - std::integral_constant, + dim::DimInt<0u>, std::array> { //----------------------------------------------------------------------------- @@ -378,18 +400,6 @@ namespace alpaka { namespace traits { - //############################################################################# - //! The std::array memory element type get trait specialization. - //############################################################################# - template< - typename TElem, - std::size_t Tsize> - struct ElemType< - std::array> - { - using type = TElem; - }; - //############################################################################# //! The std::array buf trait specialization. //############################################################################# @@ -455,7 +465,7 @@ namespace alpaka typename TElem, std::size_t Tsize> struct GetPitchBytes< - std::integral_constant, + dim::DimInt<0u>, std::array> { //----------------------------------------------------------------------------- @@ -573,6 +583,23 @@ namespace alpaka }; } } + namespace elem + { + namespace traits + { + //############################################################################# + //! The std::vector memory element type get trait specialization. + //############################################################################# + template< + typename TElem, + typename TAllocator> + struct ElemType< + std::vector> + { + using type = TElem; + }; + } + } namespace extent { namespace traits @@ -584,7 +611,7 @@ namespace alpaka typename TElem, typename TAllocator> struct GetExtent< - std::integral_constant, + dim::DimInt<0u>, std::vector> { //----------------------------------------------------------------------------- @@ -605,18 +632,6 @@ namespace alpaka { namespace traits { - //############################################################################# - //! The std::vector memory element type get trait specialization. - //############################################################################# - template< - typename TElem, - typename TAllocator> - struct ElemType< - std::vector> - { - using type = TElem; - }; - //############################################################################# //! The std::vector buf trait specialization. //############################################################################# @@ -682,7 +697,7 @@ namespace alpaka typename TElem, typename TAllocator> struct GetPitchBytes< - std::integral_constant, + dim::DimInt<0u>, std::vector> { //----------------------------------------------------------------------------- diff --git a/include/alpaka/mem/buf/Traits.hpp b/include/alpaka/mem/buf/Traits.hpp index 9cbde10..fff347c 100644 --- a/include/alpaka/mem/buf/Traits.hpp +++ b/include/alpaka/mem/buf/Traits.hpp @@ -145,14 +145,15 @@ namespace alpaka dev, extents)) { - return traits::Alloc< - TElem, - dim::Dim, - TSize, - TDev> - ::alloc( - dev, - extents); + return + traits::Alloc< + TElem, + dim::Dim, + TSize, + TDev> + ::alloc( + dev, + extents); } //----------------------------------------------------------------------------- //! Maps the buffer into the memory of the given device. @@ -170,12 +171,13 @@ namespace alpaka TDev const & dev) -> void { - return traits::Map< - TBuf, - TDev> - ::map( - buf, - dev); + return + traits::Map< + TBuf, + TDev> + ::map( + buf, + dev); } //----------------------------------------------------------------------------- //! Unmaps the buffer from the memory of the given device. @@ -193,12 +195,13 @@ namespace alpaka TDev const & dev) -> void { - return traits::Unmap< - TBuf, - TDev> - ::unmap( - buf, - dev); + return + traits::Unmap< + TBuf, + TDev> + ::unmap( + buf, + dev); } //----------------------------------------------------------------------------- //! Pins the buffer. @@ -212,10 +215,11 @@ namespace alpaka TBuf & buf) -> void { - return traits::Pin< - TBuf> - ::pin( - buf); + return + traits::Pin< + TBuf> + ::pin( + buf); } //----------------------------------------------------------------------------- //! Unpins the buffer. @@ -229,10 +233,11 @@ namespace alpaka TBuf & buf) -> void { - return traits::Unpin< - TBuf> - ::unpin( - buf); + return + traits::Unpin< + TBuf> + ::unpin( + buf); } //----------------------------------------------------------------------------- //! The pin state of the buffer. @@ -246,10 +251,11 @@ namespace alpaka TBuf const & buf) -> bool { - return traits::IsPinned< - TBuf> - ::isPinned( - buf); + return + traits::IsPinned< + TBuf> + ::isPinned( + buf); } } } diff --git a/include/alpaka/mem/buf/cpu/Copy.hpp b/include/alpaka/mem/buf/cpu/Copy.hpp index 3de046b..86f6b14 100644 --- a/include/alpaka/mem/buf/cpu/Copy.hpp +++ b/include/alpaka/mem/buf/cpu/Copy.hpp @@ -23,7 +23,7 @@ #include // dim::DimInt #include // extent::getXXX -#include // view::Copy, ... +#include // mem::view::Copy, ... #include // stream::StreamCpuAsync #include // stream::StreamCpuSync @@ -71,7 +71,7 @@ namespace alpaka "The buffers and the extents are required to have the same dimensionality!"); // TODO: Maybe check for Size of TBufDst and TBufSrc to have greater or equal range than TExtents. static_assert( - std::is_same, typename std::remove_const>::type>::value, + std::is_same, typename std::remove_const>::type>::value, "The source and the destination buffers are required to have the same element type!"); //----------------------------------------------------------------------------- @@ -82,7 +82,7 @@ namespace alpaka TBufSrc const & bufSrc, TExtents const & extents) : m_extentWidth(extent::getWidth(extents)), - m_extentWidthBytes(static_cast(m_extentWidth * sizeof(mem::view::Elem))), + m_extentWidthBytes(static_cast(m_extentWidth * sizeof(elem::Elem))), m_dstWidth(static_cast(extent::getWidth(bufDst))), m_srcWidth(static_cast(extent::getWidth(bufSrc))), m_dstBufWidth(static_cast(extent::getWidth(mem::view::getBuf(bufDst)))), diff --git a/include/alpaka/mem/buf/cpu/Set.hpp b/include/alpaka/mem/buf/cpu/Set.hpp index ec6f6d4..f504582 100644 --- a/include/alpaka/mem/buf/cpu/Set.hpp +++ b/include/alpaka/mem/buf/cpu/Set.hpp @@ -22,8 +22,8 @@ #pragma once #include // dim::DimInt -#include // view::getXXX -#include // view::Set, ... +#include // mem::view::getXXX +#include // mem::view::Set, ... #include // stream::StreamCpuAsync #include // stream::StreamCpuSync @@ -93,7 +93,7 @@ namespace alpaka assert(extentHeight <= dstHeight); assert(extentDepth <= dstDepth); - auto const extentWidthBytes(extentWidth * sizeof(mem::view::Elem)); + auto const extentWidthBytes(extentWidth * sizeof(elem::Elem)); auto const dstPitchBytes(mem::view::getPitchBytes::value - 1u>(m_buf)); assert(extentWidthBytes <= dstPitchBytes); diff --git a/include/alpaka/mem/buf/cuda/Copy.hpp b/include/alpaka/mem/buf/cuda/Copy.hpp index 28ec2c2..ea4ca95 100644 --- a/include/alpaka/mem/buf/cuda/Copy.hpp +++ b/include/alpaka/mem/buf/cuda/Copy.hpp @@ -24,8 +24,8 @@ #include // dev::DevCpu #include // dev::DevCudaRt #include // dim::DimInt -#include // view::getXXX -#include // view::Copy +#include // mem::view::getXXX +#include // mem::view::Copy #include // stream::StreamCudaRtAsync #include // stream::StreamCudaRtSync @@ -74,7 +74,7 @@ namespace alpaka "The destination buffer and the extents are required to have the same dimensionality!"); // TODO: Maybe check for Size of TBufDst and TBufSrc to have greater or equal range than TExtents. static_assert( - std::is_same, typename std::remove_const>::type>::value, + std::is_same, typename std::remove_const>::type>::value, "The source and the destination buffers are required to have the same element type!"); using Size = size::Size; @@ -97,7 +97,7 @@ namespace alpaka m_dstWidth(static_cast(extent::getWidth(bufDst))), m_srcWidth(static_cast(extent::getWidth(bufSrc))), #endif - m_extentWidthBytes(static_cast(extent::getWidth(extents) * sizeof(mem::view::Elem))), + m_extentWidthBytes(static_cast(extent::getWidth(extents) * sizeof(elem::Elem))), m_dstMemNative(reinterpret_cast(mem::view::getPtrNative(bufDst))), m_srcMemNative(reinterpret_cast(mem::view::getPtrNative(bufSrc))) { @@ -159,7 +159,7 @@ namespace alpaka "The destination buffer and the extents are required to have the same dimensionality!"); // TODO: Maybe check for Size of TBufDst and TBufSrc to have greater or equal range than TExtents. static_assert( - std::is_same, typename std::remove_const>::type>::value, + std::is_same, typename std::remove_const>::type>::value, "The source and the destination buffers are required to have the same element type!"); using Size = size::Size; @@ -180,7 +180,7 @@ namespace alpaka #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL m_extentWidth(extent::getWidth(extents)), #endif - m_extentWidthBytes(static_cast(extent::getWidth(extents) * sizeof(mem::view::Elem))), + m_extentWidthBytes(static_cast(extent::getWidth(extents) * sizeof(elem::Elem))), m_dstWidth(static_cast(extent::getWidth(bufDst))), // required for 3D peer copy m_srcWidth(static_cast(extent::getWidth(bufSrc))), // required for 3D peer copy @@ -270,7 +270,7 @@ namespace alpaka "The destination buffer and the extents are required to have the same dimensionality!"); // TODO: Maybe check for Size of TBufDst and TBufSrc to have greater or equal range than TExtents. static_assert( - std::is_same, typename std::remove_const>::type>::value, + std::is_same, typename std::remove_const>::type>::value, "The source and the destination buffers are required to have the same element type!"); using Size = size::Size; @@ -291,7 +291,7 @@ namespace alpaka m_iSrcDevice(iSrcDevice), m_extentWidth(extent::getWidth(extents)), - m_extentWidthBytes(static_cast(m_extentWidth * sizeof(mem::view::Elem))), + m_extentWidthBytes(static_cast(m_extentWidth * sizeof(elem::Elem))), m_dstWidth(static_cast(extent::getWidth(bufDst))), m_srcWidth(static_cast(extent::getWidth(bufSrc))), @@ -378,7 +378,7 @@ namespace alpaka } //----------------------------------------------------------------------------- - // Trait specializations for view::TaskCopy. + // Trait specializations for TaskCopy. //----------------------------------------------------------------------------- namespace traits { diff --git a/include/alpaka/mem/buf/cuda/Set.hpp b/include/alpaka/mem/buf/cuda/Set.hpp index 2d91c43..2e929c9 100644 --- a/include/alpaka/mem/buf/cuda/Set.hpp +++ b/include/alpaka/mem/buf/cuda/Set.hpp @@ -1,6 +1,6 @@ /** * \file -* Copyright 2014-2015 Benjamin Worpitz +* Copyright 2014-2015 Benjamin Worpitz, Rene Widera * * This file is part of alpaka. * @@ -23,8 +23,8 @@ #include // dev::getDev #include // dim::DimInt -#include // view::getXXX -#include // view::Set +#include // mem::view::getXXX +#include // mem::view::Set #include // stream::Enqueue #include // cudaMemset, ... @@ -161,7 +161,7 @@ namespace alpaka auto const & iDevice(task.m_iDevice); auto const extentWidth(extent::getWidth(extents)); - auto const extentWidthBytes(extentWidth * sizeof(mem::view::Elem)); + auto const extentWidthBytes(extentWidth * sizeof(elem::Elem)); auto const dstWidth(extent::getWidth(buf)); auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(buf))); assert(extentWidth <= dstWidth); @@ -212,7 +212,7 @@ namespace alpaka auto const & iDevice(task.m_iDevice); auto const extentWidth(extent::getWidth(extents)); - auto const extentWidthBytes(extentWidth * sizeof(mem::view::Elem)); + auto const extentWidthBytes(extentWidth * sizeof(elem::Elem)); auto const dstWidth(extent::getWidth(buf)); auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(buf))); assert(extentWidth <= dstWidth); @@ -262,11 +262,11 @@ namespace alpaka auto const & iDevice(task.m_iDevice); auto const extentWidth(extent::getWidth(extents)); - auto const extentWidthBytes(extentWidth * sizeof(mem::view::Elem)); + auto const extentWidthBytes(extentWidth * sizeof(elem::Elem)); auto const extentHeight(extent::getHeight(extents)); auto const dstWidth(extent::getWidth(buf)); auto const dstHeight(extent::getHeight(buf)); - auto const dstPitchBytes(mem::view::getPitchBytes::value - 1u>>(buf)); + auto const dstPitchBytes(mem::view::getPitchBytes::value - 1u>(buf)); auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(buf))); assert(extentWidth <= dstWidth); assert(extentHeight <= dstHeight); @@ -319,11 +319,11 @@ namespace alpaka auto const & iDevice(task.m_iDevice); auto const extentWidth(extent::getWidth(extents)); - auto const extentWidthBytes(extentWidth * sizeof(mem::view::Elem)); + auto const extentWidthBytes(extentWidth * sizeof(elem::Elem)); auto const extentHeight(extent::getHeight(extents)); auto const dstWidth(extent::getWidth(buf)); auto const dstHeight(extent::getHeight(buf)); - auto const dstPitchBytes(mem::view::getPitchBytes::value - 1u>>(buf)); + auto const dstPitchBytes(mem::view::getPitchBytes::value - 1u>(buf)); auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(buf))); assert(extentWidth <= dstWidth); assert(extentHeight <= dstHeight); @@ -380,7 +380,7 @@ namespace alpaka auto const dstWidth(extent::getWidth(buf)); auto const dstHeight(extent::getHeight(buf)); auto const dstDepth(extent::getDepth(buf)); - auto const dstPitchBytes(mem::view::getPitchBytes::value - 1u>>(buf)); + auto const dstPitchBytes(mem::view::getPitchBytes::value - 1u>(buf)); auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(buf))); assert(extentWidth <= dstWidth); assert(extentHeight <= dstHeight); @@ -451,7 +451,7 @@ namespace alpaka auto const dstWidth(extent::getWidth(buf)); auto const dstHeight(extent::getHeight(buf)); auto const dstDepth(extent::getDepth(buf)); - auto const dstPitchBytes(mem::view::getPitchBytes::value - 1u>>(buf)); + auto const dstPitchBytes(mem::view::getPitchBytes::value - 1u>(buf)); auto const dstNativePtr(reinterpret_cast(mem::view::getPtrNative(buf))); assert(extentWidth <= dstWidth); assert(extentHeight <= dstHeight); diff --git a/include/alpaka/mem/view/Traits.hpp b/include/alpaka/mem/view/Traits.hpp index 7332586..17220d3 100644 --- a/include/alpaka/mem/view/Traits.hpp +++ b/include/alpaka/mem/view/Traits.hpp @@ -21,8 +21,9 @@ #pragma once -#include // dev::traits::DevType, ... -#include // dim::DimType +#include // dev::Dev, ... +#include // dim::Dim +#include // elem::Elem #include // extent::GetExtent #include // offset::GetOffset #include // stream::enqueue @@ -54,14 +55,6 @@ namespace alpaka typename TSfinae = void> struct ViewType; - //############################################################################# - //! The memory element type trait. - //############################################################################# - template< - typename TView, - typename TSfinae = void> - struct ElemType; - //############################################################################# //! The native pointer get trait. //############################################################################# @@ -101,7 +94,7 @@ namespace alpaka using IdxSequence = alpaka::core::detail::make_integer_sequence_offset::value - TIdx::value>; return extentsProd(view, IdxSequence()) - * sizeof(typename ElemType::type); + * sizeof(typename elem::Elem); } private: //----------------------------------------------------------------------------- @@ -163,13 +156,6 @@ namespace alpaka struct GetBuf; } - //############################################################################# - //! The memory element type trait alias template to remove the ::type. - //############################################################################# - template< - typename TView> - using Elem = typename std::remove_volatile::type>::type; - //############################################################################# //! The memory buffer view type trait alias template to remove the ::type. //############################################################################# @@ -190,12 +176,13 @@ namespace alpaka typename TBuf> ALPAKA_FN_HOST auto getPtrNative( TBuf const & buf) - -> Elem const * + -> elem::Elem const * { - return traits::GetPtrNative< - TBuf> - ::getPtrNative( - buf); + return + traits::GetPtrNative< + TBuf> + ::getPtrNative( + buf); } //----------------------------------------------------------------------------- //! Gets the native pointer of the memory buffer. @@ -207,12 +194,13 @@ namespace alpaka typename TBuf> ALPAKA_FN_HOST auto getPtrNative( TBuf & buf) - -> Elem * + -> elem::Elem * { - return traits::GetPtrNative< - TBuf> - ::getPtrNative( - buf); + return + traits::GetPtrNative< + TBuf> + ::getPtrNative( + buf); } //----------------------------------------------------------------------------- @@ -228,14 +216,15 @@ namespace alpaka ALPAKA_FN_HOST auto getPtrDev( TBuf const & buf, TDev const & dev) - -> Elem const * + -> elem::Elem const * { - return traits::GetPtrDev< - TBuf, - TDev> - ::getPtrDev( - buf, - dev); + return + traits::GetPtrDev< + TBuf, + TDev> + ::getPtrDev( + buf, + dev); } //----------------------------------------------------------------------------- //! Gets the pointer to the buffer on the given device. @@ -250,14 +239,15 @@ namespace alpaka ALPAKA_FN_HOST auto getPtrDev( TBuf & buf, TDev const & dev) - -> Elem * + -> elem::Elem * { - return traits::GetPtrDev< - TBuf, - TDev> - ::getPtrDev( - buf, - dev); + return + traits::GetPtrDev< + TBuf, + TDev> + ::getPtrDev( + buf, + dev); } //----------------------------------------------------------------------------- @@ -272,7 +262,7 @@ namespace alpaka { return traits::GetPitchBytes< - std::integral_constant, + dim::DimInt, TView> ::getPitchBytes( buf); @@ -374,7 +364,7 @@ namespace alpaka dim::Dim::value == dim::Dim::value, "The destination buffer and the extents are required to have the same dimensionality!"); static_assert( - std::is_same, typename std::remove_const>::type>::value, + std::is_same, typename std::remove_const>::type>::value, "The source and the destination buffers are required to have the same element type!"); return @@ -578,7 +568,7 @@ namespace alpaka { ALPAKA_FN_HOST static auto print( TView const & view, - Elem const * const ptr, + elem::Elem const * const ptr, Vec, size::Size> const & extents, std::ostream & os, std::string const & elementSeparator, @@ -598,7 +588,7 @@ namespace alpaka TView> ::print( view, - reinterpret_cast const *>(reinterpret_cast(ptr)+i*pitch), + reinterpret_cast const *>(reinterpret_cast(ptr)+i*pitch), extents, os, elementSeparator, @@ -627,7 +617,7 @@ namespace alpaka { ALPAKA_FN_HOST static auto print( TView const & view, - Elem const * const ptr, + elem::Elem const * const ptr, Vec, size::Size> const & extents, std::ostream & os, std::string const & elementSeparator, @@ -676,7 +666,7 @@ namespace alpaka TView> ::print( view, - view::getPtrNative(view), + mem::view::getPtrNative(view), extent::getExtentsVec(view), os, elementSeparator, diff --git a/include/alpaka/mem/view/ViewBasic.hpp b/include/alpaka/mem/view/ViewBasic.hpp index a1822d9..bcce45c 100644 --- a/include/alpaka/mem/view/ViewBasic.hpp +++ b/include/alpaka/mem/view/ViewBasic.hpp @@ -23,7 +23,7 @@ #include // Dim #include // Dev -#include // view::getXXX +#include // mem::view::getXXX #include #include // traits::getOffsetX #include // size::traits::SizeType @@ -68,7 +68,7 @@ namespace alpaka using Dev = TDev; using Elem = TElem; using Dim = TDim; - using Buf = buf::BufPlainPtrWrapper; + using Buf = mem::buf::BufPlainPtrWrapper; // If the value type is const, we store a const buffer. //using BufC = detail::MimicConst; @@ -88,7 +88,9 @@ namespace alpaka mem::view::getPitchBytes(buf)), m_vOffsetsElements(offset::getOffsetsVecEnd(buf)), m_extentsElements(extent::getExtentsVecEnd(buf)) - {} + { + ALPAKA_DEBUG_FULL_LOG_SCOPE; + } //----------------------------------------------------------------------------- //! Constructor. //! \param buf This can be either a memory buffer or a memory view. @@ -105,6 +107,8 @@ namespace alpaka m_vOffsetsElements(offset::getOffsetsVecEnd(buf)), m_extentsElements(extent::getExtentsVecEnd(buf)) { + ALPAKA_DEBUG_FULL_LOG_SCOPE; + static_assert( std::is_same>::value, "The size type of TBuf and the TSize template parameter have to be identical!"); @@ -132,6 +136,8 @@ namespace alpaka m_extentsElements(extent::getExtentsVecEnd(extentsElements)), m_vOffsetsElements(offset::getOffsetsVecEnd(relativeOffsetsElements) + offset::getOffsetsVecEnd(buf)) { + ALPAKA_DEBUG_FULL_LOG_SCOPE; + static_assert( std::is_same>::value, "The buffer and the extents are required to have the same dimensionality!"); @@ -171,6 +177,8 @@ namespace alpaka m_extentsElements(extent::getExtentsVecEnd(extentsElements)), m_vOffsetsElements(offset::getOffsetsVecEnd(relativeOffsetsElements) + offset::getOffsetsVecEnd(buf)) { + ALPAKA_DEBUG_FULL_LOG_SCOPE; + static_assert( std::is_same>::value, "The buffer and the extents are required to have the same dimensionality!"); @@ -262,6 +270,25 @@ namespace alpaka }; } } + namespace elem + { + namespace traits + { + //############################################################################# + //! The ViewBasic memory element type get trait specialization. + //############################################################################# + template< + typename TElem, + typename TDim, + typename TDev, + typename TSize> + struct ElemType< + mem::view::ViewBasic> + { + using type = TElem; + }; + } + } namespace extent { namespace traits @@ -298,20 +325,6 @@ namespace alpaka { namespace traits { - //############################################################################# - //! The ViewBasic memory element type get trait specialization. - //############################################################################# - template< - typename TElem, - typename TDim, - typename TDev, - typename TSize> - struct ElemType< - mem::view::ViewBasic> - { - using type = TElem; - }; - //############################################################################# //! The memory buffer view creation type trait. //############################################################################# @@ -508,7 +521,7 @@ namespace alpaka { return offset::getOffset(view) - * view::getPitchBytes(buf); + * mem::view::getPitchBytes(buf); } }; @@ -533,7 +546,7 @@ namespace alpaka -> TSize { return - view::getPitchBytes( + mem::view::getPitchBytes( mem::view::getBuf(view)); } }; diff --git a/include/alpaka/offset/Traits.hpp b/include/alpaka/offset/Traits.hpp index f316cbc..ef04c72 100644 --- a/include/alpaka/offset/Traits.hpp +++ b/include/alpaka/offset/Traits.hpp @@ -83,7 +83,7 @@ namespace alpaka { return traits::GetOffset< - std::integral_constant, + dim::DimInt, TOffsets> ::getOffset( offsets); @@ -139,7 +139,7 @@ namespace alpaka -> void { traits::SetOffset< - std::integral_constant, + dim::DimInt, TOffsets, TOffset> ::setOffset( @@ -200,7 +200,7 @@ namespace alpaka template< typename TOffsets> struct GetOffset< - std::integral_constant, + dim::DimInt<0u>, TOffsets, typename std::enable_if< std::is_integral::value>::type> @@ -220,7 +220,7 @@ namespace alpaka typename TOffsets, typename TOffset> struct SetOffset< - std::integral_constant, + dim::DimInt<0u>, TOffsets, TOffset, typename std::enable_if< diff --git a/include/alpaka/rand/RandCuRand.hpp b/include/alpaka/rand/RandCuRand.hpp index b567125..32ba6b2 100644 --- a/include/alpaka/rand/RandCuRand.hpp +++ b/include/alpaka/rand/RandCuRand.hpp @@ -36,6 +36,15 @@ namespace alpaka { namespace rand { + //############################################################################# + //! The CUDA rand implementation. + //############################################################################# + class RandCuRand + { + public: + using RandBase = RandCuRand; + }; + namespace generator { namespace cuda @@ -237,22 +246,18 @@ namespace alpaka //! The CUDA random number float normal distribution get trait specialization. //############################################################################# template< - typename TAcc, typename T> struct CreateNormalReal< - TAcc, + RandCuRand, T, typename std::enable_if< - std::is_same< - dev::Dev, - dev::DevCudaRt>::value - && std::is_floating_point::value>::type> + std::is_floating_point::value>::type> { //----------------------------------------------------------------------------- // //----------------------------------------------------------------------------- ALPAKA_FN_ACC_CUDA_ONLY static auto createNormalReal( - TAcc const & acc) + RandCuRand const & /*rand*/) -> rand::distribution::cuda::NormalReal { return rand::distribution::cuda::NormalReal(); @@ -262,22 +267,18 @@ namespace alpaka //! The CUDA random number float uniform distribution get trait specialization. //############################################################################# template< - typename TAcc, typename T> struct CreateUniformReal< - TAcc, + RandCuRand, T, typename std::enable_if< - std::is_same< - dev::Dev, - dev::DevCudaRt>::value - && std::is_floating_point::value>::type> + std::is_floating_point::value>::type> { //----------------------------------------------------------------------------- // //----------------------------------------------------------------------------- ALPAKA_FN_ACC_CUDA_ONLY static auto createUniformReal( - TAcc const & acc) + RandCuRand const & /*rand*/) -> rand::distribution::cuda::UniformReal { return rand::distribution::cuda::UniformReal(); @@ -287,22 +288,18 @@ namespace alpaka //! The CUDA random number integer uniform distribution get trait specialization. //############################################################################# template< - typename TAcc, typename T> struct CreateUniformUint< - TAcc, + RandCuRand, T, typename std::enable_if< - std::is_same< - dev::Dev, - dev::DevCudaRt>::value - && std::is_integral::value>::type> + std::is_integral::value>::type> { //----------------------------------------------------------------------------- // //----------------------------------------------------------------------------- ALPAKA_FN_ACC_CUDA_ONLY static auto createUniformUint( - TAcc const & acc) + RandCuRand const & /*rand*/) -> rand::distribution::cuda::UniformUint { return rand::distribution::cuda::UniformUint(); @@ -317,20 +314,15 @@ namespace alpaka //############################################################################# //! The CUDA random number default generator get trait specialization. //############################################################################# - template< - typename TAcc> + template<> struct CreateDefault< - TAcc, - typename std::enable_if< - std::is_same< - dev::Dev, - dev::DevCudaRt>::value>::type> + RandCuRand> { //----------------------------------------------------------------------------- // //----------------------------------------------------------------------------- ALPAKA_FN_ACC_CUDA_ONLY static auto createDefault( - TAcc const & acc, + RandCuRand const & /*rand*/, std::uint32_t const & seed, std::uint32_t const & subsequence) -> rand::generator::cuda::Xor diff --git a/include/alpaka/rand/RandStl.hpp b/include/alpaka/rand/RandStl.hpp index 922406d..e6393bd 100644 --- a/include/alpaka/rand/RandStl.hpp +++ b/include/alpaka/rand/RandStl.hpp @@ -25,6 +25,8 @@ #include // ALPAKA_FN_HOST_ACC +#include // boost::ignore_unused + #include // std::mt19937, std::uniform_real_distribution, ... #include // std::enable_if @@ -32,6 +34,15 @@ namespace alpaka { namespace rand { + //############################################################################# + //! The standard library rand implementation. + //############################################################################# + class RandStl + { + public: + using RandBase = RandStl; + }; + namespace distribution { namespace traits @@ -40,24 +51,21 @@ namespace alpaka //! The CPU device random number float normal distribution get trait specialization. //############################################################################# template< - typename TAcc, typename T> struct CreateNormalReal< - TAcc, + RandStl, T, typename std::enable_if< - std::is_same< - dev::Dev, - dev::DevCpu>::value - && std::is_floating_point::value>::type> + std::is_floating_point::value>::type> { //----------------------------------------------------------------------------- // //----------------------------------------------------------------------------- ALPAKA_FN_ACC_NO_CUDA static auto createNormalReal( - TAcc const & acc) + RandStl const & rand) -> std::normal_distribution { + boost::ignore_unused(rand); return std::normal_distribution(); } }; @@ -65,24 +73,21 @@ namespace alpaka //! The CPU device random number float uniform distribution get trait specialization. //############################################################################# template< - typename TAcc, typename T> struct CreateUniformReal< - TAcc, + RandStl, T, typename std::enable_if< - std::is_same< - dev::Dev, - dev::DevCpu>::value - && std::is_floating_point::value>::type> + std::is_floating_point::value>::type> { //----------------------------------------------------------------------------- // //----------------------------------------------------------------------------- ALPAKA_FN_ACC_NO_CUDA static auto createUniformReal( - TAcc const & acc) + RandStl const & rand) -> std::uniform_real_distribution { + boost::ignore_unused(rand); return std::uniform_real_distribution(); } }; @@ -90,24 +95,21 @@ namespace alpaka //! The CPU device random number integer uniform distribution get trait specialization. //############################################################################# template< - typename TAcc, typename T> struct CreateUniformUint< - TAcc, + RandStl, T, typename std::enable_if< - std::is_same< - dev::Dev, - dev::DevCpu>::value - && std::is_integral::value>::type> + std::is_integral::value>::type> { //----------------------------------------------------------------------------- // //----------------------------------------------------------------------------- ALPAKA_FN_ACC_NO_CUDA static auto createUniformUint( - TAcc const & acc) + RandStl const & rand) -> std::uniform_int_distribution { + boost::ignore_unused(rand); return std::uniform_int_distribution( 0, // For signed integer: std::numeric_limits::lowest() std::numeric_limits::max()); @@ -122,24 +124,20 @@ namespace alpaka //############################################################################# //! The CPU device random number default generator get trait specialization. //############################################################################# - template< - typename TAcc> + template<> struct CreateDefault< - TAcc, - typename std::enable_if< - std::is_same< - dev::Dev, - dev::DevCpu>::value>::type> + RandStl> { //----------------------------------------------------------------------------- // //----------------------------------------------------------------------------- ALPAKA_FN_ACC_NO_CUDA static auto createDefault( - TAcc const & acc, + RandStl const & rand, std::uint32_t const & seed, std::uint32_t const & subsequence) -> std::mt19937 { + boost::ignore_unused(rand); // NOTE: XOR the seed and the subsequence to generate a unique seed. return std::mt19937(seed ^ subsequence); } diff --git a/include/alpaka/rand/Traits.hpp b/include/alpaka/rand/Traits.hpp index a9a6025..7e91282 100644 --- a/include/alpaka/rand/Traits.hpp +++ b/include/alpaka/rand/Traits.hpp @@ -47,7 +47,7 @@ namespace alpaka //! The random number float normal distribution get trait. //############################################################################# template< - typename TAcc, + typename TRand, typename T, typename TSfinae = void> struct CreateNormalReal; @@ -56,7 +56,7 @@ namespace alpaka //! The random number float uniform distribution get trait. //############################################################################# template< - typename TAcc, + typename TRand, typename T, typename TSfinae = void> struct CreateUniformReal; @@ -65,7 +65,7 @@ namespace alpaka //! The random number integer uniform distribution get trait. //############################################################################# template< - typename TAcc, + typename TRand, typename T, typename TSfinae = void> struct CreateUniformUint; @@ -77,25 +77,26 @@ namespace alpaka ALPAKA_NO_HOST_ACC_WARNING template< typename T, - typename TAcc> + typename TRand> ALPAKA_FN_HOST_ACC auto createNormalReal( - TAcc const & acc) + TRand const & rand) -> decltype( traits::CreateNormalReal< - TAcc, + TRand, T> ::createNormalReal( - std::declval())) + std::declval())) { static_assert( std::is_floating_point::value, "The value type T has to be a floating point type!"); - return traits::CreateNormalReal< - TAcc, - T> - ::createNormalReal( - acc); + return + traits::CreateNormalReal< + TRand, + T> + ::createNormalReal( + rand); } //----------------------------------------------------------------------------- //! \return A uniform floating point distribution [0.0, 1.0). @@ -103,25 +104,26 @@ namespace alpaka ALPAKA_NO_HOST_ACC_WARNING template< typename T, - typename TAcc> + typename TRand> ALPAKA_FN_HOST_ACC auto createUniformReal( - TAcc const & acc) + TRand const & rand) -> decltype( traits::CreateUniformReal< - TAcc, + TRand, T> ::createUniformReal( - std::declval())) + std::declval())) { static_assert( std::is_floating_point::value, "The value type T has to be a floating point type!"); - return traits::CreateUniformReal< - TAcc, - T> - ::createUniformReal( - acc); + return + traits::CreateUniformReal< + TRand, + T> + ::createUniformReal( + rand); } //----------------------------------------------------------------------------- //! \return A uniform integer distribution [0, UINT_MAX]. @@ -129,25 +131,116 @@ namespace alpaka ALPAKA_NO_HOST_ACC_WARNING template< typename T, - typename TAcc> + typename TRand> ALPAKA_FN_HOST_ACC auto createUniformUint( - TAcc const & acc) + TRand const & rand) -> decltype( traits::CreateUniformUint< - TAcc, + TRand, T> ::createUniformUint( - std::declval())) + std::declval())) { static_assert( std::is_integral::value && std::is_unsigned::value, "The value type T has to be a unsigned integral type!"); - return traits::CreateUniformUint< - TAcc, - T> - ::createUniformUint( - acc); + return + traits::CreateUniformUint< + TRand, + T> + ::createUniformUint( + rand); + } + namespace traits + { + //############################################################################# + //! The CreateNormalReal specialization for classes with RandBase member type. + //############################################################################# + template< + typename TRand, + typename T> + struct CreateNormalReal< + TRand, + T, + typename std::enable_if< + std::is_base_of::type>::value + && (!std::is_same::type>::value)>::type> + { + //----------------------------------------------------------------------------- + // + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + ALPAKA_FN_HOST_ACC static auto createNormalReal( + TRand const & rand) + -> decltype( + rand::distribution::createNormalReal( + static_cast(rand))) + { + // Delegate the call to the base class. + return + rand::distribution::createNormalReal( + static_cast(rand)); + } + }; + //############################################################################# + //! The CreateUniformReal specialization for classes with RandBase member type. + //############################################################################# + template< + typename TRand, + typename T> + struct CreateUniformReal< + TRand, + T, + typename std::enable_if< + std::is_base_of::type>::value + && (!std::is_same::type>::value)>::type> + { + //----------------------------------------------------------------------------- + // + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + ALPAKA_FN_HOST_ACC static auto createUniformReal( + TRand const & rand) + -> decltype( + rand::distribution::createUniformReal( + static_cast(rand))) + { + // Delegate the call to the base class. + return + rand::distribution::createUniformReal( + static_cast(rand)); + } + }; + //############################################################################# + //! The CreateUniformUint specialization for classes with RandBase member type. + //############################################################################# + template< + typename TRand, + typename T> + struct CreateUniformUint< + TRand, + T, + typename std::enable_if< + std::is_base_of::type>::value + && (!std::is_same::type>::value)>::type> + { + //----------------------------------------------------------------------------- + // + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + ALPAKA_FN_HOST_ACC static auto createUniformUint( + TRand const & rand) + -> decltype( + rand::distribution::createUniformUint( + static_cast(rand))) + { + // Delegate the call to the base class. + return + rand::distribution::createUniformUint( + static_cast(rand)); + } + }; } } //----------------------------------------------------------------------------- @@ -164,7 +257,7 @@ namespace alpaka //! The random number default generator get trait. //############################################################################# template< - typename TAcc, + typename TRand, typename TSfinae = void> struct CreateDefault; } @@ -173,25 +266,62 @@ namespace alpaka //----------------------------------------------------------------------------- ALPAKA_NO_HOST_ACC_WARNING template< - typename TAcc> + typename TRand> ALPAKA_FN_HOST_ACC auto createDefault( - TAcc const & acc, + TRand const & rand, std::uint32_t const & seed, std::uint32_t const & subsequence) -> decltype( traits::CreateDefault< - TAcc> + TRand> ::createDefault( - std::declval(), + std::declval(), std::declval(), std::declval())) { - return traits::CreateDefault< - TAcc> - ::createDefault( - acc, - seed, - subsequence); + return + traits::CreateDefault< + TRand> + ::createDefault( + rand, + seed, + subsequence); + } + namespace traits + { + //############################################################################# + //! The CreateDefault specialization for classes with RandBase member type. + //############################################################################# + template< + typename TRand> + struct CreateDefault< + TRand, + typename std::enable_if< + std::is_base_of::type>::value + && (!std::is_same::type>::value)>::type> + { + //----------------------------------------------------------------------------- + // + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + ALPAKA_FN_HOST_ACC static auto createDefault( + TRand const & rand, + std::uint32_t const & seed, + std::uint32_t const & subsequence) + -> decltype( + rand::generator::createDefault( + static_cast(rand), + seed, + subsequence)) + { + // Delegate the call to the base class. + return + rand::generator::createDefault( + static_cast(rand), + seed, + subsequence); + } + }; } } } diff --git a/include/alpaka/size/Traits.hpp b/include/alpaka/size/Traits.hpp index 950d8f8..954efab 100644 --- a/include/alpaka/size/Traits.hpp +++ b/include/alpaka/size/Traits.hpp @@ -21,7 +21,7 @@ #pragma once -#include // std::enable_if, std::is_arithmetic +#include // std::enable_if, std::is_arithmetic namespace alpaka { diff --git a/include/alpaka/stream/StreamCpuAsync.hpp b/include/alpaka/stream/StreamCpuAsync.hpp index 52d6e82..e8ad0b3 100644 --- a/include/alpaka/stream/StreamCpuAsync.hpp +++ b/include/alpaka/stream/StreamCpuAsync.hpp @@ -223,6 +223,7 @@ namespace alpaka { //############################################################################# //! The CPU async device stream enqueue trait specialization. + //! This default implementation for all tasks directly invokes the function call operator of the task. //############################################################################# template< typename TTask> diff --git a/include/alpaka/stream/StreamCpuSync.hpp b/include/alpaka/stream/StreamCpuSync.hpp index 93b88d1..bee87b8 100644 --- a/include/alpaka/stream/StreamCpuSync.hpp +++ b/include/alpaka/stream/StreamCpuSync.hpp @@ -198,6 +198,7 @@ namespace alpaka { //############################################################################# //! The CPU sync device stream enqueue trait specialization. + //! This default implementation for all tasks directly invokes the function call operator of the task. //############################################################################# template< typename TTask> diff --git a/include/alpaka/vec/Vec.hpp b/include/alpaka/vec/Vec.hpp index db6f763..9cec6a6 100644 --- a/include/alpaka/vec/Vec.hpp +++ b/include/alpaka/vec/Vec.hpp @@ -42,6 +42,12 @@ #include // std::enable_if #include // std::min, std::max, std::min_element, std::max_element +#define ALPAKA_CREATE_VEC_IN_CLASS +// The nvcc compiler does not support the out of class version. +#ifdef __CUDACC__ + #define ALPAKA_CREATE_VEC_IN_CLASS +#endif + namespace alpaka { template< @@ -49,7 +55,7 @@ namespace alpaka typename TSize> class Vec; -#ifndef __CUDACC__ +#ifndef ALPAKA_CREATE_VEC_IN_CLASS //----------------------------------------------------------------------------- //! Single value constructor helper. //----------------------------------------------------------------------------- @@ -171,10 +177,10 @@ namespace alpaka ALPAKA_FN_HOST_ACC Vec( TArg0 && arg0, TArgs && ... args) : - m_auiData{std::forward(arg0), std::forward(args)...} + m_data{std::forward(arg0), std::forward(args)...} {} -#ifdef __CUDACC__ +#ifdef ALPAKA_CREATE_VEC_IN_CLASS //----------------------------------------------------------------------------- //! Creator using func(args...) to initialize all values of the vector. //----------------------------------------------------------------------------- @@ -294,7 +300,7 @@ namespace alpaka { return createVecFromIndexedFn< -#ifndef __CUDACC__ +#ifndef ALPAKA_CREATE_VEC_IN_CLASS TDim, #endif CreateSingleVal>( @@ -335,7 +341,7 @@ namespace alpaka core::assertValueUnsigned(iIdx); auto const idx(static_cast(iIdx)); assert(idx(iIdx)); assert(idx decltype( -#if (BOOST_COMP_GNUC) && (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0)) +#if (BOOST_COMP_GNUC && (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0))) || __INTEL_COMPILER this->foldrByIndices( #else foldrByIndices( @@ -483,10 +489,10 @@ namespace alpaka return static_cast( std::distance( - std::begin(m_auiData), + std::begin(m_data), std::min_element( - std::begin(m_auiData), - std::end(m_auiData)))); + std::begin(m_data), + std::end(m_data)))); } //----------------------------------------------------------------------------- //! \return The index of the maximal element. @@ -498,32 +504,17 @@ namespace alpaka return static_cast( std::distance( - std::begin(m_auiData), + std::begin(m_data), std::max_element( - std::begin(m_auiData), - std::end(m_auiData)))); + std::begin(m_data), + std::end(m_data)))); } private: // 16 Byte alignment for usage inside of CUDA kernels. - alignas(16u) TSize m_auiData[TDim::value]; + alignas(16u) TSize m_data[TDim::value]; }; - template< - typename TSize> - using Vec1 = Vec, TSize>; - - template< - typename TSize> - using Vec2 = Vec, TSize>; - - template< - typename TSize> - using Vec3 = Vec, TSize>; - - template< - typename TSize> - using Vec4 = Vec, TSize>; namespace detail { @@ -563,11 +554,11 @@ namespace alpaka -> Vec { return -#ifdef __CUDACC__ +#ifdef ALPAKA_CREATE_VEC_IN_CLASS Vec::template #endif createVecFromIndexedFn< -#ifndef __CUDACC__ +#ifndef ALPAKA_CREATE_VEC_IN_CLASS TDim, #endif detail::CreateAdd>( @@ -613,11 +604,11 @@ namespace alpaka -> Vec { return -#ifdef __CUDACC__ +#ifdef ALPAKA_CREATE_VEC_IN_CLASS Vec::template #endif createVecFromIndexedFn< -#ifndef __CUDACC__ +#ifndef ALPAKA_CREATE_VEC_IN_CLASS TDim, #endif detail::CreateMul>( @@ -771,15 +762,15 @@ namespace alpaka //----------------------------------------------------------------------------- ALPAKA_NO_HOST_ACC_WARNING template< - typename TValNew, + typename TSizeNew, typename TDim, typename TSize> ALPAKA_FN_HOST_ACC static auto create( - TValNew const &/* valNew*/, + TSizeNew const &/* valNew*/, Vec const & vec) - -> TValNew + -> TSizeNew { - return static_cast(vec[Tidx]); + return static_cast(vec[Tidx]); } }; @@ -789,22 +780,22 @@ namespace alpaka //----------------------------------------------------------------------------- ALPAKA_NO_HOST_ACC_WARNING template< - typename TValNew, + typename TSizeNew, typename TDim, typename TSize> ALPAKA_FN_HOST_ACC static auto cast(Vec const & other) - -> Vec + -> Vec { return -#ifdef __CUDACC__ - Vec::template +#ifdef ALPAKA_CREATE_VEC_IN_CLASS + Vec::template #endif createVecFromIndexedFn< -#ifndef __CUDACC__ +#ifndef ALPAKA_CREATE_VEC_IN_CLASS TDim, #endif detail::CreateCast>( - TValNew(), + TSizeNew(), other); } } @@ -844,11 +835,11 @@ namespace alpaka -> Vec { return -#ifdef __CUDACC__ +#ifdef ALPAKA_CREATE_VEC_IN_CLASS Vec::template #endif createVecFromIndexedFn< -#ifndef __CUDACC__ +#ifndef ALPAKA_CREATE_VEC_IN_CLASS TDim, #endif detail::CreateReverse>( @@ -891,11 +882,11 @@ namespace alpaka -> Vec, size::Size> { return -#ifdef __CUDACC__ +#ifdef ALPAKA_CREATE_VEC_IN_CLASS Vec, size::Size>::template #endif createVecFromIndexedFn< -#ifndef __CUDACC__ +#ifndef ALPAKA_CREATE_VEC_IN_CLASS dim::Dim, #endif detail::CreateExtent>( @@ -914,11 +905,11 @@ namespace alpaka { using IdxOffset = std::integral_constant::value)-((std::intmax_t)TDim::value)>; return -#ifdef __CUDACC__ +#ifdef ALPAKA_CREATE_VEC_IN_CLASS Vec>::template #endif createVecFromIndexedFnOffset< -#ifndef __CUDACC__ +#ifndef ALPAKA_CREATE_VEC_IN_CLASS TDim, #endif detail::CreateExtent, @@ -963,11 +954,11 @@ namespace alpaka -> Vec, size::Size> { return -#ifdef __CUDACC__ +#ifdef ALPAKA_CREATE_VEC_IN_CLASS Vec, size::Size>::template #endif createVecFromIndexedFn< -#ifndef __CUDACC__ +#ifndef ALPAKA_CREATE_VEC_IN_CLASS dim::Dim, #endif detail::CreateOffset>( @@ -986,11 +977,11 @@ namespace alpaka { using IdxOffset = std::integral_constant::value)-((std::intmax_t)TDim::value))>; return -#ifdef __CUDACC__ +#ifdef ALPAKA_CREATE_VEC_IN_CLASS Vec>::template #endif createVecFromIndexedFnOffset< -#ifndef __CUDACC__ +#ifndef ALPAKA_CREATE_VEC_IN_CLASS TDim, #endif detail::CreateOffset, diff --git a/include/alpaka/workdiv/Traits.hpp b/include/alpaka/workdiv/Traits.hpp index 15947f6..b848ed1 100644 --- a/include/alpaka/workdiv/Traits.hpp +++ b/include/alpaka/workdiv/Traits.hpp @@ -76,17 +76,44 @@ namespace alpaka namespace traits { + //############################################################################# + //! The WorkDivMembers grid block extents trait specialization for classes with WorkDivBase member type. + //############################################################################# + template< + typename TWorkDiv> + struct GetWorkDiv< + TWorkDiv, + origin::Grid, + unit::Blocks, + typename std::enable_if< + std::is_base_of::type>::value + && (!std::is_same::type>::value)>::type> + { + //----------------------------------------------------------------------------- + //! \return The number of threads in each dimension of a block. + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + ALPAKA_FN_HOST_ACC static auto getWorkDiv( + TWorkDiv const & workDiv) + -> Vec, size::Size> + { + // Delegate the call to the base class. + return + workdiv::getWorkDiv< + origin::Grid, + unit::Blocks>( + static_cast(workDiv)); + } + }; //############################################################################# //! The WorkDivMembers block thread extents trait specialization for classes with WorkDivBase member type. //############################################################################# template< - typename TWorkDiv, - typename TOrigin, - typename TUnit> + typename TWorkDiv> struct GetWorkDiv< TWorkDiv, - TOrigin, - TUnit, + origin::Block, + unit::Threads, typename std::enable_if< std::is_base_of::type>::value && (!std::is_same::type>::value)>::type> @@ -102,8 +129,8 @@ namespace alpaka // Delegate the call to the base class. return workdiv::getWorkDiv< - TOrigin, - TUnit>( + origin::Block, + unit::Threads>( static_cast(workDiv)); } };