From 66f2e539b2a48903b3a5fa9243c48e0dd5b28082 Mon Sep 17 00:00:00 2001
From: Erik Zenker <erikzenker@posteo.de>
Date: Mon, 5 Oct 2015 19:05:02 +0200
Subject: [PATCH] Squashed 'include/alpaka/' changes from 955fa97..9dfdb96

9dfdb96 fix missing OpenMP link flag
b9f099c fix foldrAll ICC bug
83ddac5 disable the OpenMP 4 back-end by default
8644064 fix Vec for Intel
819e5d9 fix boost 1.56 missing const bug
f9cd663 really fix Intel cpuid
330d983 remove incorrect docu
9f1b692 fix Intel compiler cpuid
1aa4c86 fix missing OMP_NUM_THREADS reset in getMaxOmpThreads
328e866 fix CUDA compilation
33c7888 remove ICC from the readme (untested / not compiling)
40a8465 always interpret all source files as .cu files for nvcc
25f4670 allow vectorize to be called without the element type
882c0a9 enhance documentation
05454a6 fix ambiguous template specialization for GetWorkDiv
5b70326 remove call to std::ref in BlockSharedAllocCudaBuiltIn
e15c40a fix fix AtomicOmpCritSec
afffe2f fix wrong atomic implementation for AccCpuOmp2Blocks
2a60bbb fix BufCudaRt destruction
062378d add ALPAKA_ADD_EXECUTABLE to alpakaConfig.cmake
b9a4125 use DimInt more consistently
919dc26 move ElemType from mem::view to elem
2807fc8 add initial ALPAKA_ADD_EXECUTABLE
f019e70 fix BufPlainPtrWrapper pitch
1ca1923 fix missing OpenMP linker flag
9ee231d fix getFreeGlobalMemSizeBytes
7e853c6 Merge pull request #54 from psychocoderHPC/fix-cudaSet
6796eff Merge pull request #55 from psychocoderHPC/fix-callingHostFunctionFromDevice
9f3d8e6 fix warning calling host function from device
000a250 fix wrong usage of `getPitchBytes<>()`
8be955d Merge pull request #53 from psychocoderHPC/topic-suppressHostDeviceWarning
b7c877d Merge pull request #52 from psychocoderHPC/tpoic-updateGitIgnore
0b94251 suppress host device warning
33a59be update `.gitignore`
237898f refactoring
d0ad945 implement getFreeGlobalMemSizeBytes
f85e233 allow accelerators to inherit from rand implementation
d96e8b5 fix CUDA set implemenentation

git-subtree-dir: include/alpaka
git-subtree-split: 9dfdb96b0cb2fc32a1f2e447de755905f7538bf4
---
 .gitignore                                    |  13 ++
 .travis.yml                                   |  56 ++---
 README.md                                     |  33 ++-
 alpakaConfig.cmake                            |  21 +-
 cmake/addExecutable.cmake                     |  39 ++++
 doc/Abstraction.md                            | 127 -----------
 doc/Distinction.md                            | 125 -----------
 doc/Threading.md                              |  57 -----
 examples/mandelbrot/CMakeLists.txt            |  42 +---
 examples/mandelbrot/src/main.cpp              |   6 +-
 examples/mandelbrot/src/main.cu               |  22 --
 examples/matMul/CMakeLists.txt                |  42 +---
 examples/matMul/src/main.cpp                  |   6 +-
 examples/matMul/src/main.cu                   |  22 --
 examples/sharedMem/CMakeLists.txt             |  42 +---
 examples/sharedMem/src/main.cu                |  22 --
 examples/vectorAdd/CMakeLists.txt             |  42 +---
 examples/vectorAdd/src/main.cpp               |   2 +-
 examples/vectorAdd/src/main.cu                |  22 --
 include/alpaka/acc/AccCpuFibers.hpp           |   5 +-
 include/alpaka/acc/AccCpuOmp2Blocks.hpp       |  11 +-
 include/alpaka/acc/AccCpuOmp2Threads.hpp      |   5 +-
 include/alpaka/acc/AccCpuOmp4.hpp             |   5 +-
 include/alpaka/acc/AccCpuSerial.hpp           |   5 +-
 include/alpaka/acc/AccCpuThreads.hpp          |   7 +-
 include/alpaka/acc/AccGpuCudaRt.hpp           |   7 +-
 include/alpaka/atomic/Traits.hpp              |  17 +-
 .../shared/BlockSharedAllocCudaBuiltIn.hpp    |   2 +-
 include/alpaka/block/shared/Traits.hpp        |  24 +-
 .../block/sync/BlockSyncCudaBuiltIn.hpp       |   2 +-
 .../block/sync/BlockSyncFiberIdMapBarrier.hpp |   4 +-
 include/alpaka/block/sync/BlockSyncNoOp.hpp   |   3 +-
 .../sync/BlockSyncThreadIdMapBarrier.hpp      |   4 +-
 include/alpaka/core/ConcurrentExecPool.hpp    |   5 +-
 include/alpaka/core/Cuda.hpp                  |  24 +-
 include/alpaka/core/Fold.hpp                  |   8 +-
 include/alpaka/core/NdLoop.hpp                |   2 +-
 include/alpaka/core/OpenMp.hpp                |  15 +-
 include/alpaka/core/Vectorize.hpp             |  73 +++---
 include/alpaka/dev/DevCpu.hpp                 |   7 +-
 include/alpaka/dev/Traits.hpp                 |  36 +--
 include/alpaka/dev/cpu/SysInfo.hpp            |  75 +++++--
 include/alpaka/dim/DimArithmetic.hpp          |   2 +-
 include/alpaka/elem/Traits.hpp                |  72 ++++++
 include/alpaka/event/EventCpu.hpp             |   6 +-
 include/alpaka/event/Traits.hpp               |   9 +-
 include/alpaka/exec/ExecCpuOmp2Blocks.hpp     |   4 +-
 include/alpaka/exec/ExecCpuOmp4.hpp           |   4 +-
 include/alpaka/exec/ExecGpuCudaRt.hpp         |   6 +-
 include/alpaka/extent/Traits.hpp              |   8 +-
 include/alpaka/idx/Traits.hpp                 |  30 +--
 include/alpaka/idx/bt/IdxBtOmp.hpp            |   2 +-
 include/alpaka/math/abs/Traits.hpp            |  13 +-
 include/alpaka/math/acos/Traits.hpp           |  13 +-
 include/alpaka/math/asin/Traits.hpp           |  13 +-
 include/alpaka/math/atan/Traits.hpp           |  13 +-
 include/alpaka/math/atan2/Traits.hpp          |  17 +-
 include/alpaka/math/cbrt/Traits.hpp           |  13 +-
 include/alpaka/math/ceil/Traits.hpp           |  13 +-
 include/alpaka/math/cos/Traits.hpp            |  13 +-
 include/alpaka/math/erf/Traits.hpp            |  13 +-
 include/alpaka/math/exp/Traits.hpp            |  13 +-
 include/alpaka/math/floor/Traits.hpp          |  13 +-
 include/alpaka/math/fmod/Traits.hpp           |  17 +-
 include/alpaka/math/log/Traits.hpp            |  13 +-
 include/alpaka/math/max/Traits.hpp            |  17 +-
 include/alpaka/math/min/Traits.hpp            |  17 +-
 include/alpaka/math/pow/Traits.hpp            |  17 +-
 include/alpaka/math/remainder/Traits.hpp      |  17 +-
 include/alpaka/math/round/Traits.hpp          |  39 ++--
 include/alpaka/math/rsqrt/Traits.hpp          |  13 +-
 include/alpaka/math/sin/Traits.hpp            |  13 +-
 include/alpaka/math/sqrt/Traits.hpp           |  13 +-
 include/alpaka/math/tan/Traits.hpp            |  13 +-
 include/alpaka/math/trunc/Traits.hpp          |  13 +-
 .../alpaka/math/trunc/TruncCudaBuiltIn.hpp    |   5 +-
 include/alpaka/mem/buf/BufCpu.hpp             |  36 +--
 include/alpaka/mem/buf/BufCudaRt.hpp          |  44 ++--
 include/alpaka/mem/buf/BufPlainPtrWrapper.hpp |  55 ++---
 include/alpaka/mem/buf/BufStdContainers.hpp   |  99 ++++----
 include/alpaka/mem/buf/Traits.hpp             |  70 +++---
 include/alpaka/mem/buf/cpu/Copy.hpp           |   6 +-
 include/alpaka/mem/buf/cpu/Set.hpp            |   6 +-
 include/alpaka/mem/buf/cuda/Copy.hpp          |  18 +-
 include/alpaka/mem/buf/cuda/Set.hpp           |  22 +-
 include/alpaka/mem/view/Traits.hpp            |  86 ++++---
 include/alpaka/mem/view/ViewBasic.hpp         |  51 +++--
 include/alpaka/offset/Traits.hpp              |   8 +-
 include/alpaka/rand/RandCuRand.hpp            |  50 ++---
 include/alpaka/rand/RandStl.hpp               |  56 +++--
 include/alpaka/rand/Traits.hpp                | 212 ++++++++++++++----
 include/alpaka/size/Traits.hpp                |   2 +-
 include/alpaka/stream/StreamCpuAsync.hpp      |   1 +
 include/alpaka/stream/StreamCpuSync.hpp       |   1 +
 include/alpaka/vec/Vec.hpp                    |  97 ++++----
 include/alpaka/workdiv/Traits.hpp             |  41 +++-
 96 files changed, 1237 insertions(+), 1308 deletions(-)
 create mode 100644 cmake/addExecutable.cmake
 delete mode 100644 doc/Abstraction.md
 delete mode 100644 doc/Distinction.md
 delete mode 100644 doc/Threading.md
 delete mode 100644 examples/mandelbrot/src/main.cu
 delete mode 100644 examples/matMul/src/main.cu
 delete mode 100644 examples/sharedMem/src/main.cu
 delete mode 100644 examples/vectorAdd/src/main.cu
 create mode 100644 include/alpaka/elem/Traits.hpp

diff --git a/.gitignore b/.gitignore
index d943d9a..921e1af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,16 @@
 !/doc/doxygen/Doxyfile
 /doc/latex/*
 **/build
+
+# tmp files
+*~
+
+# netbeans project files
+/nbproject/
+
+# Code::Blocks project files
+/*.cbp
+/*.layout
+
+# original backup files
+*.orig
diff --git a/.travis.yml b/.travis.yml
index d7d4bb3..3601c49 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -45,7 +45,7 @@ compiler:
 #   [clang++] ALPAKA_CLANG_VER                  : {3.5, 3.6, 3.7}
 #   [g++] ALPAKA_GCC_VER                        : {4.9, 5}
 # ALPAKA_BOOST_BRANCH                           : {boost-1.56.0, boost-1.57.0, boost-1.58.0, boost-1.59.0, develop}
-# ALPAKA_CMAKE_VER                              : {2.8.12.2, 3.0.2, 3.1.3, 3.2.2, 3.3.1}
+# ALPAKA_CMAKE_VER                              : {3.3.0, 3.3.1, 3.3.2}
 # CMAKE_BUILD_TYPE                              : {Debug, Release}
 # ALPAKA_DEBUG                                  : {0, 1, 2}
 # ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE             : {ON, OFF}
@@ -73,28 +73,28 @@ env:
         - ALPAKA_CLANG_LIBSTDCPP_VERSION=4.9
 
     matrix:
-        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.1    CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=4 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.56.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON
-        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=3.2.2    CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=3 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.57.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON
-        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.2.2    CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=2 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.58.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
-        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON  ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.1.3    CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.59.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
-        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON  ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2    CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
-
-        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.1    CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.58.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON
-        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=3.2.2    CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=3 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.59.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON
-        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.0.2    CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=2 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.56.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
-        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=2.8.12.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=1 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.57.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
-        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2    CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
+        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=4 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.56.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON
+        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=3 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.57.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON
+        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.0 CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=2 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.58.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
+        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON  ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.59.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
+        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON  ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
+
+        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.0 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.58.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON
+        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=3 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.59.0 ALPAKA_ACC_GPU_CUDA_ENABLE=ON
+        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=2 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=2 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.6 ALPAKA_BOOST_BRANCH=boost-1.56.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
+        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=0 ALPAKA_CMAKE_VER=3.3.0 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=1 ALPAKA_GCC_VER=4.9 ALPAKA_CLANG_VER=3.5 ALPAKA_BOOST_BRANCH=boost-1.57.0 ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
+        - ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
 
 matrix:
     allow_failures:
         - compiler: gcc
-          env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON  ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2    CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
+          env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON  ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
         - compiler: gcc
-          env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2    CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
+          env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
         - compiler: clang
-          env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON  ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2    CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
+          env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=ON  ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.1 CMAKE_BUILD_TYPE=Debug   OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
         - compiler: clang
-          env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.2.2    CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
+          env: ALPAKA_CUDA_VERSION=7.0 ALPAKA_ANALYSIS=OFF ALPAKA_DEBUG=1 ALPAKA_CMAKE_VER=3.3.2 CMAKE_BUILD_TYPE=Release OMP_NUM_THREADS=4 ALPAKA_GCC_VER=5   ALPAKA_CLANG_VER=3.7 ALPAKA_BOOST_BRANCH=develop      ALPAKA_ACC_GPU_CUDA_ENABLE=OFF
         - os: osx
 
 branches:
@@ -314,25 +314,11 @@ install:
     - echo ${ALPAKA_CMAKE_VER_MAJOR}
     - ALPAKA_CMAKE_VER_MINOR=${ALPAKA_CMAKE_VER:2:1}
     - echo ${ALPAKA_CMAKE_VER_MINOR}
-    # NOTE: Older versions have to be built from source because CMake does not provide older x64 binaries.
-    - if (( (( ${ALPAKA_CMAKE_VER_MAJOR} < 3 )) || ( (( ${ALPAKA_CMAKE_VER_MAJOR} == 3 )) && (( ${ALPAKA_CMAKE_VER_MINOR} < 1 )) ) ))
-      ;then
-          ALPAKA_CMAKE_PKG_FILE_NAME=cmake-${ALPAKA_CMAKE_VER}
-          && wget http://www.cmake.org/files/v${ALPAKA_CMAKE_VER_MAJOR}.${ALPAKA_CMAKE_VER_MINOR}/${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz
-          && tar -xzf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz
-          && cd cmake-${ALPAKA_CMAKE_VER}
-          && ./bootstrap --prefix=/usr
-          && make
-          && sudo make install
-          && cd ../
-          && sudo rm -rf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz ${ALPAKA_CMAKE_PKG_FILE_NAME}
-      ;else
-          ALPAKA_CMAKE_PKG_FILE_NAME=cmake-${ALPAKA_CMAKE_VER}-Linux-x86_64
-          && wget http://www.cmake.org/files/v${ALPAKA_CMAKE_VER_MAJOR}.${ALPAKA_CMAKE_VER_MINOR}/${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz
-          && tar -xzf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz
-          && sudo cp -fR cmake-${ALPAKA_CMAKE_VER}-Linux-x86_64/* /usr
-          && sudo rm -rf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz ${ALPAKA_CMAKE_PKG_FILE_NAME}
-      ;fi
+    - ALPAKA_CMAKE_PKG_FILE_NAME=cmake-${ALPAKA_CMAKE_VER}-Linux-x86_64
+    - wget http://www.cmake.org/files/v${ALPAKA_CMAKE_VER_MAJOR}.${ALPAKA_CMAKE_VER_MINOR}/${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz
+      && tar -xzf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz
+      && sudo cp -fR cmake-${ALPAKA_CMAKE_VER}-Linux-x86_64/* /usr
+      && sudo rm -rf ${ALPAKA_CMAKE_PKG_FILE_NAME}.tar.gz ${ALPAKA_CMAKE_PKG_FILE_NAME}
 
     #-------------------------------------------------------------------------------
     # Clone boost.
diff --git a/README.md b/README.md
index 55658ea..e5421a2 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ Accelerator Back-ends
 |Serial|n/a|Host CPU (single core)|sequential|sequential (only 1 thread per block)|
 |OpenMP 2.0 blocks|OpenMP 2.0|Host CPU (multi core)|parallel (preemptive multitasking)|sequential (only 1 thread per block)|
 |OpenMP 2.0 threads|OpenMP 2.0|Host CPU (multi core)|sequential|parallel (preemptive multitasking)|
-|OpenMP 4.0|OpenMP 4.0|Host CPU (multi core)|parallel (undefined)|parallel (preemptive multitasking)|
+|OpenMP 4.0 (CPU)|OpenMP 4.0|Host CPU (multi core)|parallel (undefined)|parallel (preemptive multitasking)|
 | std::thread | std::thread |Host CPU (multi core)|sequential|parallel (preemptive multitasking)|
 | Boost.Fiber | boost::fibers::fiber |Host CPU (single core)|sequential|parallel (cooperative multitasking)|
 |CUDA 7.0|CUDA 7.0|NVIDIA GPUs SM 2.0+|parallel (undefined)|parallel (lock-step within warps)|
@@ -55,15 +55,15 @@ Supported Compilers
 
 This library uses C++11 (or newer when available).
 
-|Accelerator Back-end|gcc 4.9.2|gcc 5.2|clang 3.5/3.6|clang 3.7|MSVC 2015|icc 15.0+ (untested)|
-|---|---|---|---|---|---|---|
-|Serial|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
-|OpenMP 2.0 blocks|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
-|OpenMP 2.0 threads|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
-|OpenMP 4.0|:white_check_mark:|:white_check_mark:|:x:|:x:|:x:|:white_check_mark:|
-| std::thread |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
-| Boost.Fiber |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
-|CUDA 7.0|:white_check_mark:|:x:|:x:|:x:|:x:|:white_check_mark:|
+|Accelerator Back-end|gcc 4.9.2|gcc 5.2|clang 3.5/3.6|clang 3.7|MSVC 2015|
+|---|---|---|---|---|---|
+|Serial|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
+|OpenMP 2.0 blocks|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|
+|OpenMP 2.0 threads|:white_check_mark:|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|
+|OpenMP 4.0 (CPU)|:white_check_mark:|:white_check_mark:|:x:|:x:|:x:|
+| std::thread |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
+| Boost.Fiber |:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|
+|CUDA 7.0|:white_check_mark:|:x:|:x:|:x:|:x:|
 
 **NOTE**: :bangbang: Currently the *CUDA accelerator back-end* can not be enabled together with the *std::thread accelerator back-end* or the *Boost.Fiber accelerator back-end* due to bugs in the NVIDIA nvcc compiler :bangbang:
 
@@ -91,17 +91,14 @@ Usage
 -----
 
 The library is header only so nothing has to be build.
-Only the include path (`-I` or `export CPLUS_INCLUDE_PATH=`) has to be set to `<PATH-TO-ALPAKA-LIB>/include/`.
-This allows to include the whole alpaka library with: `#include <alpaka/alpaka.hpp>`
+CMake 3.3.0+ is required to provide the correct defines and include paths.
+Just call `ALPAKA_ADD_EXECUTABLE` instead of `CUDA_ADD_EXECUTABLE` or `ADD_EXECUTABLE` and the difficulties of the CUDA nvcc compier in handling `.cu` and `.cpp` files is automatically taken care of.
+Examples how to utilize alpaka within CMake can be found in the `examples` folder.
 
+Source files do not need any special file ending.
+The whole alpaka library can be included with: `#include <alpaka/alpaka.hpp>`
 Code not intended to be utilized by users is hidden in the `detail` namespace.
 
-If you are building with the *CUDA accelerator back-end* enabled, your source files are required to have the ending `.cu` to comply with the nvcc (NVIDIA CUDA C++ compiler) rules for code files using CUDA.
-When the *CUDA accelerator back-end* is disabled, this is not required and a `.cpp` extension is enough.
-To allow both use-cases, it is desirable to have both, a `.cpp` file with the implementation and a `.cu` file containing only `#include <PATH/TO/IMPL.cpp>` to forward to the implementation.
-The build system then has to use the `.cu` files when the *CUDA accelerator back-end* is enabled and the `.cpp` files else.
-Examples how to do this with CMake can be found in the `examples` folder.
-
 
 Authors
 -------
diff --git a/alpakaConfig.cmake b/alpakaConfig.cmake
index f6f59b1..b97af32 100644
--- a/alpakaConfig.cmake
+++ b/alpakaConfig.cmake
@@ -44,6 +44,7 @@ UNSET(_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC)
 UNSET(_ALPAKA_LINK_LIBRARIES_PUBLIC)
 UNSET(_ALPAKA_LINK_FLAGS_PUBLIC)
 UNSET(_ALPAKA_COMMON_FILE)
+UNSET(_ALPAKA_ADD_EXECUTABLE_FILE)
 UNSET(_ALPAKA_FILES_HEADER)
 UNSET(_ALPAKA_FILES_SOURCE)
 UNSET(_ALPAKA_FILES_OTHER)
@@ -75,6 +76,10 @@ LIST(APPEND CMAKE_MODULE_PATH "${_ALPAKA_ROOT_DIR}/cmake/modules")
 SET(_ALPAKA_COMMON_FILE "${_ALPAKA_ROOT_DIR}/cmake/common.cmake")
 INCLUDE("${_ALPAKA_COMMON_FILE}")
 
+# Add ALPAKA_ADD_EXECUTABLE function.
+SET(_ALPAKA_ADD_EXECUTABLE_FILE "${_ALPAKA_ROOT_DIR}/cmake/addExecutable.cmake")
+INCLUDE("${_ALPAKA_ADD_EXECUTABLE_FILE}")
+
 #-------------------------------------------------------------------------------
 # Options.
 #-------------------------------------------------------------------------------
@@ -83,7 +88,7 @@ OPTION(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE "Enable the threads CPU block threa
 OPTION(ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE "Enable the fibers CPU block thread accelerator" ON)
 OPTION(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE "Enable the OpenMP 2.0 CPU grid block accelerator" ON)
 OPTION(ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE "Enable the OpenMP 2.0 CPU block thread accelerator" ON)
-OPTION(ALPAKA_ACC_CPU_BT_OMP4_ENABLE "Enable the OpenMP 4.0 CPU block and block thread accelerator" ON)
+OPTION(ALPAKA_ACC_CPU_BT_OMP4_ENABLE "Enable the OpenMP 4.0 CPU block and block thread accelerator" OFF)
 OPTION(ALPAKA_ACC_GPU_CUDA_ENABLE "Enable the CUDA GPU accelerator" ON)
 
 # Drop-down combo box in cmake-gui.
@@ -185,13 +190,12 @@ IF(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE OR ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE OR A
 
     ELSE()
         SET(_ALPAKA_COMPILE_OPTIONS_PUBLIC ${OpenMP_CXX_FLAGS})
+        IF(NOT MSVC)
+            SET(_ALPAKA_LINK_FLAGS_PUBLIC ${OpenMP_CXX_FLAGS})
+        ENDIF()
         # CUDA requires some special handling
         IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
             SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-        ELSE()
-            IF(NOT MSVC)
-                SET(_ALPAKA_LINK_FLAGS_PUBLIC ${OpenMP_CXX_FLAGS})
-            ENDIF()
         ENDIF()
     ENDIF()
 ENDIF()
@@ -356,7 +360,7 @@ SET(_ALPAKA_SUFFIXED_INCLUDE_DIR "${_ALPAKA_INCLUDE_DIRECTORY}/alpaka")
 SET(_ALPAKA_LINK_LIBRARY)
 LIST(APPEND _ALPAKA_LINK_LIBRARIES_PUBLIC "${_ALPAKA_LINK_LIBRARY}")
 
-SET(_ALPAKA_FILES_OTHER "${_ALPAKA_ROOT_DIR}/alpakaConfig.cmake" "${_ALPAKA_COMMON_FILE}" "${_ALPAKA_ROOT_DIR}/.travis.yml" "${_ALPAKA_ROOT_DIR}/README.md")
+SET(_ALPAKA_FILES_OTHER "${_ALPAKA_ROOT_DIR}/alpakaConfig.cmake" "${_ALPAKA_ADD_EXECUTABLE_FILE}" "${_ALPAKA_COMMON_FILE}" "${_ALPAKA_ROOT_DIR}/.travis.yml" "${_ALPAKA_ROOT_DIR}/README.md")
 
 # Add all the source and include files in all recursive subdirectories and group them accordingly.
 append_recursive_files_add_to_src_group("${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "${_ALPAKA_SUFFIXED_INCLUDE_DIR}" "hpp" "_ALPAKA_FILES_HEADER")
@@ -453,7 +457,8 @@ LIST(APPEND alpaka_DEFINITIONS ${_ALPAKA_COMPILE_OPTIONS_PUBLIC})
 SET(alpaka_INCLUDE_DIR ${_ALPAKA_INCLUDE_DIRECTORY})
 SET(alpaka_INCLUDE_DIRS ${_ALPAKA_INCLUDE_DIRECTORIES_PUBLIC})
 SET(alpaka_LIBRARY ${_ALPAKA_LINK_LIBRARY})
-SET(alpaka_LIBRARIES ${_ALPAKA_LINK_LIBRARIES_PUBLIC})
+SET(alpaka_LIBRARIES ${_ALPAKA_LINK_FLAGS_PUBLIC})
+LIST(APPEND alpaka_LIBRARIES ${_ALPAKA_LINK_LIBRARIES_PUBLIC})
 
 #-------------------------------------------------------------------------------
 # Print the return values.
@@ -491,6 +496,7 @@ IF(NOT _ALPAKA_FOUND)
     UNSET(_ALPAKA_LINK_LIBRARIES_PUBLIC)
     UNSET(_ALPAKA_LINK_FLAGS_PUBLIC)
     UNSET(_ALPAKA_COMMON_FILE)
+    UNSET(_ALPAKA_ADD_EXECUTABLE_FILE)
     UNSET(_ALPAKA_FILES_HEADER)
     UNSET(_ALPAKA_FILES_SOURCE)
     UNSET(_ALPAKA_FILES_OTHER)
@@ -512,6 +518,7 @@ ELSE()
         _ALPAKA_LINK_LIBRARIES_PUBLIC
         _ALPAKA_LINK_FLAGS_PUBLIC
         _ALPAKA_COMMON_FILE
+        _ALPAKA_ADD_EXECUTABLE_FILE
         _ALPAKA_FILES_HEADER
         _ALPAKA_FILES_SOURCE
         _ALPAKA_FILES_OTHER
diff --git a/cmake/addExecutable.cmake b/cmake/addExecutable.cmake
new file mode 100644
index 0000000..90a5fa4
--- /dev/null
+++ b/cmake/addExecutable.cmake
@@ -0,0 +1,39 @@
+################################################################################
+# Copyright 2015 Benjamin Worpitz
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
+# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
+# USE OR PERFORMANCE OF THIS SOFTWARE.
+################################################################################
+
+# CUDA_SOURCE_PROPERTY_FORMAT is only supported starting from 3.3.0.
+CMAKE_MINIMUM_REQUIRED(VERSION 3.3.0)
+
+#------------------------------------------------------------------------------
+# Calls CUDA_ADD_EXECUTABLE or ADD_EXECUTABLE depending on the enabled alpaka accelerators.
+#------------------------------------------------------------------------------
+FUNCTION(ALPAKA_ADD_EXECUTABLE In_Name)
+    IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
+        FOREACH(_file ${ARGN})
+            IF((${_file} MATCHES "\\.cpp$") OR (${_file} MATCHES "\\.cxx$"))
+                SET_SOURCE_FILES_PROPERTIES(${_file} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+            ENDIF()
+        ENDFOREACH()
+        CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords.
+        CUDA_ADD_EXECUTABLE(
+            ${In_Name}
+            ${ARGN})
+    ELSE()
+        ADD_EXECUTABLE(
+            ${In_Name}
+            ${ARGN})
+    ENDIF()
+ENDFUNCTION()
diff --git a/doc/Abstraction.md b/doc/Abstraction.md
deleted file mode 100644
index 770b021..0000000
--- a/doc/Abstraction.md
+++ /dev/null
@@ -1,127 +0,0 @@
-Abstraction
-===========
-
-Parallelism and memory hierarchies at all levels need to be exploited in order to achieve performance portability across various types of accelerators.
-
-On the hardware side we have nodes with multiple sockets/processors extended by accelerators like GPUs or Intel Xeon Phi each with their own processing units.
-Within a CPU or a Intel Xeon Phi there are cores with hyper-threads and vector units, within a GPU there are many small cores.
-Each of those entities has access to different memories in the hierarchy.
-For example each socket/processor manages its RAM and the cores additionally have access to L3, L2 and L1 caches.
-On a GPU there is global, constant and shared memory.
-**alpaka** is designed to abstract from these differences without sacrificing speed by defining a domain decomposition for the computation domain.
-This domain decomposition is abstract enough to map optimally to all (currently known) accelerators.
-The **alpaka** library hides the mapping to the underlying hardware together with the execution primitives like kernel-threads and fibers allowing accelerator independent performance and portability.
-
-A process running on a multi-socket node is the largest entity within **alpaka**.
-The library itself only abstracts the task and data parallel execution on the process/node level and down.
-It does not provide any primitives for inter-node communication but such libraries can build upon **alpaka**.
-The process always has a main thread and is by definition running on the host.
-It can access the host memory and various accelerator devices.
-Such accelerator devices can be GPUs, Intel Xeon Phi, the host itself or other hardware.
-Thus the host not necessarily has to be different from the accelerator device used to compute on.
-For example a Intel Xeon Phi simultaneously can be the host and the accelerator device.
-**alpaka** also allows the parallel execution on nodes without any accelerator hardware.
-The node the process is running on can itself be used as an accelerator device.
-
-**alpaka** can be used to offload the parallel execution of task and data parallel work simultaneously onto different accelerator devices.
-
-
-Task Parallelism
-----------------
-
-With the concept of streams known from CUDA, where each stream is a queue of sequential tasks, but streams can be processed in parallel, **alpaka** provides an implementation of task parallelism.
-Events that can be enqueued to the streams enhance this basic task parallelism by allowing synchronization between different streams, devices or host threads.
-
-
-Data Parallelism
-----------------
-
-The main part of **alpaka** is the way it abstracts data parallelism.
-The work is divided into a 1 to 3 dimensional grid of uniform threads appropriate to the problem at hand.
-The uniform function executed by each of the threads is called a kernel.
-
-The threads are organized hierarchically and can access local memory at each of the hierarchy levels.
-All these higher levels are hidden in the internals of the accelerator implementations and their execution order can not be controlled.
-
-The abstraction used extends the CUDA grid-blocks-threads division strategy explained below by further allowing to facilitate vectorization.
-This extended *redundant hierarchical parallelism* scheme is discussed in the paper [The Future of Accelerator Programming: Abstraction, Performance or Can We Have Both?](http://dx.doi.org/10.1109/ICPADS.2013.76) ([PDF](http://olab.is.s.u-tokyo.ac.jp/~kamil.rocki/rocki_burtscher_sac14.pdf)).
-
-### Thread Hierarchy
-
-#### Grid
-
-The whole grid consists of uniform threads each executing the same kernel.
-By default threads do not have a cheap way to communicate safely within the whole grid.
-This forces decoupling of threads and avoids global interaction and global dependencies.
-This independence allows scattering of work blocks within the grid of worker threads and their independent execution on separate processing units. 
-Utilizing this property on the higher level allows applications to scale very well.
-All threads within the grid can access a global memory.
-
-#### Block
-
-A block is a group of threads.
-The whole grid is subdivided into equal sized blocks.
-Threads within a block can synchronize and have a fast but small shared memory.
-This allows for fast interaction on a local scale.
-
-**TODO**: Why blocks?
-
-#### Thread
-
-Each thread executes the same kernel.
-The only difference is the index into the grid which allows each thread to compute a different part of the solution.
-
-**TODO**: more
-
-#### Vectorization (SIMD)
-
-To use the maximum available computing power of a x86 core the computation has to exploit the vector registers.
-
-Because of the x86 SIMD intrinsics (`<xmmintrin.h>`) not being portable, we have to rely on the loop vectorization capabilities of the compiler.
-
-The best solution to vectorization would be one, where the user does not have to do anything.
-This is not possible because the smallest unit the user supplies is a kernel which is executed in threads which can synchronize.
-It is not possible to hide the vectorization by starting a kernel-thread for e.g. each 4th thread in a block and then looping over the 4 entries.
-This would prohibit the synchronization between these threads.
-By executing 4 fibers inside such a vectorization kernel-thread we would allow synchronization again but prevent the loop vectorizer from working.
-
-The only possible usage of vectorization is one where we create a kernel-thread for e.g. each 4th thread in a block but do not loop over the 4 threads ourself but rely on the user to implement loops that can be vectorized safely.
-
-### Memory Hierarchy
-
-#### Global Memory
-
-The global memory can be accessed from every thread executing on an accelerator.
-This is typically the largest but also the slowest memory available.
-
-#### Shared Memory
-
-Each block has its own shared memory.
-This memory can only be accessed by threads within the same block and gets discarded after the complete block finished its calculation.
-This memory is typically very fast but also very small.
-Sharing has to be done explicitly.
-No variables are shared between kernels by default.
-
-#### Registers
-
-This memory is local to each thread.
-All variables with default scope defined inside a kernel are automatically saved in registers and not shared automatically.
-
-**TODO**: Constant Memory, Texture Memory?
-
-Mapping *Redundant Hierarchical Parallelism* onto Hardware
---------------------------------------------------------
-
-By providing an accelerator independent interface for kernels, their execution and memory access at different hierarchy levels **alpaka** allows the user to write accelerator independent code that does not neglect performance.
-
-The hard part, the mapping of the decomposition to the execution environment is handled by the **alpaka** library.
-The decomposition of the computation in use can not be mapped one to one to any existing hardware.
-GPUs do not have vector registers where multiple values of type `int` or `float` can be manipulated by one instruction.
-Newer versions of CUDA only implement basic SIMD instructions "on pairs of 16-bit values and quads of 8-bit values". 
-They are described in the documentation of the [NVIDIA GPU instruction set architecture](http://docs.nvidia.com/cuda/pdf/ptx_isa_3.2.pdf) chapter 8.7.13 but are only of any use in very special problem domains.
-So the vector level is omitted on the CUDA accelerator.
-CPUs in turn are not (currently) capable of running thousands of threads concurrently.
-Furthermore CPUs do not have an equivalently fast inter-thread synchronization and shared memory access as GPUs do.
-
-A major point of the *redundant hierarchical parallelism* abstraction is to ignore specific unsupported levels and utilize only the ones supported on a specific accelerator.
-Furthermore the hierarchy allows a mapping to various current and future accelerators in a variety of ways enabling optimal usage of the underlying compute and memory capabilities.
diff --git a/doc/Distinction.md b/doc/Distinction.md
deleted file mode 100644
index 0cb85f7..0000000
--- a/doc/Distinction.md
+++ /dev/null
@@ -1,125 +0,0 @@
-Distinction
-===========
-
-There are multiple other projects which pretend to achieve full performance portability.
-Many do not satisfy the requirement for full c++ support that is needed because of the usage of template meta-programming for method specialization to avoid runtime-polymorphism.
-
-
-CUDA
-------
-
-### Positive
-* Full control over memory, caches and execution.
-
-### Negative
-* Not platform independent: NVIDIA GPUs are required. No execution on other manufacturers GPUs or even standard CPUs.
-* Language extension: The nvcc compiler driver is required to understand the language constructs that are used inside the kernel methods.
-* Not compiler independent: The back-end compiler has to be one of the supported ones.
-
-
-OpenMP
---------
-
-### Negative
-* No good way to control memory hierarchies and thread interaction (shared memory).
-* Only compiler hints, no direct control over result.
-
-
-OpenACC
----------
-
-### Positive
-* Can generate x86 and CUDA code from C++ code.
-
-### Negative
-* Compiler dependent (currently not supported by many compilers and the PGI compiler is not actively enough developed and does not have a good C++ support).
-* Only compiler hints, no direct control over result.
-
-
-OpenCl
---------
-
-### Positive
-* Hardware independent (CPUs and GPUs of nearly all vendors).
-
-### Negative
-* No full C++ support.
-* Runtime compilation -> No direct inclusion into the source (syntax highlighting?, static analysis?, debugging?, templated kernels?)
-
-
-C++ AMP
----------
-
-### Positive
-* Open specification
-* Annotated C++ code can run on multiple accelerators.
-
-### Negative
-* Language extension
-* Compiler dependent (currently not supported by many compilers)
-
-
-PGI CUDA-X86
--------------
-When run on x86-based systems without a GPU, PGI CUDA C applications use multiple cores and the streaming SIMD (Single Instruction Multiple Data) capabilities of Intel and AMD CPUs for parallelvectorized execution.
-At run-time, CUDA C programs compiled for x86 executes each CUDA thread block using a single host core, eliminating synchronization where possible.
-
-### Positive
-* Lets you write standard CUDA code and execute it on x86.
-
-### Negative
-* Not actively developed.
-
-
-LLVM backends (PTX, R600)
----------------------------
-
-### Negative
-* Those back-ends never got really mature and up-to-date.
-
-
-KOKKOS
--------------
-See [here](https://www.xsede.org/documents/271087/586927/Edwards-2013-XSCALE13-Kokkos.pdf)
-[here](http://trilinos.org/oldsite/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf)
-[here](http://on-demand.gputechconf.com/supercomputing/2013/presentation/SC3103_Towards-Performance-Portable-Applications-Kokkos.pdf)
-and  [here](http://dx.doi.org/10.3233/SPR-2012-0343).
-Source is available [here](https://github.com/trilinos/trilinos/tree/master/packages/kokkos).
-The project is similar to *alpaka* in the way it abstracts the kernel as templated function object.
-It provides parallel_for, parallel_reduce, etc. similar to thrust.
-
-### Positive
-* Offers buffer views with a neutral indexing scheme that maps to the underlying hardware (row/col-major, blocking, ...). 
-
-### Negative
-* License.
-* The parameters are required to be given to the function object constructor coupling algorithm and data together.
-* The implementation of accelerator methods (atomics, ...) is selected via macros defined by the nvcc compiler. So there is no way to select between different x86 implementations for different x86 accelerators. 
-
-
-Phalanx
------------
-See [here](http://www.mgarland.org/files/papers/phalanx-sc12-preprint.pdf).
-It is very similar to *alpaka* in the way it abstracts the accelerators.
-
-### Positive
-* C++ Interface provides CUDA, OpenMP, and GASNet back-ends
-
-### Negative
-* License.
-* No official source repository available?
-
-
-thrust (bulk)
--------------
-*...
-
-
-Intel TBB
----------
-*...
-
-
-Intel Cilk Plus
----------------
-*...
diff --git a/doc/Threading.md b/doc/Threading.md
deleted file mode 100644
index 9063f01..0000000
--- a/doc/Threading.md
+++ /dev/null
@@ -1,57 +0,0 @@
-Threading
-=========
-
-There are multiple possible ways to implement threading on non CUDA accelerators.
-
-
-Serial
-------
-
-### Positive
-* Easy implementation (no sync).
-
-### Negative
-* Restricts block size to 1*1*1. Due to the threads in a block being able to synchronize they have to be executed in parallel. This can not be faked.
-
-
-OpenMP
-------
-
-### Positive
-* Lightweight threads.
-
-### Negative
-* Interaction with std::threads/pthreads unspecified.
-* Thread count limit platform dependent (some runtimes allow oversubscription). No correct way to read the maximum number of threads supported.
-* Hard to control thread affinity.
-* Non-deterministical thread change.
-
-
-Kernel-Threads
---------------
-
-std::thread, pthread, ...
-
-### Positive
-* Thread affinity is controllable (platform dependent implementation).
-
-### Negative
-* High cost of thread creation and thread change.
-* Non-deterministical thread change.
-
-
-Fibers
-------
-
-A fiber is a user-space thread with cooperative context-switch.
-They are implemented on top of coroutines. A coroutine is a function that can be suspended and resumed but has not necessarily a stack.
-boost::fiber = stackful coroutine + scheduler + sync (no wait → next fiber in thread)
-C++17: N3858, N3985, (N4134 stackless coroutines 'await')
-
-### Positive
-* Less cost of creation.
-* Less cost of thread change.
-* Deterministic thread change.
-* No locks at all (Always only one active fiber per kernel-thread).
-* Prevents false sharing because all fibers working on nearby values are in the same block and can be executed by the same kernel-thread on the same core.
-* Prevents cache thrashing (threads on the same core compete for the same cache line) by using a user-level scheduler for the fibers that can invocate them in order of access of the memory.
diff --git a/examples/mandelbrot/CMakeLists.txt b/examples/mandelbrot/CMakeLists.txt
index 1cfc4db..2e98a95 100644
--- a/examples/mandelbrot/CMakeLists.txt
+++ b/examples/mandelbrot/CMakeLists.txt
@@ -63,41 +63,15 @@ append_recursive_files_add_to_src_group("${_SUFFIXED_INCLUDE_DIR}" "" "hpp" _FIL
 # Add all the source files in all recursive subdirectories and group them accordingly.
 append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX)
 
-# Add all the CUDA source files in all recursive subdirectories and group them accordingly.
-append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cu" _FILES_SOURCE_CU)
-
+INCLUDE_DIRECTORIES(
+    ${_INCLUDE_DIRECTORIES_PRIVATE}
+    ${alpaka_INCLUDE_DIRS})
+ADD_DEFINITIONS(
+    ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS})
 # Always add all files to the target executable build call to add them to the build project.
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
-    # Force the .cpp files to be recognized as headers and not be compiled so there wont be a second entry point.
-    SET_SOURCE_FILES_PROPERTIES(
-        ${_FILES_SOURCE_CXX}
-        PROPERTIES HEADER_FILE_ONLY TRUE)
-    # CUDA does not work well with the much better target dependent TARGET_XXX commands but requires the settings to be available globally: https://www.cmake.org/Bug/view.php?id=14201&nbn=1
-    INCLUDE_DIRECTORIES(
-        ${_INCLUDE_DIRECTORIES_PRIVATE}
-        ${alpaka_INCLUDE_DIRS})
-    ADD_DEFINITIONS(
-        ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS})
-    CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords.
-    CUDA_ADD_EXECUTABLE(
-        "mandelbrot"
-        ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU})
-ELSE()
-    # Force the .cu files to be recognized as headers and not be compiled so there wont be a second entry point.
-    SET_SOURCE_FILES_PROPERTIES(
-        ${_FILES_SOURCE_CU}
-        PROPERTIES HEADER_FILE_ONLY TRUE)
-    ADD_EXECUTABLE(
-        "mandelbrot"
-        ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU})
-    TARGET_INCLUDE_DIRECTORIES(
-        "mandelbrot"
-        PRIVATE ${_INCLUDE_DIRECTORIES_PRIVATE})
-    TARGET_COMPILE_OPTIONS(
-        "mandelbrot"
-        PRIVATE ${ALPAKA_DEV_COMPILE_OPTIONS})
-ENDIF()
-
+ALPAKA_ADD_EXECUTABLE(
+    "mandelbrot"
+    ${_FILES_HEADER} ${_FILES_SOURCE_CXX})
 # Set the link libraries for this library (adds libs, include directories, defines and compile options).
 TARGET_LINK_LIBRARIES(
     "mandelbrot"
diff --git a/examples/mandelbrot/src/main.cpp b/examples/mandelbrot/src/main.cpp
index 2986fbd..b3e117d 100644
--- a/examples/mandelbrot/src/main.cpp
+++ b/examples/mandelbrot/src/main.cpp
@@ -277,12 +277,12 @@ auto writeTgaColorImage(
         alpaka::dim::Dim<TBuf>::value == 2,
         "The buffer has to be 2 dimensional!");
     static_assert(
-        std::is_integral<alpaka::mem::view::Elem<TBuf>>::value,
+        std::is_integral<alpaka::elem::Elem<TBuf>>::value,
         "The buffer element type has to be integral!");
 
     // The width of the input buffer is in input elements.
     auto const bufWidthElems(alpaka::extent::getWidth(bufRgba));
-    auto const bufWidthBytes(bufWidthElems * sizeof(alpaka::mem::view::Elem<TBuf>));
+    auto const bufWidthBytes(bufWidthElems * sizeof(alpaka::elem::Elem<TBuf>));
     // The row width in bytes has to be dividable by 4 Bytes (RGBA).
     assert(bufWidthBytes % sizeof(std::uint32_t) == 0);
     // The number of colors in a row.
@@ -378,7 +378,7 @@ struct MandelbrotKernelTester
         // Get a stream on this device.
         alpaka::examples::Stream<alpaka::dev::Dev<TAcc>> stream(devAcc);
 
-        alpaka::Vec2<TSize> const v2uiExtents(
+        alpaka::Vec<alpaka::dim::DimInt<2u>, TSize> const v2uiExtents(
             static_cast<TSize>(numRows),
             static_cast<TSize>(numCols));
 
diff --git a/examples/mandelbrot/src/main.cu b/examples/mandelbrot/src/main.cu
deleted file mode 100644
index 428b7f4..0000000
--- a/examples/mandelbrot/src/main.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
-* \file
-* Copyright 2014-2015 Benjamin Worpitz
-*
-* This file is part of alpaka.
-*
-* alpaka is free software: you can redistribute it and/or modify
-* it under the terms of the GNU Lesser General Public License as published by
-* the Free Software Foundation, either version 3 of the License, or
-* (at your option) any later version.
-*
-* alpaka is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU Lesser General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public License
-* along with alpaka.
-* If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "main.cpp"
diff --git a/examples/matMul/CMakeLists.txt b/examples/matMul/CMakeLists.txt
index e952292..d87e3e1 100644
--- a/examples/matMul/CMakeLists.txt
+++ b/examples/matMul/CMakeLists.txt
@@ -63,41 +63,15 @@ append_recursive_files_add_to_src_group("${_SUFFIXED_INCLUDE_DIR}" "" "hpp" _FIL
 # Add all the source files in all recursive subdirectories and group them accordingly.
 append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX)
 
-# Add all the CUDA source files in all recursive subdirectories and group them accordingly.
-append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cu" _FILES_SOURCE_CU)
-
+INCLUDE_DIRECTORIES(
+    ${_INCLUDE_DIRECTORIES_PRIVATE}
+    ${alpaka_INCLUDE_DIRS})
+ADD_DEFINITIONS(
+    ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS})
 # Always add all files to the target executable build call to add them to the build project.
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
-    # Force the .cpp files to be recognized as headers and not be compiled so there wont be a second entry point.
-    SET_SOURCE_FILES_PROPERTIES(
-        ${_FILES_SOURCE_CXX}
-        PROPERTIES HEADER_FILE_ONLY TRUE)
-    # CUDA does not work well with the much better target dependent TARGET_XXX commands but requires the settings to be available globally: https://www.cmake.org/Bug/view.php?id=14201&nbn=1
-    INCLUDE_DIRECTORIES(
-        ${_INCLUDE_DIRECTORIES_PRIVATE}
-        ${alpaka_INCLUDE_DIRS})
-    ADD_DEFINITIONS(
-        ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS})
-    CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords.
-    CUDA_ADD_EXECUTABLE(
-        "matMul"
-        ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU})
-ELSE()
-    # Force the .cu files to be recognized as headers and not be compiled so there wont be a second entry point.
-    SET_SOURCE_FILES_PROPERTIES(
-        ${_FILES_SOURCE_CU}
-        PROPERTIES HEADER_FILE_ONLY TRUE)
-    ADD_EXECUTABLE(
-        "matMul"
-        ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU})
-    TARGET_INCLUDE_DIRECTORIES(
-        "matMul"
-        PRIVATE ${_INCLUDE_DIRECTORIES_PRIVATE})
-    TARGET_COMPILE_OPTIONS(
-        "matMul"
-        PRIVATE ${ALPAKA_DEV_COMPILE_OPTIONS})
-ENDIF()
-
+ALPAKA_ADD_EXECUTABLE(
+    "matMul"
+    ${_FILES_HEADER} ${_FILES_SOURCE_CXX})
 # Set the link libraries for this library (adds libs, include directories, defines and compile options).
 TARGET_LINK_LIBRARIES(
     "matMul"
diff --git a/examples/matMul/src/main.cpp b/examples/matMul/src/main.cpp
index 7d1191c..ed99e8e 100644
--- a/examples/matMul/src/main.cpp
+++ b/examples/matMul/src/main.cpp
@@ -251,16 +251,16 @@ struct MatMulTester
         // Get a stream on the accelerator device.
         alpaka::examples::Stream<alpaka::dev::Dev<TAcc>> streamAcc(devAcc);
 
-        alpaka::Vec2<TSize> const v2uiExtentsA(
+        alpaka::Vec<alpaka::dim::DimInt<2u>, TSize> const v2uiExtentsA(
             static_cast<TSize>(m),
             static_cast<TSize>(k));
 
-        alpaka::Vec2<TSize> const v2uiExtentsB(
+        alpaka::Vec<alpaka::dim::DimInt<2u>, TSize> const v2uiExtentsB(
             static_cast<TSize>(k),
             static_cast<TSize>(n));
 
         // Result matrix is MxN. We create one worker per result matrix cell.
-        alpaka::Vec2<TSize> const v2uiExtentsC(
+        alpaka::Vec<alpaka::dim::DimInt<2u>, TSize> const v2uiExtentsC(
             static_cast<TSize>(m),
             static_cast<TSize>(n));
 
diff --git a/examples/matMul/src/main.cu b/examples/matMul/src/main.cu
deleted file mode 100644
index 428b7f4..0000000
--- a/examples/matMul/src/main.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
-* \file
-* Copyright 2014-2015 Benjamin Worpitz
-*
-* This file is part of alpaka.
-*
-* alpaka is free software: you can redistribute it and/or modify
-* it under the terms of the GNU Lesser General Public License as published by
-* the Free Software Foundation, either version 3 of the License, or
-* (at your option) any later version.
-*
-* alpaka is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU Lesser General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public License
-* along with alpaka.
-* If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "main.cpp"
diff --git a/examples/sharedMem/CMakeLists.txt b/examples/sharedMem/CMakeLists.txt
index 9cf17c1..9250f01 100644
--- a/examples/sharedMem/CMakeLists.txt
+++ b/examples/sharedMem/CMakeLists.txt
@@ -63,41 +63,15 @@ append_recursive_files_add_to_src_group("${_SUFFIXED_INCLUDE_DIR}" "" "hpp" _FIL
 # Add all the source files in all recursive subdirectories and group them accordingly.
 append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX)
 
-# Add all the CUDA source files in all recursive subdirectories and group them accordingly.
-append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cu" _FILES_SOURCE_CU)
-
+INCLUDE_DIRECTORIES(
+    ${_INCLUDE_DIRECTORIES_PRIVATE}
+    ${alpaka_INCLUDE_DIRS})
+ADD_DEFINITIONS(
+    ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS})
 # Always add all files to the target executable build call to add them to the build project.
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
-    # Force the .cpp files to be recognized as headers and not be compiled so there wont be a second entry point.
-    SET_SOURCE_FILES_PROPERTIES(
-        ${_FILES_SOURCE_CXX}
-        PROPERTIES HEADER_FILE_ONLY TRUE)
-    # CUDA does not work well with the much better target dependent TARGET_XXX commands but requires the settings to be available globally: https://www.cmake.org/Bug/view.php?id=14201&nbn=1
-    INCLUDE_DIRECTORIES(
-        ${_INCLUDE_DIRECTORIES_PRIVATE}
-        ${alpaka_INCLUDE_DIRS})
-    ADD_DEFINITIONS(
-        ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS})
-    CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords.
-    CUDA_ADD_EXECUTABLE(
-        "sharedMem"
-        ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU})
-ELSE()
-    # Force the .cu files to be recognized as headers and not be compiled so there wont be a second entry point.
-    SET_SOURCE_FILES_PROPERTIES(
-        ${_FILES_SOURCE_CU}
-        PROPERTIES HEADER_FILE_ONLY TRUE)
-    ADD_EXECUTABLE(
-        "sharedMem"
-        ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU})
-    TARGET_INCLUDE_DIRECTORIES(
-        "sharedMem"
-        PRIVATE ${_INCLUDE_DIRECTORIES_PRIVATE})
-    TARGET_COMPILE_OPTIONS(
-        "sharedMem"
-        PRIVATE ${ALPAKA_DEV_COMPILE_OPTIONS})
-ENDIF()
-
+ALPAKA_ADD_EXECUTABLE(
+    "sharedMem"
+    ${_FILES_HEADER} ${_FILES_SOURCE_CXX})
 # Set the link libraries for this library (adds libs, include directories, defines and compile options).
 TARGET_LINK_LIBRARIES(
     "sharedMem"
diff --git a/examples/sharedMem/src/main.cu b/examples/sharedMem/src/main.cu
deleted file mode 100644
index 428b7f4..0000000
--- a/examples/sharedMem/src/main.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
-* \file
-* Copyright 2014-2015 Benjamin Worpitz
-*
-* This file is part of alpaka.
-*
-* alpaka is free software: you can redistribute it and/or modify
-* it under the terms of the GNU Lesser General Public License as published by
-* the Free Software Foundation, either version 3 of the License, or
-* (at your option) any later version.
-*
-* alpaka is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU Lesser General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public License
-* along with alpaka.
-* If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "main.cpp"
diff --git a/examples/vectorAdd/CMakeLists.txt b/examples/vectorAdd/CMakeLists.txt
index 49a3a6e..74a50b2 100644
--- a/examples/vectorAdd/CMakeLists.txt
+++ b/examples/vectorAdd/CMakeLists.txt
@@ -63,41 +63,15 @@ append_recursive_files_add_to_src_group("${_SUFFIXED_INCLUDE_DIR}" "" "hpp" _FIL
 # Add all the source files in all recursive subdirectories and group them accordingly.
 append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX)
 
-# Add all the CUDA source files in all recursive subdirectories and group them accordingly.
-append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cu" _FILES_SOURCE_CU)
-
+INCLUDE_DIRECTORIES(
+    ${_INCLUDE_DIRECTORIES_PRIVATE}
+    ${alpaka_INCLUDE_DIRS})
+ADD_DEFINITIONS(
+    ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS})
 # Always add all files to the target executable build call to add them to the build project.
-IF(ALPAKA_ACC_GPU_CUDA_ENABLE)
-    # Force the .cpp files to be recognized as headers and not be compiled so there wont be a second entry point.
-    SET_SOURCE_FILES_PROPERTIES(
-        ${_FILES_SOURCE_CXX}
-        PROPERTIES HEADER_FILE_ONLY TRUE)
-    # CUDA does not work well with the much better target dependent TARGET_XXX commands but requires the settings to be available globally: https://www.cmake.org/Bug/view.php?id=14201&nbn=1
-    INCLUDE_DIRECTORIES(
-        ${_INCLUDE_DIRECTORIES_PRIVATE}
-        ${alpaka_INCLUDE_DIRS})
-    ADD_DEFINITIONS(
-        ${alpaka_DEFINITIONS} ${ALPAKA_DEV_COMPILE_OPTIONS})
-    CMAKE_POLICY(SET CMP0023 OLD)   # CUDA_ADD_EXECUTABLE calls TARGET_LINK_LIBRARIES without keywords.
-    CUDA_ADD_EXECUTABLE(
-        "vectorAdd"
-        ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU})
-ELSE()
-    # Force the .cu files to be recognized as headers and not be compiled so there wont be a second entry point.
-    SET_SOURCE_FILES_PROPERTIES(
-        ${_FILES_SOURCE_CU}
-        PROPERTIES HEADER_FILE_ONLY TRUE)
-    ADD_EXECUTABLE(
-        "vectorAdd"
-        ${_FILES_HEADER} ${_FILES_SOURCE_CXX} ${_FILES_SOURCE_CU})
-    TARGET_INCLUDE_DIRECTORIES(
-        "vectorAdd"
-        PRIVATE ${_INCLUDE_DIRECTORIES_PRIVATE})
-    TARGET_COMPILE_OPTIONS(
-        "vectorAdd"
-        PRIVATE ${ALPAKA_DEV_COMPILE_OPTIONS})
-ENDIF()
-
+ALPAKA_ADD_EXECUTABLE(
+    "vectorAdd"
+    ${_FILES_HEADER} ${_FILES_SOURCE_CXX})
 # Set the link libraries for this library (adds libs, include directories, defines and compile options).
 TARGET_LINK_LIBRARIES(
     "vectorAdd"
diff --git a/examples/vectorAdd/src/main.cpp b/examples/vectorAdd/src/main.cpp
index 673c466..03abb21 100644
--- a/examples/vectorAdd/src/main.cpp
+++ b/examples/vectorAdd/src/main.cpp
@@ -102,7 +102,7 @@ struct VectorAddKernelTester
         // Get a stream on this device.
         alpaka::examples::Stream<alpaka::dev::Dev<TAcc>> stream(devAcc);
 
-        alpaka::Vec1<TSize> const v1uiExtents(
+        alpaka::Vec<alpaka::dim::DimInt<1u>, TSize> const v1uiExtents(
             numElements);
 
         // Let alpaka calculate good block and grid sizes given our full problem extents.
diff --git a/examples/vectorAdd/src/main.cu b/examples/vectorAdd/src/main.cu
deleted file mode 100644
index 428b7f4..0000000
--- a/examples/vectorAdd/src/main.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
-* \file
-* Copyright 2014-2015 Benjamin Worpitz
-*
-* This file is part of alpaka.
-*
-* alpaka is free software: you can redistribute it and/or modify
-* it under the terms of the GNU Lesser General Public License as published by
-* the Free Software Foundation, either version 3 of the License, or
-* (at your option) any later version.
-*
-* alpaka is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU Lesser General Public License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public License
-* along with alpaka.
-* If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include "main.cpp"
diff --git a/include/alpaka/acc/AccCpuFibers.hpp b/include/alpaka/acc/AccCpuFibers.hpp
index 2b27e0e..51494be 100644
--- a/include/alpaka/acc/AccCpuFibers.hpp
+++ b/include/alpaka/acc/AccCpuFibers.hpp
@@ -29,6 +29,7 @@
 #include <alpaka/math/MathStl.hpp>              // MathStl
 #include <alpaka/block/shared/BlockSharedAllocMasterSync.hpp>   // BlockSharedAllocMasterSync
 #include <alpaka/block/sync/BlockSyncFiberIdMapBarrier.hpp>     // BlockSyncFiberIdMapBarrier
+#include <alpaka/rand/RandStl.hpp>              // RandStl
 
 // Specialized traits.
 #include <alpaka/acc/Traits.hpp>                // acc::traits::AccType
@@ -79,7 +80,8 @@ namespace alpaka
             public atomic::AtomicNoOp,
             public math::MathStl,
             public block::shared::BlockSharedAllocMasterSync,
-            public block::sync::BlockSyncFiberIdMapBarrier<TSize>
+            public block::sync::BlockSyncFiberIdMapBarrier<TSize>,
+            public rand::RandStl
         {
         public:
             // Partial specialization with the correct TDim and TSize is not allowed.
@@ -109,6 +111,7 @@ namespace alpaka
                     block::sync::BlockSyncFiberIdMapBarrier<TSize>(
                         m_threadsPerBlockCount,
                         m_fibersToBarrier),
+                    rand::RandStl(),
                     m_gridBlockIdx(Vec<TDim, TSize>::zeros()),
                     m_threadsPerBlockCount(workdiv::getWorkDiv<Block, Threads>(workDiv).prod())
             {}
diff --git a/include/alpaka/acc/AccCpuOmp2Blocks.hpp b/include/alpaka/acc/AccCpuOmp2Blocks.hpp
index 2d0ac1e..4fadb77 100644
--- a/include/alpaka/acc/AccCpuOmp2Blocks.hpp
+++ b/include/alpaka/acc/AccCpuOmp2Blocks.hpp
@@ -25,10 +25,11 @@
 #include <alpaka/workdiv/WorkDivMembers.hpp>    // workdiv::WorkDivMembers
 #include <alpaka/idx/gb/IdxGbRef.hpp>           // IdxGbRef
 #include <alpaka/idx/bt/IdxBtZero.hpp>          // IdxBtZero
-#include <alpaka/atomic/AtomicNoOp.hpp>         // AtomicNoOp
+#include <alpaka/atomic/AtomicOmpCritSec.hpp>   // AtomicOmpCritSec
 #include <alpaka/math/MathStl.hpp>              // MathStl
 #include <alpaka/block/shared/BlockSharedAllocNoSync.hpp>  // BlockSharedAllocNoSync
 #include <alpaka/block/sync/BlockSyncNoOp.hpp>  // BlockSyncNoOp
+#include <alpaka/rand/RandStl.hpp>              // RandStl
 
 // Specialized traits.
 #include <alpaka/acc/Traits.hpp>                // acc::traits::AccType
@@ -73,10 +74,11 @@ namespace alpaka
             public workdiv::WorkDivMembers<TDim, TSize>,
             public idx::gb::IdxGbRef<TDim, TSize>,
             public idx::bt::IdxBtZero<TDim, TSize>,
-            public atomic::AtomicNoOp,
+            public atomic::AtomicOmpCritSec,
             public math::MathStl,
             public block::shared::BlockSharedAllocNoSync,
-            public block::sync::BlockSyncNoOp
+            public block::sync::BlockSyncNoOp,
+            public rand::RandStl
         {
         public:
             // Partial specialization with the correct TDim and TSize is not allowed.
@@ -98,10 +100,11 @@ namespace alpaka
                     workdiv::WorkDivMembers<TDim, TSize>(workDiv),
                     idx::gb::IdxGbRef<TDim, TSize>(m_gridBlockIdx),
                     idx::bt::IdxBtZero<TDim, TSize>(),
-                    atomic::AtomicNoOp(),
+                    atomic::AtomicOmpCritSec(),
                     math::MathStl(),
                     block::shared::BlockSharedAllocNoSync(),
                     block::sync::BlockSyncNoOp(),
+                    rand::RandStl(),
                     m_gridBlockIdx(Vec<TDim, TSize>::zeros())
             {}
 
diff --git a/include/alpaka/acc/AccCpuOmp2Threads.hpp b/include/alpaka/acc/AccCpuOmp2Threads.hpp
index 68ecb68..1baba83 100644
--- a/include/alpaka/acc/AccCpuOmp2Threads.hpp
+++ b/include/alpaka/acc/AccCpuOmp2Threads.hpp
@@ -29,6 +29,7 @@
 #include <alpaka/math/MathStl.hpp>              // MathStl
 #include <alpaka/block/shared/BlockSharedAllocMasterSync.hpp>  // BlockSharedAllocMasterSync
 #include <alpaka/block/sync/BlockSyncOmpBarrier.hpp>    // BlockSyncOmpBarrier
+#include <alpaka/rand/RandStl.hpp>              // RandStl
 
 // Specialized traits.
 #include <alpaka/acc/Traits.hpp>                // acc::traits::AccType
@@ -75,7 +76,8 @@ namespace alpaka
             public atomic::AtomicOmpCritSec,
             public math::MathStl,
             public block::shared::BlockSharedAllocMasterSync,
-            public block::sync::BlockSyncOmpBarrier
+            public block::sync::BlockSyncOmpBarrier,
+            public rand::RandStl
         {
         public:
             // Partial specialization with the correct TDim and TSize is not allowed.
@@ -103,6 +105,7 @@ namespace alpaka
                         [this](){block::sync::syncBlockThreads(*this);},
                         [](){return (::omp_get_thread_num() == 0);}),
                     block::sync::BlockSyncOmpBarrier(),
+                    rand::RandStl(),
                     m_gridBlockIdx(Vec<TDim, TSize>::zeros())
             {}
 
diff --git a/include/alpaka/acc/AccCpuOmp4.hpp b/include/alpaka/acc/AccCpuOmp4.hpp
index 8f50de7..d39ec42 100644
--- a/include/alpaka/acc/AccCpuOmp4.hpp
+++ b/include/alpaka/acc/AccCpuOmp4.hpp
@@ -29,6 +29,7 @@
 #include <alpaka/math/MathStl.hpp>              // MathStl
 #include <alpaka/block/shared/BlockSharedAllocMasterSync.hpp>  // BlockSharedAllocMasterSync
 #include <alpaka/block/sync/BlockSyncOmpBarrier.hpp>    // BlockSyncOmpBarrier
+#include <alpaka/rand/RandStl.hpp>              // RandStl
 
 // Specialized traits.
 #include <alpaka/acc/Traits.hpp>                // acc::traits::AccType
@@ -75,7 +76,8 @@ namespace alpaka
             public atomic::AtomicOmpCritSec,
             public math::MathStl,
             public block::shared::BlockSharedAllocMasterSync,
-            public block::sync::BlockSyncOmpBarrier
+            public block::sync::BlockSyncOmpBarrier,
+            public rand::RandStl
         {
         public:
             // Partial specialization with the correct TDim and TSize is not allowed.
@@ -103,6 +105,7 @@ namespace alpaka
                         [this](){block::sync::syncBlockThreads(*this);},
                         [](){return (::omp_get_thread_num() == 0);}),
                     block::sync::BlockSyncOmpBarrier(),
+                    rand::RandStl(),
                     m_gridBlockIdx(Vec<TDim, TSize>::zeros())
             {}
 
diff --git a/include/alpaka/acc/AccCpuSerial.hpp b/include/alpaka/acc/AccCpuSerial.hpp
index d0543e0..26eac76 100644
--- a/include/alpaka/acc/AccCpuSerial.hpp
+++ b/include/alpaka/acc/AccCpuSerial.hpp
@@ -29,6 +29,7 @@
 #include <alpaka/math/MathStl.hpp>              // MathStl
 #include <alpaka/block/shared/BlockSharedAllocNoSync.hpp>  // BlockSharedAllocNoSync
 #include <alpaka/block/sync/BlockSyncNoOp.hpp>  // BlockSyncNoOp
+#include <alpaka/rand/RandStl.hpp>              // RandStl
 
 // Specialized traits.
 #include <alpaka/acc/Traits.hpp>                // acc::traits::AccType
@@ -73,7 +74,8 @@ namespace alpaka
             public atomic::AtomicNoOp,
             public math::MathStl,
             public block::shared::BlockSharedAllocNoSync,
-            public block::sync::BlockSyncNoOp
+            public block::sync::BlockSyncNoOp,
+            public rand::RandStl
         {
         public:
             // Partial specialization with the correct TDim and TSize is not allowed.
@@ -99,6 +101,7 @@ namespace alpaka
                     math::MathStl(),
                     block::shared::BlockSharedAllocNoSync(),
                     block::sync::BlockSyncNoOp(),
+                    rand::RandStl(),
                     m_gridBlockIdx(Vec<TDim, TSize>::zeros())
             {}
 
diff --git a/include/alpaka/acc/AccCpuThreads.hpp b/include/alpaka/acc/AccCpuThreads.hpp
index 8b8b535..8f64a2b 100644
--- a/include/alpaka/acc/AccCpuThreads.hpp
+++ b/include/alpaka/acc/AccCpuThreads.hpp
@@ -29,6 +29,7 @@
 #include <alpaka/math/MathStl.hpp>                  // MathStl
 #include <alpaka/block/shared/BlockSharedAllocMasterSync.hpp>   // BlockSharedAllocMasterSync
 #include <alpaka/block/sync/BlockSyncThreadIdMapBarrier.hpp>    // BlockSyncThreadIdMapBarrier
+#include <alpaka/rand/RandStl.hpp>              // RandStl
 
 // Specialized traits.
 #include <alpaka/acc/Traits.hpp>                    // acc::traits::AccType
@@ -76,7 +77,8 @@ namespace alpaka
             public atomic::AtomicStlLock,
             public math::MathStl,
             public block::shared::BlockSharedAllocMasterSync,
-            public block::sync::BlockSyncThreadIdMapBarrier<TSize>
+            public block::sync::BlockSyncThreadIdMapBarrier<TSize>,
+            public rand::RandStl
         {
         public:
             // Partial specialization with the correct TDim and TSize is not allowed.
@@ -106,6 +108,7 @@ namespace alpaka
                     block::sync::BlockSyncThreadIdMapBarrier<TSize>(
                         m_threadsPerBlockCount,
                         m_mThreadsToBarrier),
+                    rand::RandStl(),
                     m_gridBlockIdx(Vec<TDim, TSize>::zeros()),
                     m_threadsPerBlockCount(workdiv::getWorkDiv<Block, Threads>(workDiv).prod())
             {}
@@ -147,7 +150,7 @@ namespace alpaka
             // getIdx
             std::mutex mutable m_mtxMapInsert;                              //!< The mutex used to secure insertion into the ThreadIdToIdxMap.
             typename idx::bt::IdxBtRefThreadIdMap<TDim, TSize>::ThreadIdToIdxMap mutable m_threadsToIndices;    //!< The mapping of thread id's to indices.
-            alignas(16u) Vec<TDim, TSize> mutable m_gridBlockIdx;        //!< The index of the currently executed block.
+            alignas(16u) Vec<TDim, TSize> mutable m_gridBlockIdx;           //!< The index of the currently executed block.
 
             // syncBlockThreads
             TSize const m_threadsPerBlockCount;                             //!< The number of threads per block the barrier has to wait for.
diff --git a/include/alpaka/acc/AccGpuCudaRt.hpp b/include/alpaka/acc/AccGpuCudaRt.hpp
index d340a11..3f5098e 100644
--- a/include/alpaka/acc/AccGpuCudaRt.hpp
+++ b/include/alpaka/acc/AccGpuCudaRt.hpp
@@ -29,6 +29,7 @@
 #include <alpaka/math/MathCudaBuiltIn.hpp>          // MathCudaBuiltIn
 #include <alpaka/block/shared/BlockSharedAllocCudaBuiltIn.hpp>  // BlockSharedAllocCudaBuiltIn
 #include <alpaka/block/sync/BlockSyncCudaBuiltIn.hpp>   // BlockSyncCudaBuiltIn
+#include <alpaka/rand/RandCuRand.hpp>               // RandCuRand
 
 // Specialized traits.
 #include <alpaka/acc/Traits.hpp>                    // acc::traits::AccType
@@ -73,7 +74,8 @@ namespace alpaka
             public atomic::AtomicCudaBuiltIn,
             public math::MathCudaBuiltIn,
             public block::shared::BlockSharedAllocCudaBuiltIn,
-            public block::sync::BlockSyncCudaBuiltIn
+            public block::sync::BlockSyncCudaBuiltIn,
+            public rand::RandCuRand
         {
         public:
             //-----------------------------------------------------------------------------
@@ -86,7 +88,8 @@ namespace alpaka
                 atomic::AtomicCudaBuiltIn(),
                 math::MathCudaBuiltIn(),
                 block::shared::BlockSharedAllocCudaBuiltIn(),
-                block::sync::BlockSyncCudaBuiltIn()
+                block::sync::BlockSyncCudaBuiltIn(),
+                rand::RandCuRand()
             {}
 
         public:
diff --git a/include/alpaka/atomic/Traits.hpp b/include/alpaka/atomic/Traits.hpp
index 24a6408..514ce5f 100644
--- a/include/alpaka/atomic/Traits.hpp
+++ b/include/alpaka/atomic/Traits.hpp
@@ -69,14 +69,15 @@ namespace alpaka
             T const & value)
         -> T
         {
-            return traits::AtomicOp<
-                TOp,
-                TAtomic,
-                T>
-            ::atomicOp(
-                atomic,
-                addr,
-                value);
+            return
+                traits::AtomicOp<
+                    TOp,
+                    TAtomic,
+                    T>
+                ::atomicOp(
+                    atomic,
+                    addr,
+                    value);
         }
 
         namespace traits
diff --git a/include/alpaka/block/shared/BlockSharedAllocCudaBuiltIn.hpp b/include/alpaka/block/shared/BlockSharedAllocCudaBuiltIn.hpp
index 053b9a9..2ce57f4 100644
--- a/include/alpaka/block/shared/BlockSharedAllocCudaBuiltIn.hpp
+++ b/include/alpaka/block/shared/BlockSharedAllocCudaBuiltIn.hpp
@@ -95,7 +95,7 @@ namespace alpaka
                     -> T &
                     {
                         __shared__ T shMem;
-                        return std::ref(shMem);
+                        return shMem;
                     }
                 };
                 //#############################################################################
diff --git a/include/alpaka/block/shared/Traits.hpp b/include/alpaka/block/shared/Traits.hpp
index 7b1c56b..aaf88bd 100644
--- a/include/alpaka/block/shared/Traits.hpp
+++ b/include/alpaka/block/shared/Traits.hpp
@@ -83,11 +83,12 @@ namespace alpaka
                 TBlockSharedAlloc const & blockSharedAlloc)
             -> T &
             {
-                return traits::AllocVar<
-                    T,
-                    TBlockSharedAlloc>
-                ::allocVar(
-                    blockSharedAlloc);
+                return
+                    traits::AllocVar<
+                        T,
+                        TBlockSharedAlloc>
+                    ::allocVar(
+                        blockSharedAlloc);
             }
 
             //-----------------------------------------------------------------------------
@@ -111,12 +112,13 @@ namespace alpaka
                     TnumElements > 0,
                     "The number of elements to allocate in block shared memory must not be zero!");
 
-                return traits::AllocArr<
-                    T,
-                    TnumElements,
-                    TBlockSharedAlloc>
-                ::allocArr(
-                    blockSharedAlloc);
+                return
+                    traits::AllocArr<
+                        T,
+                        TnumElements,
+                        TBlockSharedAlloc>
+                    ::allocArr(
+                        blockSharedAlloc);
             }
 
             //-----------------------------------------------------------------------------
diff --git a/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp b/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp
index c95fa86..e2d7821 100644
--- a/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp
+++ b/include/alpaka/block/sync/BlockSyncCudaBuiltIn.hpp
@@ -78,7 +78,7 @@ namespace alpaka
                     //
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_ACC_CUDA_ONLY static auto syncBlockThreads(
-                        block::sync::BlockSyncCudaBuiltIn const &)
+                        block::sync::BlockSyncCudaBuiltIn const & /*blockSync*/)
                     -> void
                     {
                         __syncthreads();
diff --git a/include/alpaka/block/sync/BlockSyncFiberIdMapBarrier.hpp b/include/alpaka/block/sync/BlockSyncFiberIdMapBarrier.hpp
index 0cba118..0da54fd 100644
--- a/include/alpaka/block/sync/BlockSyncFiberIdMapBarrier.hpp
+++ b/include/alpaka/block/sync/BlockSyncFiberIdMapBarrier.hpp
@@ -92,7 +92,7 @@ namespace alpaka
                     auto & barrierIdx(itFind->second);
                     TSize const modBarrierIdx(barrierIdx % 2);
 
-                    auto & bar(m_abarSyncThreads[modBarrierIdx]);
+                    auto & bar(m_barriers[modBarrierIdx]);
 
                     // (Re)initialize a barrier if this is the first thread to reach it.
                     // DCLP: Double checked locking pattern for better performance.
@@ -111,7 +111,7 @@ namespace alpaka
 
                 ThreadIdToBarrierIdxMap & m_threadIdToBarrierIdxMap;
                 //!< We have to keep the current and the last barrier because one of the threads can reach the next barrier before a other thread was wakeup from the last one and has checked if it can run.
-                Barrier mutable m_abarSyncThreads[2];           //!< The barriers for the synchronization of threads.
+                Barrier mutable m_barriers[2];           //!< The barriers for the synchronization of threads.
             };
 
             namespace traits
diff --git a/include/alpaka/block/sync/BlockSyncNoOp.hpp b/include/alpaka/block/sync/BlockSyncNoOp.hpp
index 58b60ed..91893d2 100644
--- a/include/alpaka/block/sync/BlockSyncNoOp.hpp
+++ b/include/alpaka/block/sync/BlockSyncNoOp.hpp
@@ -1,6 +1,6 @@
 /**
 * \file
-* Copyright 2014-2015 Benjamin Worpitz
+* Copyright 2014-2015 Benjamin Worpitz, Rene Widera
 *
 * This file is part of alpaka.
 *
@@ -79,6 +79,7 @@ namespace alpaka
                     //-----------------------------------------------------------------------------
                     //
                     //-----------------------------------------------------------------------------
+                    ALPAKA_NO_HOST_ACC_WARNING
                     ALPAKA_FN_ACC static auto syncBlockThreads(
                         block::sync::BlockSyncNoOp const & /*blockSync*/)
                     -> void
diff --git a/include/alpaka/block/sync/BlockSyncThreadIdMapBarrier.hpp b/include/alpaka/block/sync/BlockSyncThreadIdMapBarrier.hpp
index f33991e..6a63ca7 100644
--- a/include/alpaka/block/sync/BlockSyncThreadIdMapBarrier.hpp
+++ b/include/alpaka/block/sync/BlockSyncThreadIdMapBarrier.hpp
@@ -92,7 +92,7 @@ namespace alpaka
                     auto & barrierIdx(itFind->second);
                     TSize const modBarrierIdx(barrierIdx % 2);
 
-                    auto & bar(m_abarSyncThreads[modBarrierIdx]);
+                    auto & bar(m_barriers[modBarrierIdx]);
 
                     // (Re)initialize a barrier if this is the first thread to reach it.
                     // DCLP: Double checked locking pattern for better performance.
@@ -114,7 +114,7 @@ namespace alpaka
 
                 ThreadIdToBarrierIdxMap & m_threadIdToBarrierIdxMap;
                 //!< We have to keep the current and the last barrier because one of the threads can reach the next barrier before a other thread was wakeup from the last one and has checked if it can run.
-                Barrier mutable m_abarSyncThreads[2];           //!< The barriers for the synchronization of threads.
+                Barrier mutable m_barriers[2];           //!< The barriers for the synchronization of threads.
                 std::mutex mutable m_mtxBarrier;
             };
 
diff --git a/include/alpaka/core/ConcurrentExecPool.hpp b/include/alpaka/core/ConcurrentExecPool.hpp
index 36cc99c..6f4254f 100644
--- a/include/alpaka/core/ConcurrentExecPool.hpp
+++ b/include/alpaka/core/ConcurrentExecPool.hpp
@@ -22,6 +22,7 @@
 #pragma once
 
 #include <boost/predef.h>   // workarounds
+#include <boost/version.hpp>// workarounds
 
 // nvcc does not currently support boost correctly.
 // boost/utility/detail/result_of_iterate.hpp:148:75: error: invalid use of qualified-name 'std::allocator_traits<_Alloc>::propagate_on_container_swap'
@@ -398,7 +399,7 @@ namespace alpaka
                 auto isQueueEmpty() const
                 -> bool
                 {
-#if BOOST_COMP_GNUC
+#if (BOOST_VERSION < 105700)
                     return const_cast<ThreadSafeQueue<ITaskPkg *> &>(m_qTasks).empty();
 #else
                     return m_qTasks.empty();
@@ -615,7 +616,7 @@ namespace alpaka
                 auto isQueueEmpty() const
                 -> bool
                 {
-#if BOOST_COMP_GNUC
+#if (BOOST_VERSION < 105700)
                     return const_cast<ThreadSafeQueue<ITaskPkg *> &>(m_qTasks).empty();
 #else
                     return m_qTasks.empty();
diff --git a/include/alpaka/core/Cuda.hpp b/include/alpaka/core/Cuda.hpp
index fbad1b3..4d28558 100644
--- a/include/alpaka/core/Cuda.hpp
+++ b/include/alpaka/core/Cuda.hpp
@@ -24,6 +24,10 @@
 #include <alpaka/core/Common.hpp>           // ALPAKA_FN_HOST
 #include <alpaka/vec/Vec.hpp>               // Vec
 #include <alpaka/core/IntegerSequence.hpp>  // integer_sequence
+#include <alpaka/elem/Traits.hpp>           // ElemType
+#include <alpaka/offset/Traits.hpp>         // GetOffset/SetOffset
+#include <alpaka/extent/Traits.hpp>         // GetExtent/SetExtent
+#include <alpaka/size/Traits.hpp>           // SizeType
 
 #include <boost/mpl/apply.hpp>              // boost::mpl::apply
 #include <boost/mpl/and.hpp>                // boost::mpl::and_
@@ -331,6 +335,24 @@ namespace alpaka
             };
         }
     }
+    namespace elem
+    {
+        namespace traits
+        {
+            //#############################################################################
+            //! The CUDA vectors elem type trait specialization.
+            //#############################################################################
+            template<
+                typename TSize>
+            struct ElemType<
+                TSize,
+                typename std::enable_if<
+                    cuda::traits::IsCudaBuiltInType<TSize>::value>::type>
+            {
+                using type = decltype(TSize().x);
+            };
+        }
+    }
     namespace extent
     {
         namespace traits
@@ -701,7 +723,7 @@ namespace alpaka
                 typename std::enable_if<
                     cuda::traits::IsCudaBuiltInType<TSize>::value>::type>
             {
-                using type = decltype(TSize().x);
+                using type = std::size_t;
             };
         }
     }
diff --git a/include/alpaka/core/Fold.hpp b/include/alpaka/core/Fold.hpp
index a931cf7..5a4077d 100644
--- a/include/alpaka/core/Fold.hpp
+++ b/include/alpaka/core/Fold.hpp
@@ -21,13 +21,15 @@
 
 #pragma once
 
-#include <alpaka/core/Common.hpp>       // ALPAKA_FN_HOST_ACC
+#include <alpaka/core/Common.hpp>           // ALPAKA_FN_HOST_ACC
 
 #if !defined(__CUDA_ARCH__)
     #include <boost/core/ignore_unused.hpp> // boost::ignore_unused
 #endif
 
-#include <type_traits>                  // std::result_of
+#if !(__cplusplus >= 201402L)
+    #include <type_traits>                  // std::result_of
+#endif
 
 namespace alpaka
 {
@@ -122,7 +124,7 @@ namespace alpaka
             T0 const & t0,
             T1 const & t1,
             Ts const & ... ts)
-        // NOTE: This is not allowed because the point of function declaration is after the trailing return type.
+        // NOTE: The following line is not allowed because the point of function declaration is after the trailing return type.
         // Thus the function itself is not available inside its return type declaration.
         // http://www.open-std.org/jtc1/sc22/wg21/docs/cwg_closed.html#1433
         // http://stackoverflow.com/questions/3744400/trailing-return-type-using-decltype-with-a-variadic-template-function
diff --git a/include/alpaka/core/NdLoop.hpp b/include/alpaka/core/NdLoop.hpp
index 2b45467..78ed6d8 100644
--- a/include/alpaka/core/NdLoop.hpp
+++ b/include/alpaka/core/NdLoop.hpp
@@ -161,7 +161,7 @@ namespace alpaka
             typename TFnObj,
             std::size_t... Tdims>
         ALPAKA_FN_HOST_ACC auto ndLoop(
-            core::detail::index_sequence<Tdims...> const & indexSequence,
+            core::detail::index_sequence<Tdims...> const & /*indexSequence*/,
             TExtentsVec const & extents,
             TFnObj const & f)
         -> void
diff --git a/include/alpaka/core/OpenMp.hpp b/include/alpaka/core/OpenMp.hpp
index 801da11..be39b81 100644
--- a/include/alpaka/core/OpenMp.hpp
+++ b/include/alpaka/core/OpenMp.hpp
@@ -28,17 +28,26 @@ namespace alpaka
     namespace omp
     {
         //-----------------------------------------------------------------------------
-        //! \return The device this object is bound to.
+        //! \return The maximum number of threads the OpenMP 2.0 runtime is capable of.
         //-----------------------------------------------------------------------------
         ALPAKA_FN_HOST auto getMaxOmpThreads()
         -> int
         {
-            // HACK: ::omp_get_max_threads() does not return the real limit of the underlying OpenMP 2.0 runtime:
+            // NOTE: ::omp_get_max_threads() does not return the real limit of the underlying OpenMP 2.0 runtime at any time:
             // 'The omp_get_max_threads routine returns the value of the internal control variable, which is used to determine the number of threads that would form the new team,
             // if an active parallel region without a num_threads clause were to be encountered at that point in the program.'
             // How to do this correctly? Is there even a way to get the hard limit apart from omp_set_num_threads(high_value) -> omp_get_max_threads()?
+
+            // Get the current thread number. This is OMP_NUM_THREADS if it has not been changed up to here.
+            auto const maxThreadsOld(::omp_get_max_threads());
+
             ::omp_set_num_threads(1024);
-            return ::omp_get_max_threads();
+            auto const maxThreadsReal(::omp_get_max_threads());
+
+            // Reset the max threads.
+            ::omp_set_num_threads(maxThreadsOld);
+
+            return maxThreadsReal;
         }
     }
 }
diff --git a/include/alpaka/core/Vectorize.hpp b/include/alpaka/core/Vectorize.hpp
index 9b12c61..726a7d9 100644
--- a/include/alpaka/core/Vectorize.hpp
+++ b/include/alpaka/core/Vectorize.hpp
@@ -59,7 +59,7 @@ namespace alpaka
             //-----------------------------------------------------------------------------
             template<
                 typename TElem>
-            struct GetCurrentThreadVectorRegisterSizeElems
+            struct GetVectorizationSizeElems
             {
                 static constexpr std::size_t value = 1u;
             };
@@ -68,7 +68,7 @@ namespace alpaka
             // Number of elements of the given type that can be processed in parallel in a vector register.
             //-----------------------------------------------------------------------------
             template<>
-            struct GetCurrentThreadVectorRegisterSizeElems<
+            struct GetVectorizationSizeElems<
                 double>
             {
                 static constexpr std::size_t value =
@@ -100,7 +100,7 @@ namespace alpaka
             // Number of elements of the given type that can be processed in parallel in a vector register.
             //-----------------------------------------------------------------------------
             template<>
-            struct GetCurrentThreadVectorRegisterSizeElems<
+            struct GetVectorizationSizeElems<
                 float>
             {
                 static constexpr std::size_t value =
@@ -131,7 +131,7 @@ namespace alpaka
             // Number of elements of the given type that can be processed in parallel in a vector register.
             //-----------------------------------------------------------------------------
             template<>
-            struct GetCurrentThreadVectorRegisterSizeElems<
+            struct GetVectorizationSizeElems<
                 std::int8_t>
             {
                 static constexpr std::size_t value =
@@ -167,7 +167,7 @@ namespace alpaka
             // Number of elements of the given type that can be processed in parallel in a vector register.
             //-----------------------------------------------------------------------------
             template<>
-            struct GetCurrentThreadVectorRegisterSizeElems<
+            struct GetVectorizationSizeElems<
                 std::uint8_t>
             {
                 static constexpr std::size_t value =
@@ -203,7 +203,7 @@ namespace alpaka
             // Number of elements of the given type that can be processed in parallel in a vector register.
             //-----------------------------------------------------------------------------
             template<>
-            struct GetCurrentThreadVectorRegisterSizeElems<
+            struct GetVectorizationSizeElems<
                 std::int16_t>
             {
                 static constexpr std::size_t value =
@@ -239,7 +239,7 @@ namespace alpaka
             // Number of elements of the given type that can be processed in parallel in a vector register.
             //-----------------------------------------------------------------------------
             template<>
-            struct GetCurrentThreadVectorRegisterSizeElems<
+            struct GetVectorizationSizeElems<
                 std::uint16_t>
             {
                 static constexpr std::size_t value =
@@ -275,7 +275,7 @@ namespace alpaka
             // Number of elements of the given type that can be processed in parallel in a vector register.
             //-----------------------------------------------------------------------------
             template<>
-            struct GetCurrentThreadVectorRegisterSizeElems<
+            struct GetVectorizationSizeElems<
                 std::int32_t>
             {
                 static constexpr std::size_t value =
@@ -306,7 +306,7 @@ namespace alpaka
             // Number of elements of the given type that can be processed in parallel in a vector register.
             //-----------------------------------------------------------------------------
             template<>
-            struct GetCurrentThreadVectorRegisterSizeElems<
+            struct GetVectorizationSizeElems<
                 std::uint32_t>
             {
                 static constexpr std::size_t value =
@@ -337,7 +337,7 @@ namespace alpaka
             // Number of elements of the given type that can be processed in parallel in a vector register.
             //-----------------------------------------------------------------------------
             template<>
-            struct GetCurrentThreadVectorRegisterSizeElems<
+            struct GetVectorizationSizeElems<
                 std::int64_t>
             {
                 static constexpr std::size_t value =
@@ -366,7 +366,7 @@ namespace alpaka
             // Number of elements of the given type that can be processed in parallel in a vector register.
             //-----------------------------------------------------------------------------
             template<>
-            struct GetCurrentThreadVectorRegisterSizeElems<
+            struct GetVectorizationSizeElems<
                 std::uint64_t>
             {
                 static constexpr std::size_t value =
@@ -395,10 +395,10 @@ namespace alpaka
             namespace detail
             {
                 //#############################################################################
-                //!
+                //! The general vectorization implementation.
                 //#############################################################################
                 template<
-                    std::size_t TvecRegElemCount>
+                    std::size_t TvectorizationCount>
                 struct Vectorize
                 {
                     //-----------------------------------------------------------------------------
@@ -414,29 +414,33 @@ namespace alpaka
                     {
 #ifdef __CUDACC__   // nvcc seems not to support constexpr correctly
                         static_assert(
-                            (TvecRegElemCount > 0),
-                            "The number of elements per vector register has to be greater zero!");
+                            (static_cast<TSize>(TvectorizationCount) > 0),
+                            "The number of loop invocations to vectorize (TvectorizationCount) has to be greater zero!");
 
                         TSize i(0);
-                        while((count-i) >= TvecRegElemCount)
+                        while((count-i) >= static_cast<TSize>(TvectorizationCount))
                         {
-                            for(TSize j(0); j<TvecRegElemCount; ++j)
+                            for(TSize j(0); j<static_cast<TSize>(TvectorizationCount); ++j)
                             {
                                 fnObj(i+j);
                             }
-                            i += TvecRegElemCount;
+                            i += static_cast<TSize>(TvectorizationCount);
                         }
 #else
+                        // Cast it to the user defined type.
                         constexpr TSize vecRegElems(
-                            static_cast<TSize>(TvecRegElemCount));
+                            static_cast<TSize>(TvectorizationCount));
 
                         static_assert(
                             (vecRegElems > 0),
-                            "The number of elements per vector register has to be greater zero!");
+                            "The number of loop invocations to vectorize (TvectorizationCount) has to be greater zero!");
 
+                        // While we have more then TvectorizationCount elements left ...
                         TSize i(0);
                         while((count-i) >= vecRegElems)
                         {
+                            // ... execute exactly TvectorizationCount invocations of the loop.
+                            // This enables the compiler to optimize much better because of the compile-time loop bounds.
                             for(TSize j(0); j<vecRegElems; ++j)
                             {
                                 fnObj(i+j);
@@ -444,6 +448,7 @@ namespace alpaka
                             i += vecRegElems;
                         }
 #endif
+                        // Execute the last (count % TvectorizationCount) invocations.
                         for(; i<count; ++i)
                         {
                             fnObj(i);
@@ -451,7 +456,7 @@ namespace alpaka
                     }
                 };
                 //#############################################################################
-                //!
+                //! The vectorization specialization for one element vectors.
                 //#############################################################################
                 template<>
                 struct Vectorize<
@@ -484,41 +489,29 @@ namespace alpaka
             }
 
             //-----------------------------------------------------------------------------
+            //! Calls the given function object fnObj count times trying to utilize vectorization with TVectorizationCount elements.
             //!
+            //! The supported number of elements per vector of a given type on the current architecture can be queried with registerGetVectorizationSizeElems<TElem>::value.
+            //! For optimal vectorization, the fnObj should be only consist of one line.
             //-----------------------------------------------------------------------------
             template<
+                std::size_t TvectorizationCount,
                 typename TSize,
-                typename TElem,
                 typename TFnObj>
             ALPAKA_FN_HOST auto vectorize(
                 TSize const & count,
                 TFnObj const & fnObj)
             -> void
             {
-#ifdef __CUDACC__   // nvcc seems not to support constexpr correctly
-                static_assert(
-                    (GetCurrentThreadVectorRegisterSizeElems<TElem>::value > 0),
-                    "The number of elements per vector register has to be greater zero!");
-
-                detail::Vectorize<
-                    GetCurrentThreadVectorRegisterSizeElems<TElem>::value>
-                ::vectorize(
-                    count,
-                    fnObj);
-#else
-                constexpr auto vecRegElems(
-                    GetCurrentThreadVectorRegisterSizeElems<TElem>::value);
-
                 static_assert(
-                    (vecRegElems > 0),
-                    "The number of elements per vector register has to be greater zero!");
+                    (TvectorizationCount > 0),
+                    "The number of loop invocations to vectorize (TvectorizationCount) has to be greater zero!");
 
                 detail::Vectorize<
-                    vecRegElems>
+                    TvectorizationCount>
                 ::vectorize(
                     count,
                     fnObj);
-#endif
             }
         }
     }
diff --git a/include/alpaka/dev/DevCpu.hpp b/include/alpaka/dev/DevCpu.hpp
index 89c7d10..89690ad 100644
--- a/include/alpaka/dev/DevCpu.hpp
+++ b/include/alpaka/dev/DevCpu.hpp
@@ -28,7 +28,7 @@
 #include <alpaka/mem/view/Traits.hpp>   // mem::view::traits::ViewType
 
 #include <alpaka/stream/Traits.hpp>     // stream::enqueue
-#include <alpaka/dev/cpu/SysInfo.hpp>   // getCpuName, getGlobalMemSizeBytes
+#include <alpaka/dev/cpu/SysInfo.hpp>   // getCpuName, getTotalGlobalMemSizeBytes, getFreeGlobalMemSizeBytes
 
 #include <boost/core/ignore_unused.hpp> // boost::ignore_unused
 
@@ -361,7 +361,7 @@ namespace alpaka
                 {
                     boost::ignore_unused(dev);
 
-                    return dev::cpu::detail::getGlobalMemSizeBytes();
+                    return dev::cpu::detail::getTotalGlobalMemSizeBytes();
                 }
             };
 
@@ -381,8 +381,7 @@ namespace alpaka
                 {
                     boost::ignore_unused(dev);
 
-                    // \FIXME: Get correct free memory size!
-                    return dev::cpu::detail::getGlobalMemSizeBytes();
+                    return dev::cpu::detail::getFreeGlobalMemSizeBytes();
                 }
             };
 
diff --git a/include/alpaka/dev/Traits.hpp b/include/alpaka/dev/Traits.hpp
index a010b63..e11f0c7 100644
--- a/include/alpaka/dev/Traits.hpp
+++ b/include/alpaka/dev/Traits.hpp
@@ -115,10 +115,11 @@ namespace alpaka
             T const & t)
         -> decltype(traits::GetDev<T>::getDev(t))
         {
-            return traits::GetDev<
-                T>
-            ::getDev(
-                t);
+            return
+                traits::GetDev<
+                    T>
+                ::getDev(
+                    t);
         }
 
         //-----------------------------------------------------------------------------
@@ -149,10 +150,11 @@ namespace alpaka
             TDev const & dev)
         -> std::string
         {
-            return traits::GetName<
-                TDev>
-            ::getName(
-                dev);
+            return
+                traits::GetName<
+                    TDev>
+                ::getName(
+                    dev);
         }
 
         //-----------------------------------------------------------------------------
@@ -164,10 +166,11 @@ namespace alpaka
             TDev const & dev)
         -> std::size_t
         {
-            return traits::GetMemBytes<
-                TDev>
-            ::getMemBytes(
-                dev);
+            return
+                traits::GetMemBytes<
+                    TDev>
+                ::getMemBytes(
+                    dev);
         }
 
         //-----------------------------------------------------------------------------
@@ -179,10 +182,11 @@ namespace alpaka
             TDev const & dev)
         -> std::size_t
         {
-            return traits::GetFreeMemBytes<
-                TDev>
-            ::getFreeMemBytes(
-                dev);
+            return
+                traits::GetFreeMemBytes<
+                    TDev>
+                ::getFreeMemBytes(
+                    dev);
         }
 
         //-----------------------------------------------------------------------------
diff --git a/include/alpaka/dev/cpu/SysInfo.hpp b/include/alpaka/dev/cpu/SysInfo.hpp
index b05da74..182641f 100644
--- a/include/alpaka/dev/cpu/SysInfo.hpp
+++ b/include/alpaka/dev/cpu/SysInfo.hpp
@@ -21,6 +21,8 @@
 
 #pragma once
 
+#include <boost/predef.h>   // BOOST_XXX
+
 #if BOOST_OS_WINDOWS || BOOST_OS_CYGWIN
     #ifndef NOMINMAX
         #define NOMINMAX
@@ -40,7 +42,9 @@
     #endif
 #endif
 
-#include <boost/predef.h>   // BOOST_XXX
+#if BOOST_OS_LINUX
+    #include <fstream>
+#endif
 
 #include <cstring>          // std::memcpy
 #include <string>           // std::string
@@ -54,7 +58,7 @@ namespace alpaka
 #if BOOST_ARCH_X86
             namespace detail
             {
-    #if BOOST_COMP_GNUC || BOOST_COMP_CLANG
+    #if BOOST_COMP_GNUC || BOOST_COMP_CLANG || __INTEL_COMPILER
         #include <cpuid.h>
                 //-----------------------------------------------------------------------------
                 //!
@@ -65,7 +69,7 @@ namespace alpaka
                     __cpuid_count(level, subfunction, ex[0], ex[1], ex[2], ex[3]);
                 }
 
-    #elif BOOST_COMP_MSVC || BOOST_COMP_INTEL
+    #elif BOOST_COMP_MSVC
         #include <intrin.h>
                 //-----------------------------------------------------------------------------
                 //!
@@ -90,7 +94,7 @@ namespace alpaka
                     std::uint32_t const nExIds(ex[0]);
 
                     // Get the information associated with each extended ID.
-                    char pCpuBrandString[0x40] = {0};
+                    char cpuBrandString[0x40] = {0};
                     for(std::uint32_t i(0x80000000); i<=nExIds; ++i)
                     {
                         cpuid(i, 0, ex);
@@ -98,18 +102,18 @@ namespace alpaka
                         // Interpret CPU brand string and cache information.
                         if(i == 0x80000002)
                         {
-                            std::memcpy(pCpuBrandString, ex, sizeof(ex));
+                            std::memcpy(cpuBrandString, ex, sizeof(ex));
                         }
                         else if(i == 0x80000003)
                         {
-                            std::memcpy(pCpuBrandString + 16, ex, sizeof(ex));
+                            std::memcpy(cpuBrandString + 16, ex, sizeof(ex));
                         }
                         else if(i == 0x80000004)
                         {
-                            std::memcpy(pCpuBrandString + 32, ex, sizeof(ex));
+                            std::memcpy(cpuBrandString + 32, ex, sizeof(ex));
                         }
                     }
-                    return std::string(pCpuBrandString);
+                    return std::string(cpuBrandString);
 #else
                     return "<unknown>";
 #endif
@@ -124,10 +128,10 @@ namespace alpaka
                     return 0;
                 }*/
                 //-----------------------------------------------------------------------------
-                //! \return The number of bytes of global memory.
+                //! \return The total number of bytes of global memory.
                 //! Adapted from David Robert Nadeau: http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system
                 //-----------------------------------------------------------------------------
-                inline auto getGlobalMemSizeBytes()
+                inline auto getTotalGlobalMemSizeBytes()
                 -> std::size_t
                 {
 #if BOOST_OS_WINDOWS
@@ -157,7 +161,7 @@ namespace alpaka
                     std::size_t const sizeLen{sizeof(size)};
                     if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
                     {
-                        throw std::logic_error("getGlobalMemSizeBytes failed calling sysctl!");
+                        throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
                     }
                     return static_cast<std::size_t>(size);
 
@@ -182,13 +186,58 @@ namespace alpaka
                     std::size_t const sizeLen{sizeof(size)};
                     if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
                     {
-                        throw std::logic_error("getGlobalMemSizeBytes failed calling sysctl!");
+                        throw std::logic_error("getTotalGlobalMemSizeBytes failed calling sysctl!");
                     }
                     return static_cast<std::size_t>(size);
     #endif
 
 #else
-                    throw std::logic_error("getGlobalMemSizeBytes not implemented for this system!");
+    #error "getTotalGlobalMemSizeBytes not implemented for this system!"
+#endif
+                }
+                //-----------------------------------------------------------------------------
+                //! \return The free number of bytes of global memory.
+                //! \throws std::logic_error if not implemented on the system and std::runtime_error on other errors.
+                //-----------------------------------------------------------------------------
+                inline auto getFreeGlobalMemSizeBytes()
+                -> std::size_t
+                {
+#if BOOST_OS_WINDOWS
+                    MEMORYSTATUSEX status;
+                    status.dwLength = sizeof(status);
+                    GlobalMemoryStatusEx(&status);
+                    return static_cast<std::size_t>(status.ullAvailPhys);
+
+#elif BOOST_OS_LINUX
+                    std::string token;
+                    std::ifstream file("/proc/meminfo");
+                    if(file)
+                    {
+                        while(file >> token)
+                        {
+                            if(token == "MemFree:")
+                            {
+                                std::size_t freeGlobalMemSizeBytes(0);
+                                if(file >> freeGlobalMemSizeBytes)
+                                {
+                                    return freeGlobalMemSizeBytes;
+                                }
+                                else
+                                {
+                                    throw std::runtime_error("Unable to read MemFree value!");
+                                }
+                            }
+                        }
+                        throw std::runtime_error("Unable to find MemFree in '/proc/meminfo'!");
+                    }
+                    else
+                    {
+                        throw std::runtime_error("Unable to open '/proc/meminfo'!");
+                    }
+#elif BOOST_OS_MACOS
+    #error "getFreeGlobalMemSizeBytes not implemented for __APPLE__!"
+#else
+    #error "getFreeGlobalMemSizeBytes not implemented for this system!"
 #endif
                 }
             }
diff --git a/include/alpaka/dim/DimArithmetic.hpp b/include/alpaka/dim/DimArithmetic.hpp
index c36f9a9..e48b091 100644
--- a/include/alpaka/dim/DimArithmetic.hpp
+++ b/include/alpaka/dim/DimArithmetic.hpp
@@ -43,7 +43,7 @@ namespace alpaka
                 T,
                 typename std::enable_if<std::is_arithmetic<T>::value>::type>
             {
-                using type = std::integral_constant<std::size_t, 1u>;
+                using type = dim::DimInt<1u>;
             };
         }
     }
diff --git a/include/alpaka/elem/Traits.hpp b/include/alpaka/elem/Traits.hpp
new file mode 100644
index 0000000..884a3de
--- /dev/null
+++ b/include/alpaka/elem/Traits.hpp
@@ -0,0 +1,72 @@
+/**
+* \file
+* Copyright 2014-2015 Benjamin Worpitz
+*
+* This file is part of alpaka.
+*
+* alpaka is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Lesser General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+*
+* alpaka is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License
+* along with alpaka.
+* If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <type_traits>  // std::enable_if, std::is_fundamental
+
+namespace alpaka
+{
+    //-----------------------------------------------------------------------------
+    //! The element specifics.
+    //-----------------------------------------------------------------------------
+    namespace elem
+    {
+        //-----------------------------------------------------------------------------
+        //! The element traits.
+        //-----------------------------------------------------------------------------
+        namespace traits
+        {
+            //#############################################################################
+            //! The element type trait.
+            //#############################################################################
+            template<
+                typename TView,
+                typename TSfinae = void>
+            struct ElemType;
+        }
+
+        //#############################################################################
+        //! The element type trait alias template to remove the ::type.
+        //#############################################################################
+        template<
+            typename TView>
+        using Elem = typename std::remove_volatile<typename traits::ElemType<TView>::type>::type;
+
+        //-----------------------------------------------------------------------------
+        // Trait specializations for unsigned integral types.
+        //-----------------------------------------------------------------------------
+        namespace traits
+        {
+            //#############################################################################
+            //! The fundamental type elem type trait specialization.
+            //#############################################################################
+            template<
+                typename T>
+            struct ElemType<
+                T,
+                typename std::enable_if<std::is_fundamental<T>::value>::type>
+            {
+                using type = T;
+            };
+        }
+    }
+}
diff --git a/include/alpaka/event/EventCpu.hpp b/include/alpaka/event/EventCpu.hpp
index 0ec411b..3ce682f 100644
--- a/include/alpaka/event/EventCpu.hpp
+++ b/include/alpaka/event/EventCpu.hpp
@@ -261,7 +261,7 @@ namespace alpaka
 #endif
                             return;
                         }
-                        // ... and was enqueued before, increment the cancel counter.
+                        // ... and noby is waiting for it, increment the cancel counter.
                         else
                         {
                             ++spEventCpuImpl->m_canceledEnqueueCount;
@@ -353,11 +353,11 @@ namespace alpaka
                             if(spEventCpuImpl->m_bIsWaitedFor)
                             {
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-                               std::cout << BOOST_CURRENT_FUNCTION << "WARNING: The event to enqueue is already enqueued AND waited on. It can NOT be re-enqueued!" << std::endl;
+                                std::cout << BOOST_CURRENT_FUNCTION << "WARNING: The event to enqueue is already enqueued AND waited on. It can NOT be re-enqueued!" << std::endl;
 #endif
                                 return;
                             }
-                            // ... and was enqueued before, increment the cancel counter.
+                            // ... and noby is waiting for it, increment the cancel counter.
                             else
                             {
                                 ++spEventCpuImpl->m_canceledEnqueueCount;
diff --git a/include/alpaka/event/Traits.hpp b/include/alpaka/event/Traits.hpp
index a67d177..0270da8 100644
--- a/include/alpaka/event/Traits.hpp
+++ b/include/alpaka/event/Traits.hpp
@@ -80,10 +80,11 @@ namespace alpaka
             TEvent const & event)
         -> bool
         {
-            return traits::EventTest<
-                TEvent>
-            ::eventTest(
-                event);
+            return
+                traits::EventTest<
+                    TEvent>
+                ::eventTest(
+                    event);
         }
     }
 }
diff --git a/include/alpaka/exec/ExecCpuOmp2Blocks.hpp b/include/alpaka/exec/ExecCpuOmp2Blocks.hpp
index 036ab38..0f358ff 100644
--- a/include/alpaka/exec/ExecCpuOmp2Blocks.hpp
+++ b/include/alpaka/exec/ExecCpuOmp2Blocks.hpp
@@ -192,9 +192,9 @@ namespace alpaka
                         acc.m_gridBlockIdx =
                             core::mapIdx<TDim::value>(
 #if _OPENMP < 200805
-                                Vec1<TSize>(static_cast<TSize>(i)),
+                                Vec<dim::DimInt<1u>, TSize>(static_cast<TSize>(i)),
 #else
-                                Vec1<TSize>(i),
+                                Vec<dim::DimInt<1u>, TSize>(i),
 #endif
                                 gridBlockExtents);
 
diff --git a/include/alpaka/exec/ExecCpuOmp4.hpp b/include/alpaka/exec/ExecCpuOmp4.hpp
index 4d749d3..0c77bca 100644
--- a/include/alpaka/exec/ExecCpuOmp4.hpp
+++ b/include/alpaka/exec/ExecCpuOmp4.hpp
@@ -182,7 +182,7 @@ namespace alpaka
                         #pragma omp distribute
                         for(TSize b = 0u; b<numBlocksInGrid; ++b)
                         {
-                            Vec1<TSize> const gridBlockIdx(b);
+                            Vec<dim::DimInt<1u>, TSize> const gridBlockIdx(b);
                             // When this is not repeated here:
                             // error: �gridBlockExtents� referenced in target region does not have a mappable type
                             auto const gridBlockExtents2(
@@ -210,7 +210,7 @@ namespace alpaka
                                     int const numThreads(::omp_get_num_threads());
                                     // NOTE: No std::cout in omp target!
                                     printf("%s omp_get_num_threads: %d\n", BOOST_CURRENT_FUNCTION, numThreads);
-                                    if(numThreads != numThreadsInBlock)
+                                    if(numThreads != static_cast<int>(numThreadsInBlock))
                                     {
                                         throw std::runtime_error("The CPU OpenMP4 runtime did not use the number of threads that had been required!");
                                     }
diff --git a/include/alpaka/exec/ExecGpuCudaRt.hpp b/include/alpaka/exec/ExecGpuCudaRt.hpp
index 7d389ac..705905c 100644
--- a/include/alpaka/exec/ExecGpuCudaRt.hpp
+++ b/include/alpaka/exec/ExecGpuCudaRt.hpp
@@ -326,8 +326,10 @@ namespace alpaka
                     }
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
-                    std::cout << BOOST_CURRENT_FUNCTION << "gridDim: " <<  gridDim.z << " " <<  gridDim.y << " " <<  gridDim.x << std::endl;
-                    std::cout << BOOST_CURRENT_FUNCTION << "blockDim: " <<  blockDim.z << " " <<  blockDim.y << " " <<  blockDim.x << std::endl;
+                    std::cout << BOOST_CURRENT_FUNCTION
+                        << " gridDim: " <<  gridDim.z << " " <<  gridDim.y << " " <<  gridDim.x
+                        << " blockDim: " <<  blockDim.z << " " <<  blockDim.y << " " <<  blockDim.x
+                        << std::endl;
 #endif
 
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
diff --git a/include/alpaka/extent/Traits.hpp b/include/alpaka/extent/Traits.hpp
index cb8366b..2478e3e 100644
--- a/include/alpaka/extent/Traits.hpp
+++ b/include/alpaka/extent/Traits.hpp
@@ -89,7 +89,7 @@ namespace alpaka
         {
             return
                 traits::GetExtent<
-                    std::integral_constant<std::size_t, Tidx>,
+                    dim::DimInt<Tidx>,
                     TExtents>
                 ::getExtent(
                     extents);
@@ -186,7 +186,7 @@ namespace alpaka
         -> void
         {
             traits::SetExtent<
-                std::integral_constant<std::size_t, Tidx>,
+                dim::DimInt<Tidx>,
                 TExtents,
                 TExtent>
             ::setExtent(
@@ -247,7 +247,7 @@ namespace alpaka
             template<
                 typename TExtents>
             struct GetExtent<
-                std::integral_constant<std::size_t, 0u>,
+                dim::DimInt<0u>,
                 TExtents,
                 typename std::enable_if<
                     std::is_integral<TExtents>::value>::type>
@@ -267,7 +267,7 @@ namespace alpaka
                 typename TExtents,
                 typename TExtent>
             struct SetExtent<
-                std::integral_constant<std::size_t, 0u>,
+                dim::DimInt<0u>,
                 TExtents,
                 TExtent,
                 typename std::enable_if<
diff --git a/include/alpaka/idx/Traits.hpp b/include/alpaka/idx/Traits.hpp
index 244b8cb..5393c7a 100644
--- a/include/alpaka/idx/Traits.hpp
+++ b/include/alpaka/idx/Traits.hpp
@@ -69,13 +69,14 @@ namespace alpaka
             TWorkDiv const & workDiv)
         -> Vec<dim::Dim<TWorkDiv>, size::Size<TIdx>>
         {
-            return traits::GetIdx<
-                TIdx,
-                TOrigin,
-                TUnit>
-            ::getIdx(
-                idx,
-                workDiv);
+            return
+                traits::GetIdx<
+                    TIdx,
+                    TOrigin,
+                    TUnit>
+                ::getIdx(
+                    idx,
+                    workDiv);
         }
         //-----------------------------------------------------------------------------
         //! Get the indices requested.
@@ -89,13 +90,14 @@ namespace alpaka
             TIdxWorkDiv const & idxWorkDiv)
         -> Vec<dim::Dim<TIdxWorkDiv>, size::Size<TIdxWorkDiv>>
         {
-            return traits::GetIdx<
-                TIdxWorkDiv,
-                TOrigin,
-                TUnit>
-            ::getIdx(
-                idxWorkDiv,
-                idxWorkDiv);
+            return
+                traits::GetIdx<
+                    TIdxWorkDiv,
+                    TOrigin,
+                    TUnit>
+                ::getIdx(
+                    idxWorkDiv,
+                    idxWorkDiv);
         }
 
         namespace traits
diff --git a/include/alpaka/idx/bt/IdxBtOmp.hpp b/include/alpaka/idx/bt/IdxBtOmp.hpp
index 0c08871..6e752c8 100644
--- a/include/alpaka/idx/bt/IdxBtOmp.hpp
+++ b/include/alpaka/idx/bt/IdxBtOmp.hpp
@@ -120,7 +120,7 @@ namespace alpaka
                     assert(::omp_get_thread_num()>=0);
                     // \TODO: Would it be faster to precompute the index and cache it inside an array?
                     return core::mapIdx<TDim::value>(
-                        Vec1<TSize>(static_cast<TSize>(::omp_get_thread_num())),
+                        Vec<dim::DimInt<1u>, TSize>(static_cast<TSize>(::omp_get_thread_num())),
                         workdiv::getWorkDiv<Block, Threads>(workDiv));
                 }
             };
diff --git a/include/alpaka/math/abs/Traits.hpp b/include/alpaka/math/abs/Traits.hpp
index 44ffbcf..838b849 100644
--- a/include/alpaka/math/abs/Traits.hpp
+++ b/include/alpaka/math/abs/Traits.hpp
@@ -67,12 +67,13 @@ namespace alpaka
                 abs,
                 arg))
         {
-            return traits::Abs<
-                T,
-                TArg>
-            ::abs(
-                abs,
-                arg);
+            return
+                traits::Abs<
+                    T,
+                    TArg>
+                ::abs(
+                    abs,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/acos/Traits.hpp b/include/alpaka/math/acos/Traits.hpp
index b795145..1fb240e 100644
--- a/include/alpaka/math/acos/Traits.hpp
+++ b/include/alpaka/math/acos/Traits.hpp
@@ -63,12 +63,13 @@ namespace alpaka
                 acos,
                 arg))
         {
-            return traits::Acos<
-                T,
-                TArg>
-            ::acos(
-                acos,
-                arg);
+            return
+                traits::Acos<
+                    T,
+                    TArg>
+                ::acos(
+                    acos,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/asin/Traits.hpp b/include/alpaka/math/asin/Traits.hpp
index f02f31b..f831eee 100644
--- a/include/alpaka/math/asin/Traits.hpp
+++ b/include/alpaka/math/asin/Traits.hpp
@@ -63,12 +63,13 @@ namespace alpaka
                 asin,
                 arg))
         {
-            return traits::Asin<
-                T,
-                TArg>
-            ::asin(
-                asin,
-                arg);
+            return
+                traits::Asin<
+                    T,
+                    TArg>
+                ::asin(
+                    asin,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/atan/Traits.hpp b/include/alpaka/math/atan/Traits.hpp
index 640a532..27ab455 100644
--- a/include/alpaka/math/atan/Traits.hpp
+++ b/include/alpaka/math/atan/Traits.hpp
@@ -63,12 +63,13 @@ namespace alpaka
                 atan,
                 arg))
         {
-            return traits::Atan<
-                T,
-                TArg>
-            ::atan(
-                atan,
-                arg);
+            return
+                traits::Atan<
+                    T,
+                    TArg>
+                ::atan(
+                    atan,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/atan2/Traits.hpp b/include/alpaka/math/atan2/Traits.hpp
index e1c341b..5b8958a 100644
--- a/include/alpaka/math/atan2/Traits.hpp
+++ b/include/alpaka/math/atan2/Traits.hpp
@@ -71,14 +71,15 @@ namespace alpaka
                 y,
                 x))
         {
-            return traits::Atan2<
-                T,
-                Ty,
-                Tx>
-            ::atan2(
-                atan2,
-                y,
-                x);
+            return
+                traits::Atan2<
+                    T,
+                    Ty,
+                    Tx>
+                ::atan2(
+                    atan2,
+                    y,
+                    x);
         }
 
         namespace traits
diff --git a/include/alpaka/math/cbrt/Traits.hpp b/include/alpaka/math/cbrt/Traits.hpp
index 5f4b454..d24959b 100644
--- a/include/alpaka/math/cbrt/Traits.hpp
+++ b/include/alpaka/math/cbrt/Traits.hpp
@@ -64,12 +64,13 @@ namespace alpaka
                 cbrt,
                 arg))
         {
-            return traits::Cbrt<
-                T,
-                TArg>
-            ::cbrt(
-                cbrt,
-                arg);
+            return
+                traits::Cbrt<
+                    T,
+                    TArg>
+                ::cbrt(
+                    cbrt,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/ceil/Traits.hpp b/include/alpaka/math/ceil/Traits.hpp
index 8fcef22..1aab94e 100644
--- a/include/alpaka/math/ceil/Traits.hpp
+++ b/include/alpaka/math/ceil/Traits.hpp
@@ -64,12 +64,13 @@ namespace alpaka
                 ceil,
                 arg))
         {
-            return traits::Ceil<
-                T,
-                TArg>
-            ::ceil(
-                ceil,
-                arg);
+            return
+                traits::Ceil<
+                    T,
+                    TArg>
+                ::ceil(
+                    ceil,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/cos/Traits.hpp b/include/alpaka/math/cos/Traits.hpp
index 26ae1a9..f74e41a 100644
--- a/include/alpaka/math/cos/Traits.hpp
+++ b/include/alpaka/math/cos/Traits.hpp
@@ -64,12 +64,13 @@ namespace alpaka
                 cos,
                 arg))
         {
-            return traits::Cos<
-                T,
-                TArg>
-            ::cos(
-                cos,
-                arg);
+            return
+                traits::Cos<
+                    T,
+                    TArg>
+                ::cos(
+                    cos,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/erf/Traits.hpp b/include/alpaka/math/erf/Traits.hpp
index 372cc52..2b1d453 100644
--- a/include/alpaka/math/erf/Traits.hpp
+++ b/include/alpaka/math/erf/Traits.hpp
@@ -64,12 +64,13 @@ namespace alpaka
                 erf,
                 arg))
         {
-            return traits::Erf<
-                T,
-                TArg>
-            ::erf(
-                erf,
-                arg);
+            return
+                traits::Erf<
+                    T,
+                    TArg>
+                ::erf(
+                    erf,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/exp/Traits.hpp b/include/alpaka/math/exp/Traits.hpp
index 45a21b4..99544d7 100644
--- a/include/alpaka/math/exp/Traits.hpp
+++ b/include/alpaka/math/exp/Traits.hpp
@@ -63,12 +63,13 @@ namespace alpaka
                 exp,
                 arg))
         {
-            return traits::Exp<
-                T,
-                TArg>
-            ::exp(
-                exp,
-                arg);
+            return
+                traits::Exp<
+                    T,
+                    TArg>
+                ::exp(
+                    exp,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/floor/Traits.hpp b/include/alpaka/math/floor/Traits.hpp
index 926e1e4..60fd5a2 100644
--- a/include/alpaka/math/floor/Traits.hpp
+++ b/include/alpaka/math/floor/Traits.hpp
@@ -64,12 +64,13 @@ namespace alpaka
                 floor,
                 arg))
         {
-            return traits::Floor<
-                T,
-                TArg>
-            ::floor(
-                floor,
-                arg);
+            return
+                traits::Floor<
+                    T,
+                    TArg>
+                ::floor(
+                    floor,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/fmod/Traits.hpp b/include/alpaka/math/fmod/Traits.hpp
index 9726a7e..e172ca2 100644
--- a/include/alpaka/math/fmod/Traits.hpp
+++ b/include/alpaka/math/fmod/Traits.hpp
@@ -71,14 +71,15 @@ namespace alpaka
                 x,
                 y))
         {
-            return traits::Fmod<
-                T,
-                Tx,
-                Ty>
-            ::fmod(
-                fmod,
-                x,
-                y);
+            return
+                traits::Fmod<
+                    T,
+                    Tx,
+                    Ty>
+                ::fmod(
+                    fmod,
+                    x,
+                    y);
         }
 
         namespace traits
diff --git a/include/alpaka/math/log/Traits.hpp b/include/alpaka/math/log/Traits.hpp
index 7ce49c0..e8d9b6d 100644
--- a/include/alpaka/math/log/Traits.hpp
+++ b/include/alpaka/math/log/Traits.hpp
@@ -64,12 +64,13 @@ namespace alpaka
                 log,
                 arg))
         {
-            return traits::Log<
-                T,
-                TArg>
-            ::log(
-                log,
-                arg);
+            return
+                traits::Log<
+                    T,
+                    TArg>
+                ::log(
+                    log,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/max/Traits.hpp b/include/alpaka/math/max/Traits.hpp
index e882898..7d9f25a 100644
--- a/include/alpaka/math/max/Traits.hpp
+++ b/include/alpaka/math/max/Traits.hpp
@@ -72,14 +72,15 @@ namespace alpaka
                 x,
                 y))
         {
-            return traits::Max<
-                T,
-                Tx,
-                Ty>
-            ::max(
-                max,
-                x,
-                y);
+            return
+                traits::Max<
+                    T,
+                    Tx,
+                    Ty>
+                ::max(
+                    max,
+                    x,
+                    y);
         }
 
         namespace traits
diff --git a/include/alpaka/math/min/Traits.hpp b/include/alpaka/math/min/Traits.hpp
index ea90616..f3cca44 100644
--- a/include/alpaka/math/min/Traits.hpp
+++ b/include/alpaka/math/min/Traits.hpp
@@ -72,14 +72,15 @@ namespace alpaka
                 x,
                 y))
         {
-            return traits::Min<
-                T,
-                Tx,
-                Ty>
-            ::min(
-                min,
-                x,
-                y);
+            return
+                traits::Min<
+                    T,
+                    Tx,
+                    Ty>
+                ::min(
+                    min,
+                    x,
+                    y);
         }
 
         namespace traits
diff --git a/include/alpaka/math/pow/Traits.hpp b/include/alpaka/math/pow/Traits.hpp
index 61e752e..e77bb93 100644
--- a/include/alpaka/math/pow/Traits.hpp
+++ b/include/alpaka/math/pow/Traits.hpp
@@ -71,14 +71,15 @@ namespace alpaka
                 base,
                 exp))
         {
-            return traits::Pow<
-                T,
-                TBase,
-                TExp>
-            ::pow(
-                pow,
-                base,
-                exp);
+            return
+                traits::Pow<
+                    T,
+                    TBase,
+                    TExp>
+                ::pow(
+                    pow,
+                    base,
+                    exp);
         }
 
         namespace traits
diff --git a/include/alpaka/math/remainder/Traits.hpp b/include/alpaka/math/remainder/Traits.hpp
index 8e7e8ba..54db37a 100644
--- a/include/alpaka/math/remainder/Traits.hpp
+++ b/include/alpaka/math/remainder/Traits.hpp
@@ -71,14 +71,15 @@ namespace alpaka
                 x,
                 y))
         {
-            return traits::Remainder<
-                T,
-                Tx,
-                Ty>
-            ::remainder(
-                remainder,
-                x,
-                y);
+            return
+                traits::Remainder<
+                    T,
+                    Tx,
+                    Ty>
+                ::remainder(
+                    remainder,
+                    x,
+                    y);
         }
 
         namespace traits
diff --git a/include/alpaka/math/round/Traits.hpp b/include/alpaka/math/round/Traits.hpp
index f087490..248156d 100644
--- a/include/alpaka/math/round/Traits.hpp
+++ b/include/alpaka/math/round/Traits.hpp
@@ -82,12 +82,13 @@ namespace alpaka
                 round,
                 arg))
         {
-            return traits::Round<
-                T,
-                TArg>
-            ::round(
-                round,
-                arg);
+            return
+                traits::Round<
+                    T,
+                    TArg>
+                ::round(
+                    round,
+                    arg);
         }
         //-----------------------------------------------------------------------------
         //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero, regardless of the current rounding mode.
@@ -106,12 +107,13 @@ namespace alpaka
             TArg const & arg)
         -> long int
         {
-            return traits::Lround<
-                T,
-                TArg>
-            ::lround(
-                lround,
-                arg);
+            return
+                traits::Lround<
+                    T,
+                    TArg>
+                ::lround(
+                    lround,
+                    arg);
         }
         //-----------------------------------------------------------------------------
         //! Computes the nearest integer value to arg (in integer format), rounding halfway cases away from zero, regardless of the current rounding mode.
@@ -130,12 +132,13 @@ namespace alpaka
             TArg const & arg)
         -> long long int
         {
-            return traits::Llround<
-                T,
-                TArg>
-            ::llround(
-                llround,
-                arg);
+            return
+                traits::Llround<
+                    T,
+                    TArg>
+                ::llround(
+                    llround,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/rsqrt/Traits.hpp b/include/alpaka/math/rsqrt/Traits.hpp
index 5f550aa..6f62c6b 100644
--- a/include/alpaka/math/rsqrt/Traits.hpp
+++ b/include/alpaka/math/rsqrt/Traits.hpp
@@ -64,12 +64,13 @@ namespace alpaka
                 rsqrt,
                 arg))
         {
-            return traits::Rsqrt<
-                T,
-                TArg>
-            ::rsqrt(
-                rsqrt,
-                arg);
+            return
+                traits::Rsqrt<
+                    T,
+                    TArg>
+                ::rsqrt(
+                    rsqrt,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/sin/Traits.hpp b/include/alpaka/math/sin/Traits.hpp
index f495040..4bd88fc 100644
--- a/include/alpaka/math/sin/Traits.hpp
+++ b/include/alpaka/math/sin/Traits.hpp
@@ -64,12 +64,13 @@ namespace alpaka
                 sin,
                 arg))
         {
-            return traits::Sin<
-                T,
-                TArg>
-            ::sin(
-                sin,
-                arg);
+            return
+                traits::Sin<
+                    T,
+                    TArg>
+                ::sin(
+                    sin,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/sqrt/Traits.hpp b/include/alpaka/math/sqrt/Traits.hpp
index 62bef93..1d8dc26 100644
--- a/include/alpaka/math/sqrt/Traits.hpp
+++ b/include/alpaka/math/sqrt/Traits.hpp
@@ -64,12 +64,13 @@ namespace alpaka
                 sqrt,
                 arg))
         {
-            return traits::Sqrt<
-                T,
-                TArg>
-            ::sqrt(
-                sqrt,
-                arg);
+            return
+                traits::Sqrt<
+                    T,
+                    TArg>
+                ::sqrt(
+                    sqrt,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/tan/Traits.hpp b/include/alpaka/math/tan/Traits.hpp
index 1dfb74d..2874be1 100644
--- a/include/alpaka/math/tan/Traits.hpp
+++ b/include/alpaka/math/tan/Traits.hpp
@@ -64,12 +64,13 @@ namespace alpaka
                 tan,
                 arg))
         {
-            return traits::Tan<
-                T,
-                TArg>
-            ::tan(
-                tan,
-                arg);
+            return
+                traits::Tan<
+                    T,
+                    TArg>
+                ::tan(
+                    tan,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/trunc/Traits.hpp b/include/alpaka/math/trunc/Traits.hpp
index f3e4015..0cbe5ec 100644
--- a/include/alpaka/math/trunc/Traits.hpp
+++ b/include/alpaka/math/trunc/Traits.hpp
@@ -64,12 +64,13 @@ namespace alpaka
                 trunc,
                 arg))
         {
-            return traits::Trunc<
-                T,
-                TArg>
-            ::trunc(
-                trunc,
-                arg);
+            return
+                traits::Trunc<
+                    T,
+                    TArg>
+                ::trunc(
+                    trunc,
+                    arg);
         }
 
         namespace traits
diff --git a/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp b/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp
index bfcca80..631a707 100644
--- a/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp
+++ b/include/alpaka/math/trunc/TruncCudaBuiltIn.hpp
@@ -1,6 +1,6 @@
 /**
 * \file
-* Copyright 2014-2015 Benjarint Worpitz
+* Copyright 2014-2015 Benjarint Worpitz, Rene Widera
 *
 * This file is part of alpaka.
 *
@@ -55,11 +55,10 @@ namespace alpaka
                     std::is_floating_point<TArg>::value>::type>
             {
                 ALPAKA_FN_ACC_CUDA_ONLY static auto trunc(
-                    TruncCudaBuiltIn const & trunc,
+                    TruncCudaBuiltIn const & /*trunc*/,
                     TArg const & arg)
                 -> decltype(::trunc(arg))
                 {
-                    boost::ignore_unused(trunc);
                     return ::trunc(arg);
                 }
             };
diff --git a/include/alpaka/mem/buf/BufCpu.hpp b/include/alpaka/mem/buf/BufCpu.hpp
index d3a6002..8878c88 100644
--- a/include/alpaka/mem/buf/BufCpu.hpp
+++ b/include/alpaka/mem/buf/BufCpu.hpp
@@ -24,7 +24,7 @@
 #include <alpaka/dev/Traits.hpp>            // dev::traits::DevType
 #include <alpaka/mem/buf/Traits.hpp>        // mem::buf::Alloc, ...
 
-#include <alpaka/vec/Vec.hpp>               // Vec<TDim, TSize>
+#include <alpaka/vec/Vec.hpp>               // Vec
 
 // \TODO: Remove CUDA inclusion for BufCpu by replacing pinning with non CUDA code!
 #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDACC__)
@@ -252,6 +252,24 @@ namespace alpaka
             };
         }
     }
+    namespace elem
+    {
+        namespace traits
+        {
+            //#############################################################################
+            //! The BufCpu memory element type get trait specialization.
+            //#############################################################################
+            template<
+                typename TElem,
+                typename TDim,
+                typename TSize>
+            struct ElemType<
+                mem::buf::BufCpu<TElem, TDim, TSize>>
+            {
+                using type = TElem;
+            };
+        }
+    }
     namespace extent
     {
         namespace traits
@@ -287,18 +305,6 @@ namespace alpaka
         {
             namespace traits
             {
-                //#############################################################################
-                //! The BufCpu memory element type get trait specialization.
-                //#############################################################################
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TSize>
-                struct ElemType<
-                    mem::buf::BufCpu<TElem, TDim, TSize>>
-                {
-                    using type = TElem;
-                };
                 //#############################################################################
                 //! The BufCpu buf trait specialization.
                 //#############################################################################
@@ -411,7 +417,7 @@ namespace alpaka
                     typename TDim,
                     typename TSize>
                 struct GetPitchBytes<
-                    std::integral_constant<std::size_t, TDim::value - 1u>,
+                    dim::DimInt<TDim::value - 1u>,
                     mem::buf::BufCpu<TElem, TDim, TSize>>
                 {
                     //-----------------------------------------------------------------------------
@@ -548,7 +554,7 @@ namespace alpaka
                             ALPAKA_CUDA_RT_CHECK_IGNORE(
                                 cudaHostRegister(
                                     const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                    extent::getProductOfExtents(buf) * sizeof(mem::view::Elem<buf::BufCpu<TElem, TDim, TSize>>),
+                                    extent::getProductOfExtents(buf) * sizeof(elem::Elem<buf::BufCpu<TElem, TDim, TSize>>),
                                     cudaHostRegisterDefault),
                                 cudaErrorHostMemoryAlreadyRegistered);
 
diff --git a/include/alpaka/mem/buf/BufCudaRt.hpp b/include/alpaka/mem/buf/BufCudaRt.hpp
index ebf1ee5..04d5666 100644
--- a/include/alpaka/mem/buf/BufCudaRt.hpp
+++ b/include/alpaka/mem/buf/BufCudaRt.hpp
@@ -23,9 +23,9 @@
 
 #include <alpaka/dev/Traits.hpp>            // dev::traits::DevType
 #include <alpaka/dim/DimIntegralConst.hpp>  // dim::DimInt<N>
-#include <alpaka/mem/buf/Traits.hpp>        // view::Copy, ...
+#include <alpaka/mem/buf/Traits.hpp>        // mem::view::Copy, ...
 
-#include <alpaka/vec/Vec.hpp>               // Vec<TDim, TSize>
+#include <alpaka/vec/Vec.hpp>               // Vec
 #include <alpaka/core/Cuda.hpp>             // cudaMalloc, ...
 
 #include <cassert>                          // assert
@@ -80,7 +80,9 @@ namespace alpaka
                         m_extentsElements(extent::getExtentsVecEnd<TDim>(extents)),
                         m_spMem(
                             pMem,
-                            std::bind(&BufCudaRt::freeBuffer, std::placeholders::_1, std::ref(m_dev))),
+                            // NOTE: Because the BufCudaRt object can be copied and the original object could have been destroyed,
+                            // a std::ref(m_dev) or a this pointer can not be bound to the callback because they are not always valid at time of destruction.
+                            std::bind(&BufCudaRt::freeBuffer, std::placeholders::_1, m_dev)),
                         m_pitchBytes(pitchBytes)
                 {
                     ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
@@ -115,7 +117,7 @@ namespace alpaka
                 }
 
             public:
-                dev::DevCudaRt m_dev;
+                dev::DevCudaRt m_dev;               // NOTE: The device has to be destructed after the memory pointer because it is required for destruction.
                 Vec<TDim, TSize> m_extentsElements;
                 std::shared_ptr<TElem> m_spMem;
                 TSize m_pitchBytes;
@@ -179,6 +181,24 @@ namespace alpaka
             };
         }
     }
+    namespace elem
+    {
+        namespace traits
+        {
+            //#############################################################################
+            //! The BufCudaRt memory element type get trait specialization.
+            //#############################################################################
+            template<
+                typename TElem,
+                typename TDim,
+                typename TSize>
+            struct ElemType<
+                mem::buf::BufCudaRt<TElem, TDim, TSize>>
+            {
+                using type = TElem;
+            };
+        }
+    }
     namespace extent
     {
         namespace traits
@@ -214,18 +234,6 @@ namespace alpaka
         {
             namespace traits
             {
-                //#############################################################################
-                //! The BufCudaRt memory element type get trait specialization.
-                //#############################################################################
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TSize>
-                struct ElemType<
-                    mem::buf::BufCudaRt<TElem, TDim, TSize>>
-                {
-                    using type = TElem;
-                };
                 //#############################################################################
                 //! The BufCudaRt buf trait specialization.
                 //#############################################################################
@@ -338,7 +346,7 @@ namespace alpaka
                     typename TDim,
                     typename TSize>
                 struct GetPitchBytes<
-                    std::integral_constant<std::size_t, TDim::value - 1u>,
+                    dim::DimInt<TDim::value - 1u>,
                     mem::buf::BufCudaRt<TElem, TDim, TSize>>
                 {
                     //-----------------------------------------------------------------------------
@@ -753,7 +761,7 @@ namespace alpaka
                             ALPAKA_CUDA_RT_CHECK(
                                 cudaHostRegister(
                                     const_cast<void *>(reinterpret_cast<void const *>(mem::view::getPtrNative(buf))),
-                                    extent::getProductOfExtents(buf) * sizeof(mem::view::Elem<BufCpu<TElem, TDim, TSize>>),
+                                    extent::getProductOfExtents(buf) * sizeof(elem::Elem<BufCpu<TElem, TDim, TSize>>),
                                     cudaHostRegisterMapped));
                         }
                         // If it is already the same device, nothing has to be mapped.
diff --git a/include/alpaka/mem/buf/BufPlainPtrWrapper.hpp b/include/alpaka/mem/buf/BufPlainPtrWrapper.hpp
index 3bc2ea1..d61f319 100644
--- a/include/alpaka/mem/buf/BufPlainPtrWrapper.hpp
+++ b/include/alpaka/mem/buf/BufPlainPtrWrapper.hpp
@@ -176,6 +176,25 @@ namespace alpaka
             };
         }
     }
+    namespace elem
+    {
+        namespace traits
+        {
+            //#############################################################################
+            //! The BufPlainPtrWrapper memory element type get trait specialization.
+            //#############################################################################
+            template<
+                typename TDev,
+                typename TElem,
+                typename TDim,
+                typename TSize>
+            struct ElemType<
+                mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize>>
+            {
+                using type = TElem;
+            };
+        }
+    }
     namespace extent
     {
         namespace traits
@@ -210,20 +229,6 @@ namespace alpaka
         {
             namespace traits
             {
-                //#############################################################################
-                //! The BufPlainPtrWrapper memory element type get trait specialization.
-                //#############################################################################
-                template<
-                    typename TDev,
-                    typename TElem,
-                    typename TDim,
-                    typename TSize>
-                struct ElemType<
-                    buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize>>
-                {
-                    using type = TElem;
-                };
-
                 //#############################################################################
                 //! The BufPlainPtrWrapper buf trait specialization.
                 //#############################################################################
@@ -233,15 +238,15 @@ namespace alpaka
                     typename TDim,
                     typename TSize>
                 struct GetBuf<
-                    buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize>>
+                    mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize>>
                 {
                     //-----------------------------------------------------------------------------
                     //!
                     //-----------------------------------------------------------------------------
                     ALPAKA_NO_HOST_ACC_WARNING
                     ALPAKA_FN_HOST_ACC static auto getBuf(
-                        buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> const & buf)
-                    -> buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> const &
+                        mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> const & buf)
+                    -> mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> const &
                     {
                         return buf;
                     }
@@ -250,8 +255,8 @@ namespace alpaka
                     //-----------------------------------------------------------------------------
                     ALPAKA_NO_HOST_ACC_WARNING
                     ALPAKA_FN_HOST_ACC static auto getBuf(
-                        buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> & buf)
-                    -> buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> &
+                        mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> & buf)
+                    -> mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> &
                     {
                         return buf;
                     }
@@ -266,18 +271,18 @@ namespace alpaka
                     typename TDim,
                     typename TSize>
                 struct GetPtrNative<
-                    buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize>>
+                    mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize>>
                 {
                     ALPAKA_NO_HOST_ACC_WARNING
                     ALPAKA_FN_HOST_ACC static auto getPtrNative(
-                        buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> const & buf)
+                        mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> const & buf)
                     -> TElem const *
                     {
                         return buf.m_pMem;
                     }
                     ALPAKA_NO_HOST_ACC_WARNING
                     ALPAKA_FN_HOST_ACC static auto getPtrNative(
-                        buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> & buf)
+                        mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> & buf)
                     -> TElem *
                     {
                         return buf.m_pMem;
@@ -293,12 +298,12 @@ namespace alpaka
                     typename TDim,
                     typename TSize>
                 struct GetPitchBytes<
-                    std::integral_constant<std::size_t, 0u>,
-                    buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize>>
+                    dim::DimInt<TDim::value - 1u>,
+                    mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize>>
                 {
                     ALPAKA_NO_HOST_ACC_WARNING
                     ALPAKA_FN_HOST_ACC static auto getPitchBytes(
-                        buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> const & buf)
+                        mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize> const & buf)
                     -> TSize
                     {
                         return buf.m_pitchBytes;
diff --git a/include/alpaka/mem/buf/BufStdContainers.hpp b/include/alpaka/mem/buf/BufStdContainers.hpp
index 267970d..69885d9 100644
--- a/include/alpaka/mem/buf/BufStdContainers.hpp
+++ b/include/alpaka/mem/buf/BufStdContainers.hpp
@@ -97,6 +97,24 @@ namespace alpaka
             };
         }
     }
+    namespace elem
+    {
+        namespace traits
+        {
+            //#############################################################################
+            //! The fixed size array memory element type get trait specialization.
+            //#############################################################################
+            template<
+                typename TFixedSizeArray>
+            struct ElemType<
+                TFixedSizeArray,
+                typename std::enable_if<
+                    std::is_array<TFixedSizeArray>::value>::type>
+            {
+                using type = typename std::remove_all_extents<TFixedSizeArray>::type;
+            };
+        }
+    }
     namespace extent
     {
         namespace traits
@@ -136,19 +154,6 @@ namespace alpaka
         {
             namespace traits
             {
-                //#############################################################################
-                //! The fixed size array memory element type get trait specialization.
-                //#############################################################################
-                template<
-                    typename TFixedSizeArray>
-                struct ElemType<
-                    TFixedSizeArray,
-                    typename std::enable_if<
-                        std::is_array<TFixedSizeArray>::value>::type>
-                {
-                    using type = typename std::remove_all_extents<TFixedSizeArray>::type;
-                };
-
                 //#############################################################################
                 //! The fixed size array buf trait specialization.
                 //#############################################################################
@@ -219,7 +224,7 @@ namespace alpaka
                 template<
                     typename TFixedSizeArray>
                 struct GetPitchBytes<
-                    std::integral_constant<std::size_t, std::rank<TFixedSizeArray>::value - 1u>,
+                    dim::DimInt<std::rank<TFixedSizeArray>::value - 1u>,
                     TFixedSizeArray,
                     typename std::enable_if<
                         std::is_array<TFixedSizeArray>::value
@@ -344,6 +349,23 @@ namespace alpaka
             };
         }
     }
+    namespace elem
+    {
+        namespace traits
+        {
+            //#############################################################################
+            //! The std::array memory element type get trait specialization.
+            //#############################################################################
+            template<
+                typename TElem,
+                std::size_t Tsize>
+            struct ElemType<
+                std::array<TElem, Tsize>>
+            {
+                using type = TElem;
+            };
+        }
+    }
     namespace extent
     {
         namespace traits
@@ -355,7 +377,7 @@ namespace alpaka
                 typename TElem,
                 std::size_t Tsize>
             struct GetExtent<
-                std::integral_constant<std::size_t, 0u>,
+                dim::DimInt<0u>,
                 std::array<TElem, Tsize>>
             {
                 //-----------------------------------------------------------------------------
@@ -378,18 +400,6 @@ namespace alpaka
         {
             namespace traits
             {
-                //#############################################################################
-                //! The std::array memory element type get trait specialization.
-                //#############################################################################
-                template<
-                    typename TElem,
-                    std::size_t Tsize>
-                struct ElemType<
-                    std::array<TElem, Tsize>>
-                {
-                    using type = TElem;
-                };
-
                 //#############################################################################
                 //! The std::array buf trait specialization.
                 //#############################################################################
@@ -455,7 +465,7 @@ namespace alpaka
                     typename TElem,
                     std::size_t Tsize>
                 struct GetPitchBytes<
-                    std::integral_constant<std::size_t, 0u>,
+                    dim::DimInt<0u>,
                     std::array<TElem, Tsize>>
                 {
                     //-----------------------------------------------------------------------------
@@ -573,6 +583,23 @@ namespace alpaka
             };
         }
     }
+    namespace elem
+    {
+        namespace traits
+        {
+            //#############################################################################
+            //! The std::vector memory element type get trait specialization.
+            //#############################################################################
+            template<
+                typename TElem,
+                typename TAllocator>
+            struct ElemType<
+                std::vector<TElem, TAllocator>>
+            {
+                using type = TElem;
+            };
+        }
+    }
     namespace extent
     {
         namespace traits
@@ -584,7 +611,7 @@ namespace alpaka
                 typename TElem,
                 typename TAllocator>
             struct GetExtent<
-                std::integral_constant<std::size_t, 0u>,
+                dim::DimInt<0u>,
                 std::vector<TElem, TAllocator>>
             {
                 //-----------------------------------------------------------------------------
@@ -605,18 +632,6 @@ namespace alpaka
         {
             namespace traits
             {
-                //#############################################################################
-                //! The std::vector memory element type get trait specialization.
-                //#############################################################################
-                template<
-                    typename TElem,
-                    typename TAllocator>
-                struct ElemType<
-                    std::vector<TElem, TAllocator>>
-                {
-                    using type = TElem;
-                };
-
                 //#############################################################################
                 //! The std::vector buf trait specialization.
                 //#############################################################################
@@ -682,7 +697,7 @@ namespace alpaka
                     typename TElem,
                     typename TAllocator>
                 struct GetPitchBytes<
-                    std::integral_constant<std::size_t, 0u>,
+                    dim::DimInt<0u>,
                     std::vector<TElem, TAllocator>>
                 {
                     //-----------------------------------------------------------------------------
diff --git a/include/alpaka/mem/buf/Traits.hpp b/include/alpaka/mem/buf/Traits.hpp
index 9cbde10..fff347c 100644
--- a/include/alpaka/mem/buf/Traits.hpp
+++ b/include/alpaka/mem/buf/Traits.hpp
@@ -145,14 +145,15 @@ namespace alpaka
                     dev,
                     extents))
             {
-                return traits::Alloc<
-                    TElem,
-                    dim::Dim<TExtents>,
-                    TSize,
-                    TDev>
-                ::alloc(
-                    dev,
-                    extents);
+                return
+                    traits::Alloc<
+                        TElem,
+                        dim::Dim<TExtents>,
+                        TSize,
+                        TDev>
+                    ::alloc(
+                        dev,
+                        extents);
             }
             //-----------------------------------------------------------------------------
             //! Maps the buffer into the memory of the given device.
@@ -170,12 +171,13 @@ namespace alpaka
                 TDev const & dev)
             -> void
             {
-                return traits::Map<
-                    TBuf,
-                    TDev>
-                ::map(
-                    buf,
-                    dev);
+                return
+                    traits::Map<
+                        TBuf,
+                        TDev>
+                    ::map(
+                        buf,
+                        dev);
             }
             //-----------------------------------------------------------------------------
             //! Unmaps the buffer from the memory of the given device.
@@ -193,12 +195,13 @@ namespace alpaka
                 TDev const & dev)
             -> void
             {
-                return traits::Unmap<
-                    TBuf,
-                    TDev>
-                ::unmap(
-                    buf,
-                    dev);
+                return
+                    traits::Unmap<
+                        TBuf,
+                        TDev>
+                    ::unmap(
+                        buf,
+                        dev);
             }
             //-----------------------------------------------------------------------------
             //! Pins the buffer.
@@ -212,10 +215,11 @@ namespace alpaka
                 TBuf & buf)
             -> void
             {
-                return traits::Pin<
-                    TBuf>
-                ::pin(
-                    buf);
+                return
+                    traits::Pin<
+                        TBuf>
+                    ::pin(
+                        buf);
             }
             //-----------------------------------------------------------------------------
             //! Unpins the buffer.
@@ -229,10 +233,11 @@ namespace alpaka
                 TBuf & buf)
             -> void
             {
-                return traits::Unpin<
-                    TBuf>
-                ::unpin(
-                    buf);
+                return
+                    traits::Unpin<
+                        TBuf>
+                    ::unpin(
+                        buf);
             }
             //-----------------------------------------------------------------------------
             //! The pin state of the buffer.
@@ -246,10 +251,11 @@ namespace alpaka
                 TBuf const & buf)
             -> bool
             {
-                return traits::IsPinned<
-                    TBuf>
-                ::isPinned(
-                    buf);
+                return
+                    traits::IsPinned<
+                        TBuf>
+                    ::isPinned(
+                        buf);
             }
         }
     }
diff --git a/include/alpaka/mem/buf/cpu/Copy.hpp b/include/alpaka/mem/buf/cpu/Copy.hpp
index 3de046b..86f6b14 100644
--- a/include/alpaka/mem/buf/cpu/Copy.hpp
+++ b/include/alpaka/mem/buf/cpu/Copy.hpp
@@ -23,7 +23,7 @@
 
 #include <alpaka/dim/DimIntegralConst.hpp>  // dim::DimInt<N>
 #include <alpaka/extent/Traits.hpp>         // extent::getXXX
-#include <alpaka/mem/view/Traits.hpp>       // view::Copy, ...
+#include <alpaka/mem/view/Traits.hpp>       // mem::view::Copy, ...
 #include <alpaka/stream/StreamCpuAsync.hpp> // stream::StreamCpuAsync
 #include <alpaka/stream/StreamCpuSync.hpp>  // stream::StreamCpuSync
 
@@ -71,7 +71,7 @@ namespace alpaka
                             "The buffers and the extents are required to have the same dimensionality!");
                         // TODO: Maybe check for Size of TBufDst and TBufSrc to have greater or equal range than TExtents.
                         static_assert(
-                            std::is_same<mem::view::Elem<TBufDst>, typename std::remove_const<mem::view::Elem<TBufSrc>>::type>::value,
+                            std::is_same<elem::Elem<TBufDst>, typename std::remove_const<elem::Elem<TBufSrc>>::type>::value,
                             "The source and the destination buffers are required to have the same element type!");
 
                         //-----------------------------------------------------------------------------
@@ -82,7 +82,7 @@ namespace alpaka
                             TBufSrc const & bufSrc,
                             TExtents const & extents) :
                                 m_extentWidth(extent::getWidth(extents)),
-                                m_extentWidthBytes(static_cast<Size>(m_extentWidth * sizeof(mem::view::Elem<TBufDst>))),
+                                m_extentWidthBytes(static_cast<Size>(m_extentWidth * sizeof(elem::Elem<TBufDst>))),
                                 m_dstWidth(static_cast<Size>(extent::getWidth(bufDst))),
                                 m_srcWidth(static_cast<Size>(extent::getWidth(bufSrc))),
                                 m_dstBufWidth(static_cast<Size>(extent::getWidth(mem::view::getBuf(bufDst)))),
diff --git a/include/alpaka/mem/buf/cpu/Set.hpp b/include/alpaka/mem/buf/cpu/Set.hpp
index ec6f6d4..f504582 100644
--- a/include/alpaka/mem/buf/cpu/Set.hpp
+++ b/include/alpaka/mem/buf/cpu/Set.hpp
@@ -22,8 +22,8 @@
 #pragma once
 
 #include <alpaka/dim/DimIntegralConst.hpp>  // dim::DimInt<N>
-#include <alpaka/extent/Traits.hpp>         // view::getXXX
-#include <alpaka/mem/view/Traits.hpp>       // view::Set, ...
+#include <alpaka/extent/Traits.hpp>         // mem::view::getXXX
+#include <alpaka/mem/view/Traits.hpp>       // mem::view::Set, ...
 #include <alpaka/stream/StreamCpuAsync.hpp> // stream::StreamCpuAsync
 #include <alpaka/stream/StreamCpuSync.hpp>  // stream::StreamCpuSync
 
@@ -93,7 +93,7 @@ namespace alpaka
                             assert(extentHeight <= dstHeight);
                             assert(extentDepth <= dstDepth);
 
-                            auto const extentWidthBytes(extentWidth * sizeof(mem::view::Elem<TBuf>));
+                            auto const extentWidthBytes(extentWidth * sizeof(elem::Elem<TBuf>));
                             auto const dstPitchBytes(mem::view::getPitchBytes<dim::Dim<TBuf>::value - 1u>(m_buf));
                             assert(extentWidthBytes <= dstPitchBytes);
 
diff --git a/include/alpaka/mem/buf/cuda/Copy.hpp b/include/alpaka/mem/buf/cuda/Copy.hpp
index 28ec2c2..ea4ca95 100644
--- a/include/alpaka/mem/buf/cuda/Copy.hpp
+++ b/include/alpaka/mem/buf/cuda/Copy.hpp
@@ -24,8 +24,8 @@
 #include <alpaka/dev/DevCpu.hpp>                // dev::DevCpu
 #include <alpaka/dev/DevCudaRt.hpp>             // dev::DevCudaRt
 #include <alpaka/dim/DimIntegralConst.hpp>      // dim::DimInt<N>
-#include <alpaka/extent/Traits.hpp>             // view::getXXX
-#include <alpaka/mem/view/Traits.hpp>           // view::Copy
+#include <alpaka/extent/Traits.hpp>             // mem::view::getXXX
+#include <alpaka/mem/view/Traits.hpp>           // mem::view::Copy
 #include <alpaka/stream/StreamCudaRtAsync.hpp>  // stream::StreamCudaRtAsync
 #include <alpaka/stream/StreamCudaRtSync.hpp>   // stream::StreamCudaRtSync
 
@@ -74,7 +74,7 @@ namespace alpaka
                             "The destination buffer and the extents are required to have the same dimensionality!");
                         // TODO: Maybe check for Size of TBufDst and TBufSrc to have greater or equal range than TExtents.
                         static_assert(
-                            std::is_same<mem::view::Elem<TBufDst>, typename std::remove_const<mem::view::Elem<TBufSrc>>::type>::value,
+                            std::is_same<elem::Elem<TBufDst>, typename std::remove_const<elem::Elem<TBufSrc>>::type>::value,
                             "The source and the destination buffers are required to have the same element type!");
 
                         using Size = size::Size<TExtents>;
@@ -97,7 +97,7 @@ namespace alpaka
                                 m_dstWidth(static_cast<Size>(extent::getWidth(bufDst))),
                                 m_srcWidth(static_cast<Size>(extent::getWidth(bufSrc))),
 #endif
-                                m_extentWidthBytes(static_cast<Size>(extent::getWidth(extents) * sizeof(mem::view::Elem<TBufDst>))),
+                                m_extentWidthBytes(static_cast<Size>(extent::getWidth(extents) * sizeof(elem::Elem<TBufDst>))),
                                 m_dstMemNative(reinterpret_cast<void *>(mem::view::getPtrNative(bufDst))),
                                 m_srcMemNative(reinterpret_cast<void const *>(mem::view::getPtrNative(bufSrc)))
                         {
@@ -159,7 +159,7 @@ namespace alpaka
                             "The destination buffer and the extents are required to have the same dimensionality!");
                         // TODO: Maybe check for Size of TBufDst and TBufSrc to have greater or equal range than TExtents.
                         static_assert(
-                            std::is_same<mem::view::Elem<TBufDst>, typename std::remove_const<mem::view::Elem<TBufSrc>>::type>::value,
+                            std::is_same<elem::Elem<TBufDst>, typename std::remove_const<elem::Elem<TBufSrc>>::type>::value,
                             "The source and the destination buffers are required to have the same element type!");
 
                         using Size = size::Size<TExtents>;
@@ -180,7 +180,7 @@ namespace alpaka
 #if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                                 m_extentWidth(extent::getWidth(extents)),
 #endif
-                                m_extentWidthBytes(static_cast<Size>(extent::getWidth(extents) * sizeof(mem::view::Elem<TBufDst>))),
+                                m_extentWidthBytes(static_cast<Size>(extent::getWidth(extents) * sizeof(elem::Elem<TBufDst>))),
                                 m_dstWidth(static_cast<Size>(extent::getWidth(bufDst))),      // required for 3D peer copy
                                 m_srcWidth(static_cast<Size>(extent::getWidth(bufSrc))),      // required for 3D peer copy
 
@@ -270,7 +270,7 @@ namespace alpaka
                             "The destination buffer and the extents are required to have the same dimensionality!");
                         // TODO: Maybe check for Size of TBufDst and TBufSrc to have greater or equal range than TExtents.
                         static_assert(
-                            std::is_same<mem::view::Elem<TBufDst>, typename std::remove_const<mem::view::Elem<TBufSrc>>::type>::value,
+                            std::is_same<elem::Elem<TBufDst>, typename std::remove_const<elem::Elem<TBufSrc>>::type>::value,
                             "The source and the destination buffers are required to have the same element type!");
 
                         using Size = size::Size<TExtents>;
@@ -291,7 +291,7 @@ namespace alpaka
                                 m_iSrcDevice(iSrcDevice),
 
                                 m_extentWidth(extent::getWidth(extents)),
-                                m_extentWidthBytes(static_cast<Size>(m_extentWidth * sizeof(mem::view::Elem<TBufDst>))),
+                                m_extentWidthBytes(static_cast<Size>(m_extentWidth * sizeof(elem::Elem<TBufDst>))),
                                 m_dstWidth(static_cast<Size>(extent::getWidth(bufDst))),
                                 m_srcWidth(static_cast<Size>(extent::getWidth(bufSrc))),
 
@@ -378,7 +378,7 @@ namespace alpaka
             }
 
             //-----------------------------------------------------------------------------
-            // Trait specializations for view::TaskCopy.
+            // Trait specializations for TaskCopy.
             //-----------------------------------------------------------------------------
             namespace traits
             {
diff --git a/include/alpaka/mem/buf/cuda/Set.hpp b/include/alpaka/mem/buf/cuda/Set.hpp
index 2d91c43..2e929c9 100644
--- a/include/alpaka/mem/buf/cuda/Set.hpp
+++ b/include/alpaka/mem/buf/cuda/Set.hpp
@@ -1,6 +1,6 @@
 /**
 * \file
-* Copyright 2014-2015 Benjamin Worpitz
+* Copyright 2014-2015 Benjamin Worpitz, Rene Widera
 *
 * This file is part of alpaka.
 *
@@ -23,8 +23,8 @@
 
 #include <alpaka/dev/Traits.hpp>            // dev::getDev
 #include <alpaka/dim/DimIntegralConst.hpp>  // dim::DimInt<N>
-#include <alpaka/extent/Traits.hpp>         // view::getXXX
-#include <alpaka/mem/view/Traits.hpp>       // view::Set
+#include <alpaka/extent/Traits.hpp>         // mem::view::getXXX
+#include <alpaka/mem/view/Traits.hpp>       // mem::view::Set
 #include <alpaka/stream/Traits.hpp>         // stream::Enqueue
 
 #include <alpaka/core/Cuda.hpp>             // cudaMemset, ...
@@ -161,7 +161,7 @@ namespace alpaka
                     auto const & iDevice(task.m_iDevice);
 
                     auto const extentWidth(extent::getWidth(extents));
-                    auto const extentWidthBytes(extentWidth * sizeof(mem::view::Elem<TBuf>));
+                    auto const extentWidthBytes(extentWidth * sizeof(elem::Elem<TBuf>));
                     auto const dstWidth(extent::getWidth(buf));
                     auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(buf)));
                     assert(extentWidth <= dstWidth);
@@ -212,7 +212,7 @@ namespace alpaka
                     auto const & iDevice(task.m_iDevice);
 
                     auto const extentWidth(extent::getWidth(extents));
-                    auto const extentWidthBytes(extentWidth * sizeof(mem::view::Elem<TBuf>));
+                    auto const extentWidthBytes(extentWidth * sizeof(elem::Elem<TBuf>));
                     auto const dstWidth(extent::getWidth(buf));
                     auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(buf)));
                     assert(extentWidth <= dstWidth);
@@ -262,11 +262,11 @@ namespace alpaka
                     auto const & iDevice(task.m_iDevice);
 
                     auto const extentWidth(extent::getWidth(extents));
-                    auto const extentWidthBytes(extentWidth * sizeof(mem::view::Elem<TBuf>));
+                    auto const extentWidthBytes(extentWidth * sizeof(elem::Elem<TBuf>));
                     auto const extentHeight(extent::getHeight(extents));
                     auto const dstWidth(extent::getWidth(buf));
                     auto const dstHeight(extent::getHeight(buf));
-                    auto const dstPitchBytes(mem::view::getPitchBytes<std::integral_constant<std::size_t, dim::Dim<TBuf>::value - 1u>>(buf));
+                    auto const dstPitchBytes(mem::view::getPitchBytes<dim::Dim<TBuf>::value - 1u>(buf));
                     auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(buf)));
                     assert(extentWidth <= dstWidth);
                     assert(extentHeight <= dstHeight);
@@ -319,11 +319,11 @@ namespace alpaka
                     auto const & iDevice(task.m_iDevice);
 
                     auto const extentWidth(extent::getWidth(extents));
-                    auto const extentWidthBytes(extentWidth * sizeof(mem::view::Elem<TBuf>));
+                    auto const extentWidthBytes(extentWidth * sizeof(elem::Elem<TBuf>));
                     auto const extentHeight(extent::getHeight(extents));
                     auto const dstWidth(extent::getWidth(buf));
                     auto const dstHeight(extent::getHeight(buf));
-                    auto const dstPitchBytes(mem::view::getPitchBytes<std::integral_constant<std::size_t, dim::Dim<TBuf>::value - 1u>>(buf));
+                    auto const dstPitchBytes(mem::view::getPitchBytes<dim::Dim<TBuf>::value - 1u>(buf));
                     auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(buf)));
                     assert(extentWidth <= dstWidth);
                     assert(extentHeight <= dstHeight);
@@ -380,7 +380,7 @@ namespace alpaka
                     auto const dstWidth(extent::getWidth(buf));
                     auto const dstHeight(extent::getHeight(buf));
                     auto const dstDepth(extent::getDepth(buf));
-                    auto const dstPitchBytes(mem::view::getPitchBytes<std::integral_constant<std::size_t, dim::Dim<TBuf>::value - 1u>>(buf));
+                    auto const dstPitchBytes(mem::view::getPitchBytes<dim::Dim<TBuf>::value - 1u>(buf));
                     auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(buf)));
                     assert(extentWidth <= dstWidth);
                     assert(extentHeight <= dstHeight);
@@ -451,7 +451,7 @@ namespace alpaka
                     auto const dstWidth(extent::getWidth(buf));
                     auto const dstHeight(extent::getHeight(buf));
                     auto const dstDepth(extent::getDepth(buf));
-                    auto const dstPitchBytes(mem::view::getPitchBytes<std::integral_constant<std::size_t, dim::Dim<TBuf>::value - 1u>>(buf));
+                    auto const dstPitchBytes(mem::view::getPitchBytes<dim::Dim<TBuf>::value - 1u>(buf));
                     auto const dstNativePtr(reinterpret_cast<void *>(mem::view::getPtrNative(buf)));
                     assert(extentWidth <= dstWidth);
                     assert(extentHeight <= dstHeight);
diff --git a/include/alpaka/mem/view/Traits.hpp b/include/alpaka/mem/view/Traits.hpp
index 7332586..17220d3 100644
--- a/include/alpaka/mem/view/Traits.hpp
+++ b/include/alpaka/mem/view/Traits.hpp
@@ -21,8 +21,9 @@
 
 #pragma once
 
-#include <alpaka/dev/Traits.hpp>        // dev::traits::DevType, ...
-#include <alpaka/dim/Traits.hpp>        // dim::DimType
+#include <alpaka/dev/Traits.hpp>        // dev::Dev, ...
+#include <alpaka/dim/Traits.hpp>        // dim::Dim
+#include <alpaka/elem/Traits.hpp>       // elem::Elem
 #include <alpaka/extent/Traits.hpp>     // extent::GetExtent
 #include <alpaka/offset/Traits.hpp>     // offset::GetOffset
 #include <alpaka/stream/Traits.hpp>     // stream::enqueue
@@ -54,14 +55,6 @@ namespace alpaka
                     typename TSfinae = void>
                 struct ViewType;
 
-                //#############################################################################
-                //! The memory element type trait.
-                //#############################################################################
-                template<
-                    typename TView,
-                    typename TSfinae = void>
-                struct ElemType;
-
                 //#############################################################################
                 //! The native pointer get trait.
                 //#############################################################################
@@ -101,7 +94,7 @@ namespace alpaka
                         using IdxSequence = alpaka::core::detail::make_integer_sequence_offset<std::size_t, TIdx::value, dim::Dim<TView>::value - TIdx::value>;
                         return
                             extentsProd(view, IdxSequence())
-                            * sizeof(typename ElemType<TView>::type);
+                            * sizeof(typename elem::Elem<TView>);
                     }
                 private:
                     //-----------------------------------------------------------------------------
@@ -163,13 +156,6 @@ namespace alpaka
                 struct GetBuf;
             }
 
-            //#############################################################################
-            //! The memory element type trait alias template to remove the ::type.
-            //#############################################################################
-            template<
-                typename TView>
-            using Elem = typename std::remove_volatile<typename traits::ElemType<TView>::type>::type;
-
             //#############################################################################
             //! The memory buffer view type trait alias template to remove the ::type.
             //#############################################################################
@@ -190,12 +176,13 @@ namespace alpaka
                 typename TBuf>
             ALPAKA_FN_HOST auto getPtrNative(
                 TBuf const & buf)
-            -> Elem<TBuf> const *
+            -> elem::Elem<TBuf> const *
             {
-                return traits::GetPtrNative<
-                    TBuf>
-                ::getPtrNative(
-                    buf);
+                return
+                    traits::GetPtrNative<
+                        TBuf>
+                    ::getPtrNative(
+                        buf);
             }
             //-----------------------------------------------------------------------------
             //! Gets the native pointer of the memory buffer.
@@ -207,12 +194,13 @@ namespace alpaka
                 typename TBuf>
             ALPAKA_FN_HOST auto getPtrNative(
                 TBuf & buf)
-            -> Elem<TBuf> *
+            -> elem::Elem<TBuf> *
             {
-                return traits::GetPtrNative<
-                    TBuf>
-                ::getPtrNative(
-                    buf);
+                return
+                    traits::GetPtrNative<
+                        TBuf>
+                    ::getPtrNative(
+                        buf);
             }
 
             //-----------------------------------------------------------------------------
@@ -228,14 +216,15 @@ namespace alpaka
             ALPAKA_FN_HOST auto getPtrDev(
                 TBuf const & buf,
                 TDev const & dev)
-            -> Elem<TBuf> const *
+            -> elem::Elem<TBuf> const *
             {
-                return traits::GetPtrDev<
-                    TBuf,
-                    TDev>
-                ::getPtrDev(
-                    buf,
-                    dev);
+                return
+                    traits::GetPtrDev<
+                        TBuf,
+                        TDev>
+                    ::getPtrDev(
+                        buf,
+                        dev);
             }
             //-----------------------------------------------------------------------------
             //! Gets the pointer to the buffer on the given device.
@@ -250,14 +239,15 @@ namespace alpaka
             ALPAKA_FN_HOST auto getPtrDev(
                 TBuf & buf,
                 TDev const & dev)
-            -> Elem<TBuf> *
+            -> elem::Elem<TBuf> *
             {
-                return traits::GetPtrDev<
-                    TBuf,
-                    TDev>
-                ::getPtrDev(
-                    buf,
-                    dev);
+                return
+                    traits::GetPtrDev<
+                        TBuf,
+                        TDev>
+                    ::getPtrDev(
+                        buf,
+                        dev);
             }
 
             //-----------------------------------------------------------------------------
@@ -272,7 +262,7 @@ namespace alpaka
             {
                 return
                     traits::GetPitchBytes<
-                        std::integral_constant<std::size_t, Tidx>,
+                        dim::DimInt<Tidx>,
                         TView>
                     ::getPitchBytes(
                         buf);
@@ -374,7 +364,7 @@ namespace alpaka
                     dim::Dim<TBufDst>::value == dim::Dim<TExtents>::value,
                     "The destination buffer and the extents are required to have the same dimensionality!");
                 static_assert(
-                    std::is_same<Elem<TBufDst>, typename std::remove_const<Elem<TBufSrc>>::type>::value,
+                    std::is_same<elem::Elem<TBufDst>, typename std::remove_const<elem::Elem<TBufSrc>>::type>::value,
                     "The source and the destination buffers are required to have the same element type!");
 
                 return
@@ -578,7 +568,7 @@ namespace alpaka
                 {
                     ALPAKA_FN_HOST static auto print(
                         TView const & view,
-                        Elem<TView> const * const ptr,
+                        elem::Elem<TView> const * const ptr,
                         Vec<dim::Dim<TView>, size::Size<TView>> const & extents,
                         std::ostream & os,
                         std::string const & elementSeparator,
@@ -598,7 +588,7 @@ namespace alpaka
                                 TView>
                             ::print(
                                 view,
-                                reinterpret_cast<Elem<TView> const *>(reinterpret_cast<std::uint8_t const *>(ptr)+i*pitch),
+                                reinterpret_cast<elem::Elem<TView> const *>(reinterpret_cast<std::uint8_t const *>(ptr)+i*pitch),
                                 extents,
                                 os,
                                 elementSeparator,
@@ -627,7 +617,7 @@ namespace alpaka
                 {
                     ALPAKA_FN_HOST static auto print(
                         TView const & view,
-                        Elem<TView> const * const ptr,
+                        elem::Elem<TView> const * const ptr,
                         Vec<dim::Dim<TView>, size::Size<TView>> const & extents,
                         std::ostream & os,
                         std::string const & elementSeparator,
@@ -676,7 +666,7 @@ namespace alpaka
                     TView>
                 ::print(
                     view,
-                    view::getPtrNative(view),
+                    mem::view::getPtrNative(view),
                     extent::getExtentsVec(view),
                     os,
                     elementSeparator,
diff --git a/include/alpaka/mem/view/ViewBasic.hpp b/include/alpaka/mem/view/ViewBasic.hpp
index a1822d9..bcce45c 100644
--- a/include/alpaka/mem/view/ViewBasic.hpp
+++ b/include/alpaka/mem/view/ViewBasic.hpp
@@ -23,7 +23,7 @@
 
 #include <alpaka/dim/Traits.hpp>                    // Dim
 #include <alpaka/dev/Traits.hpp>                    // Dev
-#include <alpaka/extent/Traits.hpp>                 // view::getXXX
+#include <alpaka/extent/Traits.hpp>                 // mem::view::getXXX
 #include <alpaka/mem/view/Traits.hpp>
 #include <alpaka/offset/Traits.hpp>                 // traits::getOffsetX
 #include <alpaka/size/Traits.hpp>                   // size::traits::SizeType
@@ -68,7 +68,7 @@ namespace alpaka
                 using Dev = TDev;
                 using Elem = TElem;
                 using Dim = TDim;
-                using Buf = buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize>;
+                using Buf = mem::buf::BufPlainPtrWrapper<TDev, TElem, TDim, TSize>;
                 // If the value type is const, we store a const buffer.
                 //using BufC = detail::MimicConst<TElem, Buf>;
 
@@ -88,7 +88,9 @@ namespace alpaka
                             mem::view::getPitchBytes<TDim::value - 1u>(buf)),
                         m_vOffsetsElements(offset::getOffsetsVecEnd<TDim>(buf)),
                         m_extentsElements(extent::getExtentsVecEnd<TDim>(buf))
-                {}
+                {
+                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
+                }
                 //-----------------------------------------------------------------------------
                 //! Constructor.
                 //! \param buf This can be either a memory buffer or a memory view.
@@ -105,6 +107,8 @@ namespace alpaka
                         m_vOffsetsElements(offset::getOffsetsVecEnd<TDim>(buf)),
                         m_extentsElements(extent::getExtentsVecEnd<TDim>(buf))
                 {
+                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
                     static_assert(
                         std::is_same<TSize, size::Size<TBuf>>::value,
                         "The size type of TBuf and the TSize template parameter have to be identical!");
@@ -132,6 +136,8 @@ namespace alpaka
                         m_extentsElements(extent::getExtentsVecEnd<TDim>(extentsElements)),
                         m_vOffsetsElements(offset::getOffsetsVecEnd<TDim>(relativeOffsetsElements) + offset::getOffsetsVecEnd<TDim>(buf))
                 {
+                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
                     static_assert(
                         std::is_same<TDim, dim::Dim<TExtents>>::value,
                         "The buffer and the extents are required to have the same dimensionality!");
@@ -171,6 +177,8 @@ namespace alpaka
                         m_extentsElements(extent::getExtentsVecEnd<TDim>(extentsElements)),
                         m_vOffsetsElements(offset::getOffsetsVecEnd<TDim>(relativeOffsetsElements) + offset::getOffsetsVecEnd<TDim>(buf))
                 {
+                    ALPAKA_DEBUG_FULL_LOG_SCOPE;
+
                     static_assert(
                         std::is_same<TDim, dim::Dim<TExtents>>::value,
                         "The buffer and the extents are required to have the same dimensionality!");
@@ -262,6 +270,25 @@ namespace alpaka
             };
         }
     }
+    namespace elem
+    {
+        namespace traits
+        {
+            //#############################################################################
+            //! The ViewBasic memory element type get trait specialization.
+            //#############################################################################
+            template<
+                typename TElem,
+                typename TDim,
+                typename TDev,
+                typename TSize>
+            struct ElemType<
+                mem::view::ViewBasic<TDev, TElem, TDim, TSize>>
+            {
+                using type = TElem;
+            };
+        }
+    }
     namespace extent
     {
         namespace traits
@@ -298,20 +325,6 @@ namespace alpaka
         {
             namespace traits
             {
-                //#############################################################################
-                //! The ViewBasic memory element type get trait specialization.
-                //#############################################################################
-                template<
-                    typename TElem,
-                    typename TDim,
-                    typename TDev,
-                    typename TSize>
-                struct ElemType<
-                    mem::view::ViewBasic<TDev, TElem, TDim, TSize>>
-                {
-                    using type = TElem;
-                };
-
                 //#############################################################################
                 //! The memory buffer view creation type trait.
                 //#############################################################################
@@ -508,7 +521,7 @@ namespace alpaka
                     {
                         return
                             offset::getOffset<Tidx>(view)
-                            * view::getPitchBytes<Tidx + 1u>(buf);
+                            * mem::view::getPitchBytes<Tidx + 1u>(buf);
                     }
                 };
 
@@ -533,7 +546,7 @@ namespace alpaka
                     -> TSize
                     {
                         return
-                            view::getPitchBytes<TIdx::value>(
+                            mem::view::getPitchBytes<TIdx::value>(
                                 mem::view::getBuf(view));
                     }
                 };
diff --git a/include/alpaka/offset/Traits.hpp b/include/alpaka/offset/Traits.hpp
index f316cbc..ef04c72 100644
--- a/include/alpaka/offset/Traits.hpp
+++ b/include/alpaka/offset/Traits.hpp
@@ -83,7 +83,7 @@ namespace alpaka
         {
             return
                 traits::GetOffset<
-                    std::integral_constant<std::size_t, Tidx>,
+                    dim::DimInt<Tidx>,
                     TOffsets>
                 ::getOffset(
                     offsets);
@@ -139,7 +139,7 @@ namespace alpaka
         -> void
         {
             traits::SetOffset<
-                std::integral_constant<std::size_t, Tidx>,
+                dim::DimInt<Tidx>,
                 TOffsets,
                 TOffset>
             ::setOffset(
@@ -200,7 +200,7 @@ namespace alpaka
             template<
                 typename TOffsets>
             struct GetOffset<
-                std::integral_constant<std::size_t, 0u>,
+                dim::DimInt<0u>,
                 TOffsets,
                 typename std::enable_if<
                     std::is_integral<TOffsets>::value>::type>
@@ -220,7 +220,7 @@ namespace alpaka
                 typename TOffsets,
                 typename TOffset>
             struct SetOffset<
-                std::integral_constant<std::size_t, 0u>,
+                dim::DimInt<0u>,
                 TOffsets,
                 TOffset,
                 typename std::enable_if<
diff --git a/include/alpaka/rand/RandCuRand.hpp b/include/alpaka/rand/RandCuRand.hpp
index b567125..32ba6b2 100644
--- a/include/alpaka/rand/RandCuRand.hpp
+++ b/include/alpaka/rand/RandCuRand.hpp
@@ -36,6 +36,15 @@ namespace alpaka
 {
     namespace rand
     {
+        //#############################################################################
+        //! The CUDA rand implementation.
+        //#############################################################################
+        class RandCuRand
+        {
+        public:
+            using RandBase = RandCuRand;
+        };
+
         namespace generator
         {
             namespace cuda
@@ -237,22 +246,18 @@ namespace alpaka
                 //! The CUDA random number float normal distribution get trait specialization.
                 //#############################################################################
                 template<
-                    typename TAcc,
                     typename T>
                 struct CreateNormalReal<
-                    TAcc,
+                    RandCuRand,
                     T,
                     typename std::enable_if<
-                        std::is_same<
-                            dev::Dev<TAcc>,
-                            dev::DevCudaRt>::value
-                        && std::is_floating_point<T>::value>::type>
+                        std::is_floating_point<T>::value>::type>
                 {
                     //-----------------------------------------------------------------------------
                     //
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_ACC_CUDA_ONLY static auto createNormalReal(
-                        TAcc const & acc)
+                        RandCuRand const & /*rand*/)
                     -> rand::distribution::cuda::NormalReal<T>
                     {
                         return rand::distribution::cuda::NormalReal<T>();
@@ -262,22 +267,18 @@ namespace alpaka
                 //! The CUDA random number float uniform distribution get trait specialization.
                 //#############################################################################
                 template<
-                    typename TAcc,
                     typename T>
                 struct CreateUniformReal<
-                    TAcc,
+                    RandCuRand,
                     T,
                     typename std::enable_if<
-                        std::is_same<
-                            dev::Dev<TAcc>,
-                            dev::DevCudaRt>::value
-                        && std::is_floating_point<T>::value>::type>
+                        std::is_floating_point<T>::value>::type>
                 {
                     //-----------------------------------------------------------------------------
                     //
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_ACC_CUDA_ONLY static auto createUniformReal(
-                        TAcc const & acc)
+                        RandCuRand const & /*rand*/)
                     -> rand::distribution::cuda::UniformReal<T>
                     {
                         return rand::distribution::cuda::UniformReal<T>();
@@ -287,22 +288,18 @@ namespace alpaka
                 //! The CUDA random number integer uniform distribution get trait specialization.
                 //#############################################################################
                 template<
-                    typename TAcc,
                     typename T>
                 struct CreateUniformUint<
-                    TAcc,
+                    RandCuRand,
                     T,
                     typename std::enable_if<
-                        std::is_same<
-                            dev::Dev<TAcc>,
-                            dev::DevCudaRt>::value
-                        && std::is_integral<T>::value>::type>
+                        std::is_integral<T>::value>::type>
                 {
                     //-----------------------------------------------------------------------------
                     //
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_ACC_CUDA_ONLY static auto createUniformUint(
-                        TAcc const & acc)
+                        RandCuRand const & /*rand*/)
                     -> rand::distribution::cuda::UniformUint<T>
                     {
                         return rand::distribution::cuda::UniformUint<T>();
@@ -317,20 +314,15 @@ namespace alpaka
                 //#############################################################################
                 //! The CUDA random number default generator get trait specialization.
                 //#############################################################################
-                template<
-                    typename TAcc>
+                template<>
                 struct CreateDefault<
-                    TAcc,
-                    typename std::enable_if<
-                        std::is_same<
-                            dev::Dev<TAcc>,
-                            dev::DevCudaRt>::value>::type>
+                    RandCuRand>
                 {
                     //-----------------------------------------------------------------------------
                     //
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_ACC_CUDA_ONLY static auto createDefault(
-                        TAcc const & acc,
+                        RandCuRand const & /*rand*/,
                         std::uint32_t const & seed,
                         std::uint32_t const & subsequence)
                     -> rand::generator::cuda::Xor
diff --git a/include/alpaka/rand/RandStl.hpp b/include/alpaka/rand/RandStl.hpp
index 922406d..e6393bd 100644
--- a/include/alpaka/rand/RandStl.hpp
+++ b/include/alpaka/rand/RandStl.hpp
@@ -25,6 +25,8 @@
 
 #include <alpaka/core/Common.hpp>       // ALPAKA_FN_HOST_ACC
 
+#include <boost/core/ignore_unused.hpp> // boost::ignore_unused
+
 #include <random>                       // std::mt19937, std::uniform_real_distribution, ...
 #include <type_traits>                  // std::enable_if
 
@@ -32,6 +34,15 @@ namespace alpaka
 {
     namespace rand
     {
+        //#############################################################################
+        //! The standard library rand implementation.
+        //#############################################################################
+        class RandStl
+        {
+        public:
+            using RandBase = RandStl;
+        };
+
         namespace distribution
         {
             namespace traits
@@ -40,24 +51,21 @@ namespace alpaka
                 //! The CPU device random number float normal distribution get trait specialization.
                 //#############################################################################
                 template<
-                    typename TAcc,
                     typename T>
                 struct CreateNormalReal<
-                    TAcc,
+                    RandStl,
                     T,
                     typename std::enable_if<
-                        std::is_same<
-                            dev::Dev<TAcc>,
-                            dev::DevCpu>::value
-                        && std::is_floating_point<T>::value>::type>
+                        std::is_floating_point<T>::value>::type>
                 {
                     //-----------------------------------------------------------------------------
                     //
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_ACC_NO_CUDA static auto createNormalReal(
-                        TAcc const & acc)
+                        RandStl const & rand)
                     -> std::normal_distribution<T>
                     {
+                        boost::ignore_unused(rand);
                         return std::normal_distribution<T>();
                     }
                 };
@@ -65,24 +73,21 @@ namespace alpaka
                 //! The CPU device random number float uniform distribution get trait specialization.
                 //#############################################################################
                 template<
-                    typename TAcc,
                     typename T>
                 struct CreateUniformReal<
-                    TAcc,
+                    RandStl,
                     T,
                     typename std::enable_if<
-                        std::is_same<
-                            dev::Dev<TAcc>,
-                            dev::DevCpu>::value
-                        && std::is_floating_point<T>::value>::type>
+                        std::is_floating_point<T>::value>::type>
                 {
                     //-----------------------------------------------------------------------------
                     //
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_ACC_NO_CUDA static auto createUniformReal(
-                        TAcc const & acc)
+                        RandStl const & rand)
                     -> std::uniform_real_distribution<T>
                     {
+                        boost::ignore_unused(rand);
                         return std::uniform_real_distribution<T>();
                     }
                 };
@@ -90,24 +95,21 @@ namespace alpaka
                 //! The CPU device random number integer uniform distribution get trait specialization.
                 //#############################################################################
                 template<
-                    typename TAcc,
                     typename T>
                 struct CreateUniformUint<
-                    TAcc,
+                    RandStl,
                     T,
                     typename std::enable_if<
-                        std::is_same<
-                            dev::Dev<TAcc>,
-                            dev::DevCpu>::value
-                        && std::is_integral<T>::value>::type>
+                        std::is_integral<T>::value>::type>
                 {
                     //-----------------------------------------------------------------------------
                     //
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_ACC_NO_CUDA static auto createUniformUint(
-                        TAcc const & acc)
+                        RandStl const & rand)
                     -> std::uniform_int_distribution<T>
                     {
+                        boost::ignore_unused(rand);
                         return std::uniform_int_distribution<T>(
                             0,  // For signed integer: std::numeric_limits<T>::lowest()
                             std::numeric_limits<T>::max());
@@ -122,24 +124,20 @@ namespace alpaka
                 //#############################################################################
                 //! The CPU device random number default generator get trait specialization.
                 //#############################################################################
-                template<
-                    typename TAcc>
+                template<>
                 struct CreateDefault<
-                    TAcc,
-                    typename std::enable_if<
-                        std::is_same<
-                            dev::Dev<TAcc>,
-                            dev::DevCpu>::value>::type>
+                    RandStl>
                 {
                     //-----------------------------------------------------------------------------
                     //
                     //-----------------------------------------------------------------------------
                     ALPAKA_FN_ACC_NO_CUDA static auto createDefault(
-                        TAcc const & acc,
+                        RandStl const & rand,
                         std::uint32_t const & seed,
                         std::uint32_t const & subsequence)
                     -> std::mt19937
                     {
+                        boost::ignore_unused(rand);
                         // NOTE: XOR the seed and the subsequence to generate a unique seed.
                         return std::mt19937(seed ^ subsequence);
                     }
diff --git a/include/alpaka/rand/Traits.hpp b/include/alpaka/rand/Traits.hpp
index a9a6025..7e91282 100644
--- a/include/alpaka/rand/Traits.hpp
+++ b/include/alpaka/rand/Traits.hpp
@@ -47,7 +47,7 @@ namespace alpaka
                 //! The random number float normal distribution get trait.
                 //#############################################################################
                 template<
-                    typename TAcc,
+                    typename TRand,
                     typename T,
                     typename TSfinae = void>
                 struct CreateNormalReal;
@@ -56,7 +56,7 @@ namespace alpaka
                 //! The random number float uniform distribution get trait.
                 //#############################################################################
                 template<
-                    typename TAcc,
+                    typename TRand,
                     typename T,
                     typename TSfinae = void>
                 struct CreateUniformReal;
@@ -65,7 +65,7 @@ namespace alpaka
                 //! The random number integer uniform distribution get trait.
                 //#############################################################################
                 template<
-                    typename TAcc,
+                    typename TRand,
                     typename T,
                     typename TSfinae = void>
                 struct CreateUniformUint;
@@ -77,25 +77,26 @@ namespace alpaka
             ALPAKA_NO_HOST_ACC_WARNING
             template<
                 typename T,
-                typename TAcc>
+                typename TRand>
             ALPAKA_FN_HOST_ACC auto createNormalReal(
-                TAcc const & acc)
+                TRand const & rand)
             -> decltype(
                 traits::CreateNormalReal<
-                    TAcc,
+                    TRand,
                     T>
                 ::createNormalReal(
-                    std::declval<TAcc const &>()))
+                    std::declval<TRand const &>()))
             {
                 static_assert(
                     std::is_floating_point<T>::value,
                     "The value type T has to be a floating point type!");
 
-                return traits::CreateNormalReal<
-                    TAcc,
-                    T>
-                ::createNormalReal(
-                    acc);
+                return
+                    traits::CreateNormalReal<
+                        TRand,
+                        T>
+                    ::createNormalReal(
+                        rand);
             }
             //-----------------------------------------------------------------------------
             //! \return A uniform floating point distribution [0.0, 1.0).
@@ -103,25 +104,26 @@ namespace alpaka
             ALPAKA_NO_HOST_ACC_WARNING
             template<
                 typename T,
-                typename TAcc>
+                typename TRand>
             ALPAKA_FN_HOST_ACC auto createUniformReal(
-                TAcc const & acc)
+                TRand const & rand)
             -> decltype(
                 traits::CreateUniformReal<
-                    TAcc,
+                    TRand,
                     T>
                 ::createUniformReal(
-                    std::declval<TAcc const &>()))
+                    std::declval<TRand const &>()))
             {
                 static_assert(
                     std::is_floating_point<T>::value,
                     "The value type T has to be a floating point type!");
 
-                return traits::CreateUniformReal<
-                    TAcc,
-                    T>
-                ::createUniformReal(
-                    acc);
+                return
+                    traits::CreateUniformReal<
+                        TRand,
+                        T>
+                    ::createUniformReal(
+                        rand);
             }
             //-----------------------------------------------------------------------------
             //! \return A uniform integer distribution [0, UINT_MAX].
@@ -129,25 +131,116 @@ namespace alpaka
             ALPAKA_NO_HOST_ACC_WARNING
             template<
                 typename T,
-                typename TAcc>
+                typename TRand>
             ALPAKA_FN_HOST_ACC auto createUniformUint(
-                TAcc const & acc)
+                TRand const & rand)
             -> decltype(
                 traits::CreateUniformUint<
-                    TAcc,
+                    TRand,
                     T>
                 ::createUniformUint(
-                    std::declval<TAcc const &>()))
+                    std::declval<TRand const &>()))
             {
                 static_assert(
                     std::is_integral<T>::value && std::is_unsigned<T>::value,
                     "The value type T has to be a unsigned integral type!");
 
-                return traits::CreateUniformUint<
-                    TAcc,
-                    T>
-                ::createUniformUint(
-                    acc);
+                return
+                    traits::CreateUniformUint<
+                        TRand,
+                        T>
+                    ::createUniformUint(
+                        rand);
+            }
+            namespace traits
+            {
+                //#############################################################################
+                //! The CreateNormalReal specialization for classes with RandBase member type.
+                //#############################################################################
+                template<
+                    typename TRand,
+                    typename T>
+                struct CreateNormalReal<
+                    TRand,
+                    T,
+                    typename std::enable_if<
+                        std::is_base_of<typename TRand::RandBase, typename std::decay<TRand>::type>::value
+                        && (!std::is_same<typename TRand::RandBase, typename std::decay<TRand>::type>::value)>::type>
+                {
+                    //-----------------------------------------------------------------------------
+                    //
+                    //-----------------------------------------------------------------------------
+                    ALPAKA_NO_HOST_ACC_WARNING
+                    ALPAKA_FN_HOST_ACC static auto createNormalReal(
+                        TRand const & rand)
+                    -> decltype(
+                        rand::distribution::createNormalReal<T>(
+                            static_cast<typename TRand::RandBase const &>(rand)))
+                    {
+                        // Delegate the call to the base class.
+                        return
+                            rand::distribution::createNormalReal<T>(
+                                static_cast<typename TRand::RandBase const &>(rand));
+                    }
+                };
+                //#############################################################################
+                //! The CreateUniformReal specialization for classes with RandBase member type.
+                //#############################################################################
+                template<
+                    typename TRand,
+                    typename T>
+                struct CreateUniformReal<
+                    TRand,
+                    T,
+                    typename std::enable_if<
+                        std::is_base_of<typename TRand::RandBase, typename std::decay<TRand>::type>::value
+                        && (!std::is_same<typename TRand::RandBase, typename std::decay<TRand>::type>::value)>::type>
+                {
+                    //-----------------------------------------------------------------------------
+                    //
+                    //-----------------------------------------------------------------------------
+                    ALPAKA_NO_HOST_ACC_WARNING
+                    ALPAKA_FN_HOST_ACC static auto createUniformReal(
+                        TRand const & rand)
+                    -> decltype(
+                        rand::distribution::createUniformReal<T>(
+                            static_cast<typename TRand::RandBase const &>(rand)))
+                    {
+                        // Delegate the call to the base class.
+                        return
+                            rand::distribution::createUniformReal<T>(
+                                static_cast<typename TRand::RandBase const &>(rand));
+                    }
+                };
+                //#############################################################################
+                //! The CreateUniformUint specialization for classes with RandBase member type.
+                //#############################################################################
+                template<
+                    typename TRand,
+                    typename T>
+                struct CreateUniformUint<
+                    TRand,
+                    T,
+                    typename std::enable_if<
+                        std::is_base_of<typename TRand::RandBase, typename std::decay<TRand>::type>::value
+                        && (!std::is_same<typename TRand::RandBase, typename std::decay<TRand>::type>::value)>::type>
+                {
+                    //-----------------------------------------------------------------------------
+                    //
+                    //-----------------------------------------------------------------------------
+                    ALPAKA_NO_HOST_ACC_WARNING
+                    ALPAKA_FN_HOST_ACC static auto createUniformUint(
+                        TRand const & rand)
+                    -> decltype(
+                        rand::distribution::createUniformUint<T>(
+                            static_cast<typename TRand::RandBase const &>(rand)))
+                    {
+                        // Delegate the call to the base class.
+                        return
+                            rand::distribution::createUniformUint<T>(
+                                static_cast<typename TRand::RandBase const &>(rand));
+                    }
+                };
             }
         }
         //-----------------------------------------------------------------------------
@@ -164,7 +257,7 @@ namespace alpaka
                 //! The random number default generator get trait.
                 //#############################################################################
                 template<
-                    typename TAcc,
+                    typename TRand,
                     typename TSfinae = void>
                 struct CreateDefault;
             }
@@ -173,25 +266,62 @@ namespace alpaka
             //-----------------------------------------------------------------------------
             ALPAKA_NO_HOST_ACC_WARNING
             template<
-                typename TAcc>
+                typename TRand>
             ALPAKA_FN_HOST_ACC auto createDefault(
-                TAcc const & acc,
+                TRand const & rand,
                 std::uint32_t const & seed,
                 std::uint32_t const & subsequence)
             -> decltype(
                 traits::CreateDefault<
-                    TAcc>
+                    TRand>
                 ::createDefault(
-                    std::declval<TAcc const &>(),
+                    std::declval<TRand const &>(),
                     std::declval<std::uint32_t const &>(),
                     std::declval<std::uint32_t const &>()))
             {
-                return traits::CreateDefault<
-                    TAcc>
-                ::createDefault(
-                    acc,
-                    seed,
-                    subsequence);
+                return
+                    traits::CreateDefault<
+                        TRand>
+                    ::createDefault(
+                        rand,
+                        seed,
+                        subsequence);
+            }
+            namespace traits
+            {
+                //#############################################################################
+                //! The CreateDefault specialization for classes with RandBase member type.
+                //#############################################################################
+                template<
+                    typename TRand>
+                struct CreateDefault<
+                    TRand,
+                    typename std::enable_if<
+                        std::is_base_of<typename TRand::RandBase, typename std::decay<TRand>::type>::value
+                        && (!std::is_same<typename TRand::RandBase, typename std::decay<TRand>::type>::value)>::type>
+                {
+                    //-----------------------------------------------------------------------------
+                    //
+                    //-----------------------------------------------------------------------------
+                    ALPAKA_NO_HOST_ACC_WARNING
+                    ALPAKA_FN_HOST_ACC static auto createDefault(
+                        TRand const & rand,
+                        std::uint32_t const & seed,
+                        std::uint32_t const & subsequence)
+                    -> decltype(
+                        rand::generator::createDefault(
+                            static_cast<typename TRand::RandBase const &>(rand),
+                            seed,
+                            subsequence))
+                    {
+                        // Delegate the call to the base class.
+                        return
+                            rand::generator::createDefault(
+                                static_cast<typename TRand::RandBase const &>(rand),
+                                seed,
+                                subsequence);
+                    }
+                };
             }
         }
     }
diff --git a/include/alpaka/size/Traits.hpp b/include/alpaka/size/Traits.hpp
index 950d8f8..954efab 100644
--- a/include/alpaka/size/Traits.hpp
+++ b/include/alpaka/size/Traits.hpp
@@ -21,7 +21,7 @@
 
 #pragma once
 
-#include <type_traits>                      // std::enable_if, std::is_arithmetic
+#include <type_traits>  // std::enable_if, std::is_arithmetic
 
 namespace alpaka
 {
diff --git a/include/alpaka/stream/StreamCpuAsync.hpp b/include/alpaka/stream/StreamCpuAsync.hpp
index 52d6e82..e8ad0b3 100644
--- a/include/alpaka/stream/StreamCpuAsync.hpp
+++ b/include/alpaka/stream/StreamCpuAsync.hpp
@@ -223,6 +223,7 @@ namespace alpaka
         {
             //#############################################################################
             //! The CPU async device stream enqueue trait specialization.
+            //! This default implementation for all tasks directly invokes the function call operator of the task.
             //#############################################################################
             template<
                 typename TTask>
diff --git a/include/alpaka/stream/StreamCpuSync.hpp b/include/alpaka/stream/StreamCpuSync.hpp
index 93b88d1..bee87b8 100644
--- a/include/alpaka/stream/StreamCpuSync.hpp
+++ b/include/alpaka/stream/StreamCpuSync.hpp
@@ -198,6 +198,7 @@ namespace alpaka
         {
             //#############################################################################
             //! The CPU sync device stream enqueue trait specialization.
+            //! This default implementation for all tasks directly invokes the function call operator of the task.
             //#############################################################################
             template<
                 typename TTask>
diff --git a/include/alpaka/vec/Vec.hpp b/include/alpaka/vec/Vec.hpp
index db6f763..9cec6a6 100644
--- a/include/alpaka/vec/Vec.hpp
+++ b/include/alpaka/vec/Vec.hpp
@@ -42,6 +42,12 @@
 #include <type_traits>                      // std::enable_if
 #include <algorithm>                        // std::min, std::max, std::min_element, std::max_element
 
+#define ALPAKA_CREATE_VEC_IN_CLASS
+// The nvcc compiler does not support the out of class version.
+#ifdef __CUDACC__
+    #define ALPAKA_CREATE_VEC_IN_CLASS
+#endif
+
 namespace alpaka
 {
     template<
@@ -49,7 +55,7 @@ namespace alpaka
         typename TSize>
     class Vec;
 
-#ifndef __CUDACC__
+#ifndef ALPAKA_CREATE_VEC_IN_CLASS
     //-----------------------------------------------------------------------------
     //! Single value constructor helper.
     //-----------------------------------------------------------------------------
@@ -171,10 +177,10 @@ namespace alpaka
         ALPAKA_FN_HOST_ACC Vec(
             TArg0 && arg0,
             TArgs && ... args) :
-                m_auiData{std::forward<TArg0>(arg0), std::forward<TArgs>(args)...}
+                m_data{std::forward<TArg0>(arg0), std::forward<TArgs>(args)...}
         {}
 
-#ifdef __CUDACC__
+#ifdef ALPAKA_CREATE_VEC_IN_CLASS
         //-----------------------------------------------------------------------------
         //! Creator using func<idx>(args...) to initialize all values of the vector.
         //-----------------------------------------------------------------------------
@@ -294,7 +300,7 @@ namespace alpaka
         {
             return
                 createVecFromIndexedFn<
-#ifndef __CUDACC__
+#ifndef ALPAKA_CREATE_VEC_IN_CLASS
                     TDim,
 #endif
                     CreateSingleVal>(
@@ -335,7 +341,7 @@ namespace alpaka
             core::assertValueUnsigned(iIdx);
             auto const idx(static_cast<typename TDim::value_type>(iIdx));
             assert(idx<TDim::value);
-            return m_auiData[idx];
+            return m_data[idx];
         }
 
         //-----------------------------------------------------------------------------
@@ -354,7 +360,7 @@ namespace alpaka
             core::assertValueUnsigned(iIdx);
             auto const idx(static_cast<typename TDim::value_type>(iIdx));
             assert(idx<TDim::value);
-            return m_auiData[idx];
+            return m_data[idx];
         }
 
         //-----------------------------------------------------------------------------
@@ -416,7 +422,7 @@ namespace alpaka
         ALPAKA_FN_HOST_ACC auto foldrAll(
             TFnObj const & f) const
         -> decltype(
-#if (BOOST_COMP_GNUC) && (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0))
+#if (BOOST_COMP_GNUC && (BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(5, 0, 0))) || __INTEL_COMPILER
             this->foldrByIndices(
 #else
             foldrByIndices(
@@ -483,10 +489,10 @@ namespace alpaka
             return
                 static_cast<typename TDim::value_type>(
                     std::distance(
-                        std::begin(m_auiData),
+                        std::begin(m_data),
                         std::min_element(
-                            std::begin(m_auiData),
-                            std::end(m_auiData))));
+                            std::begin(m_data),
+                            std::end(m_data))));
         }
         //-----------------------------------------------------------------------------
         //! \return The index of the maximal element.
@@ -498,32 +504,17 @@ namespace alpaka
             return
                 static_cast<typename TDim::value_type>(
                     std::distance(
-                        std::begin(m_auiData),
+                        std::begin(m_data),
                         std::max_element(
-                            std::begin(m_auiData),
-                            std::end(m_auiData))));
+                            std::begin(m_data),
+                            std::end(m_data))));
         }
 
     private:
         // 16 Byte alignment for usage inside of CUDA kernels.
-        alignas(16u) TSize m_auiData[TDim::value];
+        alignas(16u) TSize m_data[TDim::value];
     };
 
-    template<
-        typename TSize>
-    using Vec1 = Vec<dim::DimInt<1u>, TSize>;
-
-    template<
-        typename TSize>
-    using Vec2 = Vec<dim::DimInt<2u>, TSize>;
-
-    template<
-        typename TSize>
-    using Vec3 = Vec<dim::DimInt<3u>, TSize>;
-
-    template<
-        typename TSize>
-    using Vec4 = Vec<dim::DimInt<4u>, TSize>;
 
     namespace detail
     {
@@ -563,11 +554,11 @@ namespace alpaka
     -> Vec<TDim, TSize>
     {
         return
-#ifdef __CUDACC__
+#ifdef ALPAKA_CREATE_VEC_IN_CLASS
             Vec<TDim, TSize>::template
 #endif
             createVecFromIndexedFn<
-#ifndef __CUDACC__
+#ifndef ALPAKA_CREATE_VEC_IN_CLASS
                 TDim,
 #endif
                 detail::CreateAdd>(
@@ -613,11 +604,11 @@ namespace alpaka
     -> Vec<TDim, TSize>
     {
         return
-#ifdef __CUDACC__
+#ifdef ALPAKA_CREATE_VEC_IN_CLASS
             Vec<TDim, TSize>::template
 #endif
             createVecFromIndexedFn<
-#ifndef __CUDACC__
+#ifndef ALPAKA_CREATE_VEC_IN_CLASS
                 TDim,
 #endif
                 detail::CreateMul>(
@@ -771,15 +762,15 @@ namespace alpaka
                 //-----------------------------------------------------------------------------
                 ALPAKA_NO_HOST_ACC_WARNING
                 template<
-                    typename TValNew,
+                    typename TSizeNew,
                     typename TDim,
                     typename TSize>
                 ALPAKA_FN_HOST_ACC static auto create(
-                    TValNew const &/* valNew*/,
+                    TSizeNew const &/* valNew*/,
                     Vec<TDim, TSize> const & vec)
-                -> TValNew
+                -> TSizeNew
                 {
-                    return static_cast<TValNew>(vec[Tidx]);
+                    return static_cast<TSizeNew>(vec[Tidx]);
                 }
             };
 
@@ -789,22 +780,22 @@ namespace alpaka
         //-----------------------------------------------------------------------------
         ALPAKA_NO_HOST_ACC_WARNING
         template<
-            typename TValNew,
+            typename TSizeNew,
             typename TDim,
             typename TSize>
         ALPAKA_FN_HOST_ACC static auto cast(Vec<TDim, TSize> const & other)
-        -> Vec<TDim, TValNew>
+        -> Vec<TDim, TSizeNew>
         {
             return
-#ifdef __CUDACC__
-            Vec<TDim, TValNew>::template
+#ifdef ALPAKA_CREATE_VEC_IN_CLASS
+            Vec<TDim, TSizeNew>::template
 #endif
                 createVecFromIndexedFn<
-#ifndef __CUDACC__
+#ifndef ALPAKA_CREATE_VEC_IN_CLASS
                     TDim,
 #endif
                     detail::CreateCast>(
-                        TValNew(),
+                        TSizeNew(),
                         other);
         }
     }
@@ -844,11 +835,11 @@ namespace alpaka
     -> Vec<TDim, TSize>
     {
         return
-#ifdef __CUDACC__
+#ifdef ALPAKA_CREATE_VEC_IN_CLASS
             Vec<TDim, TSize>::template
 #endif
             createVecFromIndexedFn<
-#ifndef __CUDACC__
+#ifndef ALPAKA_CREATE_VEC_IN_CLASS
                 TDim,
 #endif
                 detail::CreateReverse>(
@@ -891,11 +882,11 @@ namespace alpaka
         -> Vec<dim::Dim<TExtents>, size::Size<TExtents>>
         {
             return
-#ifdef __CUDACC__
+#ifdef ALPAKA_CREATE_VEC_IN_CLASS
             Vec<dim::Dim<TExtents>, size::Size<TExtents>>::template
 #endif
                 createVecFromIndexedFn<
-#ifndef __CUDACC__
+#ifndef ALPAKA_CREATE_VEC_IN_CLASS
                     dim::Dim<TExtents>,
 #endif
                     detail::CreateExtent>(
@@ -914,11 +905,11 @@ namespace alpaka
         {
             using IdxOffset = std::integral_constant<std::intmax_t, ((std::intmax_t)dim::Dim<TExtents>::value)-((std::intmax_t)TDim::value)>;
             return
-#ifdef __CUDACC__
+#ifdef ALPAKA_CREATE_VEC_IN_CLASS
             Vec<TDim, size::Size<TExtents>>::template
 #endif
                 createVecFromIndexedFnOffset<
-#ifndef __CUDACC__
+#ifndef ALPAKA_CREATE_VEC_IN_CLASS
                     TDim,
 #endif
                     detail::CreateExtent,
@@ -963,11 +954,11 @@ namespace alpaka
         -> Vec<dim::Dim<TOffsets>, size::Size<TOffsets>>
         {
             return
-#ifdef __CUDACC__
+#ifdef ALPAKA_CREATE_VEC_IN_CLASS
             Vec<dim::Dim<TOffsets>, size::Size<TOffsets>>::template
 #endif
                 createVecFromIndexedFn<
-#ifndef __CUDACC__
+#ifndef ALPAKA_CREATE_VEC_IN_CLASS
                     dim::Dim<TOffsets>,
 #endif
                     detail::CreateOffset>(
@@ -986,11 +977,11 @@ namespace alpaka
         {
             using IdxOffset = std::integral_constant<std::size_t, (std::size_t)(((std::intmax_t)dim::Dim<TOffsets>::value)-((std::intmax_t)TDim::value))>;
             return
-#ifdef __CUDACC__
+#ifdef ALPAKA_CREATE_VEC_IN_CLASS
             Vec<TDim, size::Size<TOffsets>>::template
 #endif
                 createVecFromIndexedFnOffset<
-#ifndef __CUDACC__
+#ifndef ALPAKA_CREATE_VEC_IN_CLASS
                     TDim,
 #endif
                     detail::CreateOffset,
diff --git a/include/alpaka/workdiv/Traits.hpp b/include/alpaka/workdiv/Traits.hpp
index 15947f6..b848ed1 100644
--- a/include/alpaka/workdiv/Traits.hpp
+++ b/include/alpaka/workdiv/Traits.hpp
@@ -76,17 +76,44 @@ namespace alpaka
 
         namespace traits
         {
+            //#############################################################################
+            //! The WorkDivMembers grid block extents trait specialization for classes with WorkDivBase member type.
+            //#############################################################################
+            template<
+                typename TWorkDiv>
+            struct GetWorkDiv<
+                TWorkDiv,
+                origin::Grid,
+                unit::Blocks,
+                typename std::enable_if<
+                    std::is_base_of<typename TWorkDiv::WorkDivBase, typename std::decay<TWorkDiv>::type>::value
+                    && (!std::is_same<typename TWorkDiv::WorkDivBase, typename std::decay<TWorkDiv>::type>::value)>::type>
+            {
+                //-----------------------------------------------------------------------------
+                //! \return The number of threads in each dimension of a block.
+                //-----------------------------------------------------------------------------
+                ALPAKA_NO_HOST_ACC_WARNING
+                ALPAKA_FN_HOST_ACC static auto getWorkDiv(
+                    TWorkDiv const & workDiv)
+                -> Vec<dim::Dim<typename TWorkDiv::WorkDivBase>, size::Size<TWorkDiv>>
+                {
+                    // Delegate the call to the base class.
+                    return
+                        workdiv::getWorkDiv<
+                            origin::Grid,
+                            unit::Blocks>(
+                                static_cast<typename TWorkDiv::WorkDivBase const &>(workDiv));
+                }
+            };
             //#############################################################################
             //! The WorkDivMembers block thread extents trait specialization for classes with WorkDivBase member type.
             //#############################################################################
             template<
-                typename TWorkDiv,
-                typename TOrigin,
-                typename TUnit>
+                typename TWorkDiv>
             struct GetWorkDiv<
                 TWorkDiv,
-                TOrigin,
-                TUnit,
+                origin::Block,
+                unit::Threads,
                 typename std::enable_if<
                     std::is_base_of<typename TWorkDiv::WorkDivBase, typename std::decay<TWorkDiv>::type>::value
                     && (!std::is_same<typename TWorkDiv::WorkDivBase, typename std::decay<TWorkDiv>::type>::value)>::type>
@@ -102,8 +129,8 @@ namespace alpaka
                     // Delegate the call to the base class.
                     return
                         workdiv::getWorkDiv<
-                            TOrigin,
-                            TUnit>(
+                            origin::Block,
+                            unit::Threads>(
                                 static_cast<typename TWorkDiv::WorkDivBase const &>(workDiv));
                 }
             };