UoB-HPC
diff --git a/‎.clang-format
+10 b/‎.clang-format
+10
diff --git a/‎.gitignore
+3-1 b/‎.gitignore
+3-1
diff --git a/‎CHANGELOG.md
+1-1 b/‎CHANGELOG.md
+1-1
diff --git a/‎CMakeLists.txt
+15-113 b/‎CMakeLists.txt
+15-113
diff --git a/‎README.md
+75-26 b/‎README.md
+75-26
diff --git a/‎heatmap.py
+9-2 b/‎heatmap.py
+9-2
@@ -0,0 +1,10 @@
+---
+AllowShortIfStatementsOnASingleLine: Always
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+IndentCaseLabels: true
+ColumnLimit: 120
+CompactNamespaces: true
+FixNamespaceComments: true
+IndentPPDirectives: BeforeHash
+...
@@ -18,4 +18,6 @@ build/
 cmake-build-*/
 .idea/
 .directory
-log.txt
+log.txt
+
+heatmap.csv
@@ -1,7 +1,7 @@
 # Changelog
 All notable changes to this project will be documented in this file.
 
-## [v2.0] - 2022-02-??
+## [v2.0] - 2022-??-??
 
 ### Added
 - CI via GitHub Actions
 
@@ -3,112 +3,20 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
 project(miniBUDE VERSION 2.0 LANGUAGES CXX)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-
 set(CMAKE_VERBOSE_MAKEFILE ON)
 
 # some nicer defaults for standard C++
 set(CMAKE_CXX_EXTENSIONS OFF)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 
-#set(USE_CPU_FEATURES ON)
-#set(MODEL acc)
-
-
-if (NOT MODEL)
-#    set(MODEL raja)
-#    set(RAJA_IN_TREE /home/tom/Downloads/RAJA-v0.14.1/)
-#    set(ENABLE_CUDA ON)
-#    set(CUDA_ARCH sm_61)
-#    set(CUDA_TOOLKIT_ROOT_DIR /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/11.4)
-
-
-
-#    set(MODEL cuda)
-#    set(CUDA_ARCH sm_61)
-#    set(CMAKE_CUDA_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/11.4/bin/nvcc)
-
-
-
-#set(MODEL kokkos)
-#set(KOKKOS_IN_TREE /home/tom/Downloads/kokkos-3.5.00)
-#set(Kokkos_ENABLE_OPENMP ON CACHE BOOL "")
-#set(Kokkos_ENABLE_CUDA ON CACHE BOOL "")
-#set(Kokkos_ARCH_PASCAL61 ON CACHE BOOL "")
-#set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "")
-
-set(MODEL thrust)
-set(THRUST_IMPL CUDA)
-set(SDK_DIR /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/include)
-set(CMAKE_CUDA_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/11.4/bin/nvcc)
-set(CUDA_ARCH sm_61)
-
-
+## Flags for debugging only, enable for development (ASan only works on few models)
+set(SANITIZE OFF)
+if (SANITIZE)
+    set(DEBUG_FLAGS ${DEBUG_FLAGS} -fsanitize=address)
+    set(CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address)
 endif ()
 
-
-#set(MODEL tbb)
-#set(OFFLOAD NVIDIA:sm_35)
-
-
-#set(MODEL sycl)
-#set(SYCL_COMPILER HIPSYCL)
-#set(SYCL_COMPILER_DIR /opt/hipsycl/68fb6d2026b07b0895ff468b58d40858ac1ae7d5)
-
-
-#set(MODEL sycl)
-#set(SYCL_COMPILER ONEAPI-DPCPP)
-
-
-
-#set(CMAKE_CXX_COMPILER "${KOKKOS_IN_TREE}/bin/nvcc_wrapper")
-
-#set(ENV{CUDA_ROOT} /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/11.4/)
-#set(CUDA_ROOT /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/11.4/)
-
-#set(CMAKE_CUDA_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/bin/nvcc)
-#set(CUDA_ROOT /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/11.4/)
-#set(CUDAToolkit_ROOT /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/cuda/11.4/)
-
-
-
-#
-#set(MODEL std20)
-#set(CXX_EXTRA_LIBRARIES tbb)
-
-#set(MODEL std)
-#set(CMAKE_CXX_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/21.5/compilers/bin/nvc++)
-
-
-#set(MODEL cuda)
-#set(CMAKE_CUDA_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/bin/nvcc)
-#set(CUDA_EXTRA_FLAGS "-gencode arch=compute_61,code=sm_61")
-#set(CUDA_ARCH sm_61)
-
-
-#set(MODEL ocl)
-#set(OpenCL_LIBRARY /opt/rocm-4.3.0/opencl/lib/libOpenCL.so.1.2)
-
-#set(MODEL hip)
-#set(CMAKE_CXX_COMPILER /opt/rocm-4.3.0/bin/hipcc)
-#set(CXX_EXTRA_FLAGS  -nogpuinc -nogpulib)
-#set(CUDA_EXTRA_FLAGS "-gencode arch=compute_35,code=sm_35")
-#set(CUDA_ARCH sm_35)
-
-
-#set(SYCL_COMPILER_DIR /opt/hipsycl/68fb6d2026b07b0895ff468b58d40858ac1ae7d5)
-
-#set(MODEL std)
-##set(CXX_EXTRA_LIBRARIES tbb)
-#set(CMAKE_CXX_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/21.9/compilers/bin/nvc++)
-#set(NVHPC_OFFLOAD cc61)
-
-
-#set(SYCL_COMPILER COMPUTECPP)
-#set(SYCL_COMPILER_DIR /home/tom/Downloads/ComputeCpp-CE-2.6.0-x86_64-linux-gnu/)
-#set(OpenCL_LIBRARY /opt/intel/oneapi/compiler/2021.4.0/linux/lib/libOpenCL.so.1.2)
-
-
 # the final executable name
 set(EXE_NAME bude)
 
@@ -157,7 +65,7 @@ if ((DEFINED CXX_EXTRA_FLAGS) AND (NOT DEFINED CXX_EXTRA_LINK_FLAGS))
     set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS})
 endif ()
 
-option(USE_CPU_FEATURES "Enable the cpu_feature library for host CPU detection" ON)
+option(USE_CPU_FEATURES "Enable the cpu_feature library for host CPU detection" OFF)
 
 if (USE_CPU_FEATURES)
     include(FetchContent)
@@ -177,8 +85,8 @@ include(cmake/register_models.cmake)
 # register out models <model_name> <preprocessor_def_name> <source files...>
 register_model(omp OMP fasten.hpp)
 register_model(ocl OCL fasten.hpp)
-register_model(std STD fasten.hpp)
-register_model(std20 STD20 fasten.hpp) # TODO
+register_model(std-indices STD_INDICES fasten.hpp)
+register_model(std-ranges STD_RANGES fasten.hpp) # TODO
 register_model(hip HIP fasten.hpp)
 register_model(cuda CUDA fasten.hpp)
 register_model(kokkos KOKKOS fasten.hpp)
@@ -190,7 +98,6 @@ register_model(tbb TBB fasten.hpp)
 register_model(thrust THRUST fasten.hpp) # TODO
 
 
-
 set(USAGE ON CACHE BOOL "Whether to print all custom flags for the selected model")
 
 message(STATUS "Available models:  ${REGISTERED_MODELS}")
@@ -226,12 +133,6 @@ endif ()
 
 message(STATUS "Default ${CMAKE_BUILD_TYPE} flags are `${DEFAULT_${BUILD_TYPE}_FLAGS}`, set ${BUILD_TYPE}_FLAGS to override (CXX_EXTRA_* flags are not affected)")
 
-set(SANATIZE OFF)
-
-if (SANATIZE)
-    set(DEBUG_FLAGS ${DEBUG_FLAGS} -fsanitize=address)
-    set(CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address)
-endif ()
 
 # setup common build flag defaults if there are no overrides
 if (NOT DEFINED ${BUILD_TYPE}_FLAGS)
@@ -243,9 +144,9 @@ endif ()
 set(DEFAULT_PPWI "1,2,4,8,16,32,64,128")
 
 if (NOT PPWI)
-    message(STATUS PPWI not set, defaulting to ${DEFAULT_PPWI})
+    message(STATUS "PPWI not set, defaulting to ${DEFAULT_PPWI}")
     set(PPWI ${DEFAULT_PPWI})
-endif()
+endif ()
 
 
 message(STATUS "CXX vendor  : ${CMAKE_CXX_COMPILER_ID} (${CMAKE_CXX_COMPILER})")
@@ -285,9 +186,10 @@ target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS})
 # setup git_watcher...
 set(PRE_CONFIGURE_FILE "${CMAKE_SOURCE_DIR}/src/meta_vcs.h.in")
 set(POST_CONFIGURE_FILE "${CMAKE_BINARY_DIR}/generated/meta_vcs.h")
-#include("${CMAKE_SOURCE_DIR}/cmake/git_watcher.cmake")
-#set(GIT_FAIL_IF_NONZERO_EXIT FALSE)
-#add_dependencies(${EXE_NAME} check_git)
+
+set(GIT_FAIL_IF_NONZERO_EXIT FALSE) # Dont' fail the build because of VCS; use FALSE here because git_watcher says so
+include("${CMAKE_SOURCE_DIR}/cmake/git_watcher.cmake")
+add_dependencies(${EXE_NAME} check_git)
 
 
 # some models require the target to be already specified so they can finish their setup here
@@ -312,6 +214,6 @@ else ()
     set(COMPILE_COMMANDS COMPILE_FLAGS)
 endif ()
 
-#set_target_properties(${EXE_NAME} PROPERTIES OUTPUT_NAME "${BIN_NAME}")
+set_target_properties(${EXE_NAME} PROPERTIES OUTPUT_NAME "${BIN_NAME}")
 
 install(TARGETS ${EXE_NAME} DESTINATION bin)
@@ -8,35 +8,84 @@ Increasing the iteration count has similar performance effects to docking multip
 
 The top-level `data` directory contains the input common to implementations.
 The top-level `makedeck` directory contains an input deck generation program and a set of mol2/bhff input files.
-Each other subdirectory contains a separate C/C++ implementation:
-
-- [OpenMP](openmp/) for CPUs
-- [OpenMP target](openmp-target/) for GPUs
-- [CUDA](cuda/) for GPUs
-- [OpenCL](opencl/) for GPUs
-- [OpenACC](openacc/) for GPUs
-- [SYCL](sycl/) for CPUs and GPUs
-- [Kokkos](kokkos/) for CPUs and GPUs
-
-We also include implementations in emerging programming languages as direct ports of miniBUDE:
-
-- [Julia](miniBUDE.jl) for CPUs (@threads) and GPUs ([CUDA.jl](https://juliagpu.gitlab.io/CUDA.jl/), [AMDGPU.jl](https://amdgpu.juliagpu.org/stable/), [oneAPI.jl](https://github.com/JuliaGPU/oneAPI.jl), etc)
-
+Each other subdirectory in `src` contains a separate C/C++ implementation.
 
 ## Building
 
-To build with the default options, type `make` in an implementation directory.
-There are options to choose the compiler used and the architecture targeted.
-
-Refer to each implementation's README for further build instructions.
-
-## Running
-
-To run with the default options, run the binary without any flags.
-To adjust the run time, use `-i` to set the number of iterations.
-For very short runs, e.g. for simulation, use `-n 1024` to reduce the number of poses.
-
-Refer to each implementation's README for further run instructions.
+Drivers, compiler and software applicable to whichever implementation you would like to build against is required.
+
+### CMake
+
+The project supports building with CMake >= 3.14.0, which can be installed without root via the [official script](https://cmake.org/download/).
+
+Each miniBUDE implementation (programming model) is built as follows:
+
+```shell
+$ cd miniBUDE
+
+# configure the build, build type defaults to Release
+# The -DMODEL flag is required
+$ cmake -Bbuild -H. -DMODEL=<model> <model specific flags prefixed with -D...>
+
+# compile
+$ cmake --build build
+
+# run executables in ./build
+$ ./build/<model>-bude
+```
+
+The `MODEL` option selects one implementation of miniBUDE to build.
+The source for each model's implementations are located in `./src/<model>`.
+
+Currently available models are:
+```
+omp;ocl;std-indices;std-ranges;hip;cuda;kokkos;sycl;acc;raja;tbb;thrust
+```
+
+#### Overriding default flags
+By default, we have defined a set of optimal flags for known HPC compilers.
+There are assigned those to `RELEASE_FLAGS`, and you can override them if required.
+
+To find out what flag each model supports or requires, simply configure while only specifying the model.
+For example:
+```shell
+> cd miniBUDE
+> cmake -Bbuild -H. -DMODEL=omp 
+No CMAKE_BUILD_TYPE specified, defaulting to 'Release'
+-- CXX_EXTRA_FLAGS: 
+        Appends to common compile flags. These will be appended at link phase as well.
+        To use separate flags at link phase, set `CXX_EXTRA_LINK_FLAGS`
+-- CXX_EXTRA_LINK_FLAGS: 
+        Appends to link flags which appear *before* the objects.
+        Do not use this for linking libraries, as the link line is order-dependent
+-- CXX_EXTRA_LIBRARIES: 
+        Append to link flags which appear *after* the objects.
+        Use this for linking extra libraries (e.g `-lmylib`, or simply `mylib`)
+-- CXX_EXTRA_LINKER_FLAGS: 
+        Append to linker flags (i.e GCC's `-Wl` or equivalent)
+-- Available models:  omp;ocl;std-indices;std-ranges;hip;cuda;kokkos;sycl;acc;raja;tbb;thrust
+-- Selected model  :  omp
+-- Supported flags:
+
+   CMAKE_CXX_COMPILER (optional, default=c++): Any CXX compiler that supports OpenMP as per CMake detection (and offloading if enabled with `OFFLOAD`)
+   ARCH (optional, default=): This overrides CMake's CMAKE_SYSTEM_PROCESSOR detection which uses (uname -p), this is mainly for use with
+         specialised accelerators only and not to be confused with offload which is is mutually exclusive with this.
+         Supported values are:
+          - NEC
+   OFFLOAD (optional, default=OFF): Whether to use OpenMP offload, the format is <VENDOR:ARCH?>|ON|OFF.
+        We support a small set of known offload flags for clang, gcc, and icpx.
+        However, as offload support is rapidly evolving, we recommend you directly supply them via OFFLOAD_FLAGS.
+        For example:
+          * OFFLOAD=NVIDIA:sm_60
+          * OFFLOAD=AMD:gfx906
+          * OFFLOAD=INTEL
+          * OFFLOAD=ON OFFLOAD_FLAGS=...
+   OFFLOAD_FLAGS (optional, default=): If OFFLOAD is enabled, this *overrides* the default offload flags
+   OFFLOAD_APPEND_LINK_FLAG (optional, default=ON): If enabled, this appends all resolved offload flags (OFFLOAD=<vendor:arch> or directly from OFFLOAD_FLAGS) to the link flags.
+        This is required for most offload implementations so that offload libraries can linked correctly.
+
+
+```
 
 ### Benchmarks
 
 
@@ -2,6 +2,10 @@
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
+from matplotlib.colors import LogNorm
+
+import copy
+import matplotlib
 
 
 def linear_scale(old_min, old_max, new_min, new_max, old_value):
@@ -15,14 +19,17 @@ def linear_scale(old_min, old_max, new_min, new_max, old_value):
 normalised = data.copy()
 
 normalised["sum_ms"] = normalised["sum_ms"].apply(
-    lambda x: linear_scale(normalised["sum_ms"].min(), normalised["sum_ms"].max(), 1, 0, x))
+    lambda x: linear_scale(normalised["sum_ms"].min(), normalised["sum_ms"].max(), 0, 100, x) )
 
 out = normalised.pivot(index="ppwi", columns="wgsize", values="sum_ms")
 out.sort_index(level=0, ascending=False, inplace=True)
 
 # data = np.genfromtxt('heatmap.csv', delimiter=',')
 print(out)
 
-sns.heatmap(out, annot=True)
+my_cmap = copy.copy(matplotlib.cm.get_cmap('rocket')) # copy the default cmap
+my_cmap.set_bad((0,0,0))
+
+sns.heatmap(out, annot=True, norm=LogNorm(), cmap=my_cmap)
 
 plt.show()