diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..c40becf81 --- /dev/null +++ b/.clang-format @@ -0,0 +1,124 @@ +Language: Cpp +AccessModifierOffset: -2 +AlignAfterOpenBracket: DontAlign +AlignConsecutiveMacros: false +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: false +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: All +AlwaysBreakAfterReturnType: AllDefinitions +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterCaseLabel: true + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: false + AfterExternBlock: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: NonAssignment +BreakBeforeBraces: Allman +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeComma +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 2 +ContinuationIndentWidth: 2 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false 
+ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: false +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertBraces: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 2 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: true +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: true +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: true +SpacesInParentheses: true +SpacesInSquareBrackets: true +Standard: Cpp03 +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..e02b1c455 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1 @@ +Checks: '-*,modernize-use-nullptr,modernize-use-override,bugprone,modernize-redundant-void-arg' diff 
--git a/.gitignore b/.gitignore index 3b24d7e6d..888cfaf75 100755 --- a/.gitignore +++ b/.gitignore @@ -39,4 +39,9 @@ doxygen python/test/log.txt python/Potjans_2014/data/ python/Potjans_2014_tmp/data/ +python/mpi_mem_check/test_*.dat +python/mpi_mem_check/req_mem_*.dat +python/mpi_mem_check/full_*.dat +python/mpi_mem_check/log*.txt +python/mpi_mem_check/report.nsys-rep .vscode/ diff --git a/Makefile.am b/Makefile.am index f45167727..5cf0608c6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -12,17 +12,54 @@ CCLD = $(CC) LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ HCUSRC=\ +connect.h \ +conn12b.h \ +conn16b.h \ remote_connect.h \ +connect_rules.h \ +connect_spec.h \ +nestgpu.h \ +get_spike.h \ +send_spike.h \ +rev_spike.h \ +spike_buffer.h \ +remote_spike.h \ +mpi_comm.h \ +base_neuron.h \ +node_group.h \ +neuron_models.h \ +poiss_gen.h \ +poiss_gen_variables.h \ +cuda_error.h \ +ngpu_exception.h \ +getRealTime.h \ +random.h \ utilities.h \ distribution.h \ +scan.h \ +prefix_scan.h \ +nested_loop.h \ copass_kernels.h \ copass_sort.h \ -connect.h \ -iaf_psc_exp_g.h \ -iaf_psc_exp_hc_params.h \ -iaf_psc_exp_hc.h \ -iaf_psc_exp.h \ +rk5_const.h \ +rk5.h \ +rk5_interface.h \ +propagate_error.h \ +propagator_stability.h \ +stdp.h \ +syn_model.h \ +test_syn_model.h \ +spike_detector.h \ +spike_generator.h \ +parrot_neuron.h \ +multimeter.h \ ext_neuron.h \ +user_m1.h \ +user_m1_kernel.h \ +user_m1_rk5.h \ +user_m2.h \ +user_m2_kernel.h \ +user_m2_rk5.h \ aeif_cond_alpha.h \ aeif_cond_alpha_kernel.h \ aeif_cond_alpha_multisynapse.h \ @@ -46,17 +83,6 @@ aeif_psc_exp_kernel.h \ aeif_psc_exp_multisynapse.h \ aeif_psc_exp_multisynapse_kernel.h \ aeif_psc_exp_multisynapse_rk5.h \ -base_neuron.h \ -connect.h \ -connect_rules.h \ -connect_spec.h \ -copass_kernels.h \ -copass_sort.h \ -cuda_error.h \ -distribution.h \ -ext_neuron.h \ -getRealTime.h \ -get_spike.h \ iaf_psc_exp_g.h \ iaf_psc_exp.h \ iaf_psc_exp_hc.h \ @@ -68,53 +94,46 @@ 
izhikevich_cond_beta_rk5.h \ izhikevich.h \ izhikevich_psc_exp_2s.h \ izhikevich_psc_exp_5s.h \ -izhikevich_psc_exp.h \ -multimeter.h \ -nested_loop.h \ -nestgpu.h \ -neuron_models.h \ -ngpu_exception.h \ -node_group.h \ -parrot_neuron.h \ -poiss_gen.h \ -poiss_gen_variables.h \ -prefix_scan.h \ -propagate_error.h \ -propagator_stability.h \ -random.h \ -rev_spike.h \ -rk5_const.h \ -rk5.h \ -rk5_interface.h \ -scan.h \ -send_spike.h \ -spike_buffer.h \ -spike_detector.h \ -spike_generator.h \ -remote_spike.h \ -mpi_comm.h \ -stdp.h \ -syn_model.h \ -test_syn_model.h \ -user_m1.h \ -user_m1_kernel.h \ -user_m1_rk5.h \ -user_m2.h \ -user_m2_kernel.h \ -user_m2_rk5.h \ -utilities.h +izhikevich_psc_exp.h CUSRC=\ +connect.cu \ +conn12b.cu \ +conn16b.cu \ remote_connect.cu \ +connect_rules.cu \ +nestgpu.cu \ +get_spike.cu \ +send_spike.cu \ +rev_spike.cu \ +spike_buffer.cu \ +remote_spike.cu \ +mpi_comm.cu \ +base_neuron.cu \ +node_group.cu \ +neuron_models.cu \ +poiss_gen.cu \ +getRealTime.cu \ +random.cu \ utilities.cu \ distribution.cu \ +scan.cu \ +prefix_scan.cu \ +nested_loop.cu \ copass_kernels.cu \ copass_sort.cu \ -connect.cu \ -iaf_psc_exp_g.cu \ -iaf_psc_exp_hc.cu \ -iaf_psc_exp.cu \ +rk5.cu \ +propagator_stability.cu \ +stdp.cu \ +syn_model.cu \ +test_syn_model.cu \ +spike_detector.cu \ +spike_generator.cu \ +parrot_neuron.cu \ +multimeter.cu \ ext_neuron.cu \ +user_m1.cu \ +user_m2.cu \ aeif_cond_alpha.cu \ aeif_cond_alpha_multisynapse.cu \ aeif_cond_beta.cu \ @@ -124,15 +143,6 @@ aeif_psc_alpha_multisynapse.cu \ aeif_psc_delta.cu \ aeif_psc_exp.cu \ aeif_psc_exp_multisynapse.cu \ -base_neuron.cu \ -connect.cu \ -connect_rules.cu \ -copass_kernels.cu \ -copass_sort.cu \ -distribution.cu \ -ext_neuron.cu \ -getRealTime.cu \ -get_spike.cu \ iaf_psc_exp.cu \ iaf_psc_exp_g.cu \ iaf_psc_exp_hc.cu \ @@ -141,32 +151,8 @@ izhikevich_cond_beta.cu \ izhikevich.cu \ izhikevich_psc_exp_2s.cu \ izhikevich_psc_exp_5s.cu \ -izhikevich_psc_exp.cu \ -multimeter.cu \ 
-nested_loop.cu \ -nestgpu.cu \ -neuron_models.cu \ -node_group.cu \ -parrot_neuron.cu \ -poiss_gen.cu \ -prefix_scan.cu \ -propagator_stability.cu \ -random.cu \ -rev_spike.cu \ -rk5.cu \ -scan.cu \ -send_spike.cu \ -spike_buffer.cu \ -spike_detector.cu \ -spike_generator.cu \ -remote_spike.cu \ -mpi_comm.cu \ -stdp.cu \ -syn_model.cu \ -test_syn_model.cu \ -user_m1.cu \ -user_m2.cu \ -utilities.cu +izhikevich_psc_exp.cu + HCPPSRC=\ nestgpu_C.h diff --git a/build_support/.clang-tidy-ignore b/build_support/.clang-tidy-ignore new file mode 100644 index 000000000..fbb2a9235 --- /dev/null +++ b/build_support/.clang-tidy-ignore @@ -0,0 +1 @@ +*.cuh diff --git a/build_support/check_all_c_c++_cu_files.sh b/build_support/check_all_c_c++_cu_files.sh new file mode 100755 index 000000000..bc99fb34a --- /dev/null +++ b/build_support/check_all_c_c++_cu_files.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +# +# This file is part of NEST GPU. +# +# Copyright (C) 2021 The NEST Initiative +# +# NEST GPU is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# NEST GPU is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with NEST GPU. If not, see . +# + +# With this script you can easily check all C/C++/CU files contained in +# the src directory of NEST GPU. Internally it uses clang-tidy to do +# the actual check. +# + +function make_temp_dir { + # Create a temporary directory and store its name in a variable. + TEMPD=$(mktemp -d) + + # Exit if the temp directory wasn't created successfully. + if [ ! 
-e "$TEMPD" ]; then + >&2 echo "Error: failed to create temp directory" + exit 1 + fi + + + # Make sure the temp directory gets removed on script exit. + trap "exit 1" HUP INT PIPE QUIT TERM + trap 'rm -rf "$TEMPD"' EXIT +} + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +CMD_DIR=$(dirname $(echo $0)) +CLANG_TIDY=${CMD_DIR}/clang-tidy-cuda.sh + +if [ ! -f $CLANG_TIDY ]; then + echo "Error: $CLANG_TIDY file not found in $CMD_DIR folder" + exit 1 +fi + +SRC_DIR=$1 +if [ -d "$SRC_DIR" ]; then + if [ -L "$SRC_DIR" ]; then + # It is a symlink + echo "Error: cannot pass a symboloc link as source path" + exit 1 + fi +else + echo "Error: source path $SRC_DIR not found" + exit 1 +fi + +make_temp_dir +CONF_DIR=${TEMPD}/config +mkdir $CONF_DIR +if [ ! -e "$CONF_DIR" ]; then + >&2 echo "Error: failed to create $CONF_DIR directory" + exit 1 +fi +CONF_H=${CONF_DIR}/config.h +:>$CONF_H +if [ ! -f $CONF_H ]; then + echo "Error: cannot create temporary file $CONF_H" + exit 1 +fi + + +cp $CLANG_TIDY $TEMPD +CLANG_TIDY=$(basename $CLANG_TIDY) +if [ ! -f $TEMPD/$CLANG_TIDY ]; then + echo "Error: cannot create temporary executable $CLANG_TIDY in folder $TEMPD" + exit 1 +fi + +pushd . +cd $SRC_DIR + +for fn in $(ls *.cu *.cpp *.cc *.c *.cuh *.hpp *.h); do + cat $fn | sed 's:////:#if 0:;s:////:#endif:' > $TEMPD/$fn + if [ ! -f $TEMPD/$fn ]; then + echo "Error: cannot create file $TEMPD/$fn" + popd + exit 1 + fi +done + + +cd $TEMPD + +PASSED_NUM=0 +for fn in $(ls *.cu *.cpp *.cc *.c | grep -v user_m); do + echo " - Check with $CLANG_TIDY C/C++/CUDA file: $fn" + #$TEMPD/$CLANG_TIDY --include-path=../../build_cmake/libnestutil/ $fn + echo "$TEMPD/$CLANG_TIDY --include-path=$CONF_DIR $fn" + $TEMPD/$CLANG_TIDY --include-path=$CONF_DIR $fn + if [ $? 
-eq 0 ]; then + echo PASSED + PASSED_NUM=$(($PASSED_NUM + 1)) + else + popd + exit 1 + fi + +done + +popd +echo "Checked $PASSED_NUM files with clang-tidy-cuda.sh" +echo "All tests PASSED" + +exit 0 diff --git a/build_support/clang-format-cuda.sh b/build_support/clang-format-cuda.sh new file mode 100755 index 000000000..e31fef014 --- /dev/null +++ b/build_support/clang-format-cuda.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 input-file" + return +fi + +if [ ! -f .clang-format ]; then + echo "Error: .clang-format file not found in current directory" + return +fi + +if [ ! -f $1 ]; then + echo "Error: input file $1 not found" + return +fi + +if grep -q '$$<' $1; then + echo 'Error: illegal character sequence in input file: "$$<"' + return +fi +if grep -q '$ >' $1; then + echo 'Error: illegal character sequence in input file: "$ >"' + return +fi +if grep -q '$>' $1; then + echo 'Error: illegal character sequence in input file: "$>"' + return +fi + +cat $1 | sed 's/<<>>/$ >/g;' > tmp1~ +clang-format -style=file:.clang-format tmp1~ > tmp2~ +cat tmp2~ | sed 's/$$/>>>/g;s/$>/>>>/g;' > $1 +rm -f tmp1~ +rm -f tmp2~ diff --git a/build_support/clang-tidy-cuda.sh b/build_support/clang-tidy-cuda.sh new file mode 100755 index 000000000..d16af0626 --- /dev/null +++ b/build_support/clang-tidy-cuda.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# +# This file is part of NEST GPU. +# +# Copyright (C) 2021 The NEST Initiative +# +# NEST GPU is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# NEST GPU is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with NEST GPU. If not, see . +# + +cuda_default_path="/usr/local/cuda/include" + +if [ "$#" -eq 0 ]; then + echo "Usage: $0 [--include-path=INCLUDE_PATHS] [--cuda-path=CUDA_PATHS] [--mpi-path=MPI_PATHS] input-file" + echo "where INCLUDE_PATHS are optional header paths separated by colons," + echo "CUDA_PATHS are the paths of CUDA headers separated by colons" + echo "(default: $cuda_default_path)" + echo "and MPI_PATHS are the paths of MPI headers separated by colons" + exit 0 +fi + +cuda_path="" +mpi_path="" +include_path="" + +for i in "$@"; do + case $i in + --include-path=*) + include_path="${i#*=}" + shift # past argument=value + ;; + --cuda-path=*) + cuda_path="${i#*=}" + shift # past argument=value + ;; + --mpi-path=*) + mpi_path="${i#*=}" + shift # past argument=value + ;; + -*|--*) + echo "Error: unknown option $i" + exit 1 + ;; + *) + ;; + esac +done + +if [[ -n $1 ]]; then + echo "Input file: $1" +else + echo "Error: input file not specified." + exit 1 +fi + +if [ ! -f $1 ]; then + echo "Error: input file $1 not found." 
+ exit 1 +fi + +if [ "$include_path" != "" ]; then + include_path=$(echo ":$include_path" | sed 's/::*/:/g;s/:$//;s/:/ -I /g') +fi + +# Searches the paths of CUDA headers +if [ "$cuda_path" == "" ]; then + cuda_path=":/usr/local/cuda/include" +else + cuda_path=$(echo ":$cuda_path" | sed 's/::*/:/g;s/:$//') +fi + +cuda_path_spaced=$(echo $cuda_path | tr ':' ' ') +cuda_err=1 +for dn in $cuda_path_spaced; do + if test -f "$dn/cuda.h" ; then + echo "cuda.h found in path $dn" + cuda_err=0 + break + fi +done + +if [ $cuda_err -eq 1 ]; then + echo "cuda.h not found in path(s) $cuda_path_spaced" + echo "You can specify path for CUDA headers with the option --cuda-path=CUDA_PATHS" + echo "where CUDA_PATHS are the paths of CUDA headers separated by colons" + echo "(default: $cuda_default_path)" + exit 1 +fi + +cuda_include=$(echo $cuda_path | sed 's/:/ -isystem /g') + +#cat $1 | sed 's:////:#if 0:;s:////:#endif:' > tmp~ + +#cat ../build_cmake/compile_commands.json | sed "s:-Xcompiler=-fPIC::;s:-forward-unknown-to-host-compiler::;s:--compiler-options='.*'::;s:--generate-code=arch=compute_80,code=\[compute_80,sm_80\]::;s:--maxrregcount=55::" > compile_commands.json + +# Searches the paths of MPI headers +if [ "$mpi_path" == "" ]; then + mpi_include=$( \ + for l in $(mpicc -showme); do \ + echo $l; \ + done | grep '^-I') + if [ "$mpi_include" == "" ]; then + echo "Error: cannot find MPI include paths" + echo "You can specify path for MPI headers with the option --mpi-path=MPI_PATHS" + echo "where MPI_PATHS are the paths of MPI headers separated by colons" + exit 1 + fi + mpi_include=$(echo $mpi_include | sed 's/-I/ -isystem /g') + mpi_path_spaced=$(echo $mpi_include | sed 's/-I/ /g') +else + mpi_path=$(echo ":$mpi_path" | sed 's/::*/:/g;s/:$//') + mpi_path_spaced=$(echo $mpi_path | tr ':' ' ') + mpi_include=$(echo $mpi_path | sed 's/:/ -isystem /g') +fi + +mpi_err=1 +for dn in $mpi_path_spaced; do + if test -f "$dn/mpi.h" ; then + echo "mpi.h found in path $dn" + mpi_err=0 + 
break + fi +done + +if [ $mpi_err -eq 1 ]; then + echo "mpi.h not found in path(s) $mpi_path_spaced" + echo "You can specify path for MPI headers with the option --mpi-path=MPI_PATHS" + echo "where MPI_PATHS are the paths of MPI headers separated by colons" + exit 1 +fi + +echo "clang-tidy $1 -p . -- $include_path $mpi_include $cuda_include --no-cuda-version-check" + +clang-tidy $1 -p . -- $include_path $mpi_include $cuda_include --no-cuda-version-check diff --git a/build_support/format_all_c_c++_cu_files.sh b/build_support/format_all_c_c++_cu_files.sh new file mode 100755 index 000000000..b0ffed1af --- /dev/null +++ b/build_support/format_all_c_c++_cu_files.sh @@ -0,0 +1,163 @@ +#!/bin/bash + +# +# This file is part of NEST GPU. +# +# Copyright (C) 2021 The NEST Initiative +# +# NEST GPU is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# NEST GPU is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with NEST GPU. If not, see . +# + +# With this script you can easily format all C/C++/CU files contained in +# the src directory of NEST GPU. Internally it uses clang-format to do +# the actual formatting. +# +# NEST GPU C/C++/CUDA code should be formatted according to clang-format +# version 17.0.4. If you would like to see how the code will be formatted +# with a different clang-format version, execute e.g. +# `CLANG_FORMAT=clang-format-14 ./format_all_c_c++_cu_files.sh`. +# +# By default the script starts at the current working directory ($PWD), but +# supply a different starting directory as the first argument to the command. 
+ +CLANG_FORMAT=${CLANG_FORMAT:-clang-format} +CLANG_FORMAT_FILE=${CLANG_FORMAT_FILE:-${PWD}/.clang-format} + +# Drop files that should not be checked +FILES_TO_IGNORE="" # not used now, bult could be used in the future +DIRS_TO_IGNORE="thirdparty" # not used now, bult could be used in the future + +CHANGE_COUNT=0 + +function clang_format_cuda { + if [ ! -f $1 ]; then + echo "Error: input file $1 not found" + return + fi + + if grep -q '$$<' $1; then + echo 'Error: illegal character sequence in input file: "$$<"' + return + fi + if grep -q '$ >' $1; then + echo 'Error: illegal character sequence in input file: "$ >"' + return + fi + if grep -q '$>' $1; then + echo 'Error: illegal character sequence in input file: "$>"' + return + fi + + cat $1 | sed 's/<<>>/$ >/g;' > $TEMPD/tmp1~ + #echo "CLANG_FORMAT_FILE: $CLANG_FORMAT_FILE" + clang-format -style=file:$CLANG_FORMAT_FILE $TEMPD/tmp1~ > $TEMPD/tmp2~ + cat $TEMPD/tmp2~ | sed 's/$$/>>>/g;s/$>/>>>/g;' > $TEMPD/tmp1~ + if ! cmp -s $TEMPD/tmp1~ $1; then # file changed by clang-format + /bin/cp -f $TEMPD/tmp1~ $1 + CHANGE_COUNT=$((CHANGE_COUNT+1)) + echo " FILE CHANGED BY FORMATTING" + fi +} + +# Recursively process all C/C++/CUDA files in all sub-directories. +function process_dir { + dir=$1 + echo "Process directory: $dir" + + if [[ " $DIRS_TO_IGNORE " =~ .*[[:space:]]${dir##*/}[[:space:]].* ]]; then + echo " Directory explicitly ignored." + return + fi + + for f in $dir/*; do + if [[ -d $f ]]; then + # Recursively process sub-directories. + process_dir $f + else + ignore_file=0 + + for FILE_TO_IGNORE in $FILES_TO_IGNORE; do + if [[ $f == *$FILE_TO_IGNORE* ]]; then + ignore_file=1 + break + fi + done + + if [ $ignore_file == 1 ] ; then + continue + fi + + case $f in + *.cpp | *.cc | *.c | *.h | *.hpp | *.cu | *.cuh ) + # Format C/C++/CUDA files. + echo " - Format C/C++/CUDA file: $f" + # $CLANG_FORMAT -i $f + clang_format_cuda $f + ;; + * ) + # Ignore all other files. 
+ esac + fi + done +} + +function help_output { + echo "The $CLANG_FORMAT_FILE requires clang-format version 13 or later." + echo "Use like: [CLANG_FORMAT=] ./build_support/`basename $0` [start folder, defaults to '$PWD']" + exit 0 +} + +function make_temp_dir { + # Create a temporary directory and store its name in a variable. + TEMPD=$(mktemp -d) + + # Exit if the temp directory wasn't created successfully. + if [ ! -e "$TEMPD" ]; then + >&2 echo "Failed to create temp directory" + exit 1 + fi + + + # Make sure the temp directory gets removed on script exit. + trap "exit 1" HUP INT PIPE QUIT TERM + trap 'rm -rf "$TEMPD"' EXIT +} + +make_temp_dir + +if [[ ! -f $CLANG_FORMAT_FILE ]]; then + echo "Cannot find $CLANG_FORMAT_FILE file. Please start '`basename $0`' from the NEST GPU base source directory." + help_output +fi + +if [[ $# -eq 0 ]]; then + # Start with current directory. + startdir=$PWD +elif [[ $# -eq 1 ]]; then + if [[ -d $1 ]]; then + # Start with given directory. + startdir=$1 + else + # Not a directory. + help_output + fi +else + # Two or more arguments... 
+ help_output +fi + +# Start formatting the $startdir and all subdirectories +process_dir $startdir + +echo "$CHANGE_COUNT files have been changed by formatting" diff --git a/c++/examples/brunel_mpi.cpp b/c++/examples/brunel_mpi.cpp index fe9335e9f..d7f185e06 100644 --- a/c++/examples/brunel_mpi.cpp +++ b/c++/examples/brunel_mpi.cpp @@ -20,50 +20,48 @@ * */ - - - - -#include +#include "nestgpu.h" +#include #include +#include #include -#include -#include "nestgpu.h" using namespace std; -int main(int argc, char *argv[]) +int +main( int argc, char* argv[] ) { NESTGPU ngpu; - ngpu.ConnectMpiInit(argc, argv); + ngpu.ConnectMpiInit( argc, argv ); int mpi_np = ngpu.MpiNp(); - if (argc != 2 || mpi_np != 2) { - cout << "Usage: mpirun -np 2 " << argv[0] << " n_neurons\n"; + if ( argc != 2 || mpi_np != 2 ) + { + cout << "Usage: mpirun -np 2 " << argv[ 0 ] << " n_neurons\n"; ngpu.MpiFinalize(); return 0; } int arg1; - sscanf(argv[1], "%d", &arg1); + sscanf( argv[ 1 ], "%d", &arg1 ); int mpi_id = ngpu.MpiId(); - cout << "Building on host " << mpi_id << " ..." 
< +#include "nestgpu.h" +#include #include +#include #include -#include -#include "nestgpu.h" using namespace std; -int main(int argc, char *argv[]) +int +main( int argc, char* argv[] ) { - if (argc != 2) { - cout << "Usage: " << argv[0] << " n_neurons\n"; + if ( argc != 2 ) + { + cout << "Usage: " << argv[ 0 ] << " n_neurons\n"; return 0; } int arg1; - sscanf(argv[1], "%d", &arg1); + sscanf( argv[ 1 ], "%d", &arg1 ); NESTGPU ngpu; cout << "Building ...\n"; - ngpu.SetRandomSeed(1234ULL); // seed for GPU random numbers - + ngpu.SetRandomSeed( 1234ULL ); // seed for GPU random numbers + int n_receptors = 2; - int order = arg1/5; + int order = arg1 / 5; int NE = 4 * order; // number of excitatory neurons int NI = 1 * order; // number of inhibitory neurons int n_neurons = NE + NI; // number of neurons in total - int CE = 800; // number of excitatory synapses per neuron - int CI = CE/4; // number of inhibitory synapses per neuron + int CE = 800; // number of excitatory synapses per neuron + int CI = CE / 4; // number of inhibitory synapses per neuron float Wex = 0.05; float Win = 0.35; @@ -64,68 +62,66 @@ int main(int argc, char *argv[]) float poiss_delay = 0.2; // poisson signal delay in ms // create poisson generator - NodeSeq pg = ngpu.Create("poisson_generator"); - ngpu.SetNeuronParam(pg, "rate", poiss_rate); + NodeSeq pg = ngpu.Create( "poisson_generator" ); + ngpu.SetNeuronParam( pg, "rate", poiss_rate ); // create n_neurons neurons with n_receptor receptor ports - NodeSeq neuron = ngpu.Create("aeif_cond_beta", n_neurons, - n_receptors); - NodeSeq exc_neuron = neuron.Subseq(0,NE-1); // excitatory neuron group - NodeSeq inh_neuron = neuron.Subseq(NE, n_neurons-1); //inhibitory neuron group + NodeSeq neuron = ngpu.Create( "aeif_cond_beta", n_neurons, n_receptors ); + NodeSeq exc_neuron = neuron.Subseq( 0, NE - 1 ); // excitatory neuron group + NodeSeq inh_neuron = neuron.Subseq( NE, n_neurons - 1 ); // inhibitory neuron group // neuron parameters - float E_rev[] = 
{0.0, -85.0}; - float tau_decay[] = {1.0, 1.0}; - float tau_rise[] = {1.0, 1.0}; - ngpu.SetNeuronParam(neuron, "E_rev", E_rev, 2); - ngpu.SetNeuronParam(neuron, "tau_decay", tau_decay, 2); - ngpu.SetNeuronParam(neuron, "tau_rise", tau_rise, 2); - + float E_rev[] = { 0.0, -85.0 }; + float tau_decay[] = { 1.0, 1.0 }; + float tau_rise[] = { 1.0, 1.0 }; + ngpu.SetNeuronParam( neuron, "E_rev", E_rev, 2 ); + ngpu.SetNeuronParam( neuron, "tau_decay", tau_decay, 2 ); + ngpu.SetNeuronParam( neuron, "tau_rise", tau_rise, 2 ); + float mean_delay = 0.5; float std_delay = 0.25; float min_delay = 0.1; // Excitatory connections // connect excitatory neurons to port 0 of all neurons // normally distributed delays, weight Wex and CE connections per neuron - float *exc_delays = ngpu.RandomNormalClipped(CE*n_neurons, mean_delay, - std_delay, min_delay, - mean_delay+3*std_delay); - - ConnSpec conn_spec1(FIXED_INDEGREE, CE); + float* exc_delays = + ngpu.RandomNormalClipped( CE * n_neurons, mean_delay, std_delay, min_delay, mean_delay + 3 * std_delay ); + + ConnSpec conn_spec1( FIXED_INDEGREE, CE ); SynSpec syn_spec1; - syn_spec1.SetParam("receptor", 0); - syn_spec1.SetParam("weight", Wex); - syn_spec1.SetParam("delay_array", exc_delays); - ngpu.Connect(exc_neuron, neuron, conn_spec1, syn_spec1); + syn_spec1.SetParam( "receptor", 0 ); + syn_spec1.SetParam( "weight", Wex ); + syn_spec1.SetParam( "delay_array", exc_delays ); + ngpu.Connect( exc_neuron, neuron, conn_spec1, syn_spec1 ); delete[] exc_delays; // Inhibitory connections // connect inhibitory neurons to port 1 of all neurons // normally distributed delays, weight Win and CI connections per neuron - float *inh_delays = ngpu.RandomNormalClipped(CI*n_neurons, mean_delay, - std_delay, min_delay, - mean_delay+3*std_delay); + float* inh_delays = + ngpu.RandomNormalClipped( CI * n_neurons, mean_delay, std_delay, min_delay, mean_delay + 3 * std_delay ); - ConnSpec conn_spec2(FIXED_INDEGREE, CI); + ConnSpec conn_spec2( FIXED_INDEGREE, CI 
); SynSpec syn_spec2; - syn_spec2.SetParam("receptor", 1); - syn_spec2.SetParam("weight", Win); - syn_spec2.SetParam("delay_array", inh_delays); - ngpu.Connect(inh_neuron, neuron, conn_spec2, syn_spec2); + syn_spec2.SetParam( "receptor", 1 ); + syn_spec2.SetParam( "weight", Win ); + syn_spec2.SetParam( "delay_array", inh_delays ); + ngpu.Connect( inh_neuron, neuron, conn_spec2, syn_spec2 ); delete[] inh_delays; - ConnSpec conn_spec3(ALL_TO_ALL); - SynSpec syn_spec3(STANDARD_SYNAPSE, poiss_weight, poiss_delay, 0); + ConnSpec conn_spec3( ALL_TO_ALL ); + SynSpec syn_spec3( STANDARD_SYNAPSE, poiss_weight, poiss_delay, 0 ); // connect poisson generator to port 0 of all neurons - ngpu.Connect(pg, neuron, conn_spec3, syn_spec3); + ngpu.Connect( pg, neuron, conn_spec3, syn_spec3 ); char filename[] = "test_brunel_net.dat"; - int i_neuron_arr[] = {neuron[0], neuron[rand()%n_neurons], - neuron[n_neurons-1]}; // any set of neuron indexes + int i_neuron_arr[] = { + neuron[ 0 ], neuron[ rand() % n_neurons ], neuron[ n_neurons - 1 ] + }; // any set of neuron indexes // create multimeter record of V_m - std::string var_name_arr[] = {"V_m", "V_m", "V_m"}; - ngpu.CreateRecord(string(filename), var_name_arr, i_neuron_arr, 3); + std::string var_name_arr[] = { "V_m", "V_m", "V_m" }; + ngpu.CreateRecord( string( filename ), var_name_arr, i_neuron_arr, 3 ); ngpu.Simulate(); diff --git a/c++/examples/brunel_outdegree.cpp b/c++/examples/brunel_outdegree.cpp index 3c2daf5a6..843ab73ba 100644 --- a/c++/examples/brunel_outdegree.cpp +++ b/c++/examples/brunel_outdegree.cpp @@ -20,40 +20,38 @@ * */ - - - - -#include +#include "nestgpu.h" +#include #include +#include #include -#include -#include "nestgpu.h" using namespace std; -int main(int argc, char *argv[]) +int +main( int argc, char* argv[] ) { - if (argc != 2) { - cout << "Usage: " << argv[0] << " n_neurons\n"; + if ( argc != 2 ) + { + cout << "Usage: " << argv[ 0 ] << " n_neurons\n"; return 0; } int arg1; - sscanf(argv[1], "%d", &arg1); + 
sscanf( argv[ 1 ], "%d", &arg1 ); NESTGPU ngpu; cout << "Building ...\n"; - ngpu.SetRandomSeed(12345ULL); // seed for GPU random numbers - + ngpu.SetRandomSeed( 12345ULL ); // seed for GPU random numbers + int n_receptors = 2; - int order = arg1/5; + int order = arg1 / 5; int NE = 4 * order; // number of excitatory neurons int NI = 1 * order; // number of inhibitory neurons int n_neurons = NE + NI; // number of neurons in total int CPN = 1000; // number of output connections per neuron - + float Wex = 0.05; float Win = 0.35; @@ -63,73 +61,74 @@ int main(int argc, char *argv[]) float poiss_delay = 0.2; // poisson signal delay in ms // create poisson generator - NodeSeq pg = ngpu.Create("poisson_generator"); - ngpu.SetNeuronParam(pg, "rate", poiss_rate); + NodeSeq pg = ngpu.Create( "poisson_generator" ); + ngpu.SetNeuronParam( pg, "rate", poiss_rate ); // create n_neurons neurons with n_receptor receptor ports - NodeSeq neuron = ngpu.Create("aeif_cond_beta", n_neurons, - n_receptors); - NodeSeq exc_neuron = neuron.Subseq(0,NE-1); // excitatory neuron group - NodeSeq inh_neuron = neuron.Subseq(NE, n_neurons-1); //inhibitory neuron group + NodeSeq neuron = ngpu.Create( "aeif_cond_beta", n_neurons, n_receptors ); + NodeSeq exc_neuron = neuron.Subseq( 0, NE - 1 ); // excitatory neuron group + NodeSeq inh_neuron = neuron.Subseq( NE, n_neurons - 1 ); // inhibitory neuron group // neuron parameters - float E_rev[] = {0.0, -85.0}; - float tau_decay[] = {1.0, 1.0}; - float tau_rise[] = {1.0, 1.0}; - ngpu.SetNeuronParam(neuron, "E_rev", E_rev, 2); - ngpu.SetNeuronParam(neuron, "tau_decay", tau_decay, 2); - ngpu.SetNeuronParam(neuron, "tau_rise", tau_rise, 2); - + float E_rev[] = { 0.0, -85.0 }; + float tau_decay[] = { 1.0, 1.0 }; + float tau_rise[] = { 1.0, 1.0 }; + ngpu.SetNeuronParam( neuron, "E_rev", E_rev, 2 ); + ngpu.SetNeuronParam( neuron, "tau_decay", tau_decay, 2 ); + ngpu.SetNeuronParam( neuron, "tau_rise", tau_rise, 2 ); + float mean_delay = 0.5; float std_delay = 
0.25; float min_delay = 0.1; // Excitatory connections // connect excitatory neurons to port 0 of all neurons // normally distributed delays, weight Wex and CPN connections per neuron - float *exc_delays = ngpu.RandomNormalClipped(CPN*NE, mean_delay, - std_delay, min_delay, - mean_delay+3*std_delay); - - ConnSpec conn_spec1(FIXED_OUTDEGREE, CPN); + float* exc_delays = + ngpu.RandomNormalClipped( CPN * NE, mean_delay, std_delay, min_delay, mean_delay + 3 * std_delay ); + + ConnSpec conn_spec1( FIXED_OUTDEGREE, CPN ); SynSpec syn_spec1; - syn_spec1.SetParam("receptor", 0); - syn_spec1.SetParam("weight", Wex); - syn_spec1.SetParam("delay_array", exc_delays); - ngpu.Connect(exc_neuron, neuron, conn_spec1, syn_spec1); + syn_spec1.SetParam( "receptor", 0 ); + syn_spec1.SetParam( "weight", Wex ); + syn_spec1.SetParam( "delay_array", exc_delays ); + ngpu.Connect( exc_neuron, neuron, conn_spec1, syn_spec1 ); delete[] exc_delays; // Inhibitory connections // connect inhibitory neurons to port 1 of all neurons // normally distributed delays, weight Win and CPN connections per neuron - float *inh_delays = ngpu.RandomNormalClipped(CPN*NI, mean_delay, - std_delay, min_delay, - mean_delay+3*std_delay); + float* inh_delays = + ngpu.RandomNormalClipped( CPN * NI, mean_delay, std_delay, min_delay, mean_delay + 3 * std_delay ); - ConnSpec conn_spec2(FIXED_OUTDEGREE, CPN); + ConnSpec conn_spec2( FIXED_OUTDEGREE, CPN ); SynSpec syn_spec2; - syn_spec2.SetParam("receptor", 1); - syn_spec2.SetParam("weight", Win); - syn_spec2.SetParam("delay_array", inh_delays); - ngpu.Connect(inh_neuron, neuron, conn_spec2, syn_spec2); + syn_spec2.SetParam( "receptor", 1 ); + syn_spec2.SetParam( "weight", Win ); + syn_spec2.SetParam( "delay_array", inh_delays ); + ngpu.Connect( inh_neuron, neuron, conn_spec2, syn_spec2 ); delete[] inh_delays; - ConnSpec conn_spec3(ALL_TO_ALL); - SynSpec syn_spec3(STANDARD_SYNAPSE, poiss_weight, poiss_delay, 0); + ConnSpec conn_spec3( ALL_TO_ALL ); + SynSpec syn_spec3( 
STANDARD_SYNAPSE, poiss_weight, poiss_delay, 0 ); // connect poisson generator to port 0 of all neurons - ngpu.Connect(pg, neuron, conn_spec3, syn_spec3); + ngpu.Connect( pg, neuron, conn_spec3, syn_spec3 ); char filename[] = "test_brunel_outdegree.dat"; // any set of neuron indexes - int i_neuron_arr[] = {neuron[0], neuron[rand()%n_neurons], - neuron[rand()%n_neurons], neuron[rand()%n_neurons], - neuron[rand()%n_neurons], neuron[rand()%n_neurons], - neuron[rand()%n_neurons], neuron[rand()%n_neurons], - neuron[rand()%n_neurons], neuron[n_neurons-1]}; + int i_neuron_arr[] = { neuron[ 0 ], + neuron[ rand() % n_neurons ], + neuron[ rand() % n_neurons ], + neuron[ rand() % n_neurons ], + neuron[ rand() % n_neurons ], + neuron[ rand() % n_neurons ], + neuron[ rand() % n_neurons ], + neuron[ rand() % n_neurons ], + neuron[ rand() % n_neurons ], + neuron[ n_neurons - 1 ] }; // create multimeter record of V_m - std::string var_name_arr[] = {"V_m", "V_m", "V_m", "V_m", "V_m", "V_m", - "V_m", "V_m", "V_m", "V_m"}; - ngpu.CreateRecord(string(filename), var_name_arr, i_neuron_arr, 10); + std::string var_name_arr[] = { "V_m", "V_m", "V_m", "V_m", "V_m", "V_m", "V_m", "V_m", "V_m", "V_m" }; + ngpu.CreateRecord( string( filename ), var_name_arr, i_neuron_arr, 10 ); ngpu.Simulate(); diff --git a/c++/examples/brunel_outdegree_mpi.cpp b/c++/examples/brunel_outdegree_mpi.cpp index 38a8130dd..ca13bf222 100644 --- a/c++/examples/brunel_outdegree_mpi.cpp +++ b/c++/examples/brunel_outdegree_mpi.cpp @@ -20,55 +20,53 @@ * */ - - - - -#include +#include "nestgpu.h" +#include #include +#include #include -#include -#include "nestgpu.h" using namespace std; -int main(int argc, char *argv[]) +int +main( int argc, char* argv[] ) { NESTGPU ngpu; - ngpu.ConnectMpiInit(argc, argv); + ngpu.ConnectMpiInit( argc, argv ); int mpi_np = ngpu.MpiNp(); - if (argc != 2 || mpi_np != 2) { - cout << "Usage: mpirun -np 2 " << argv[0] << " n_neurons\n"; + if ( argc != 2 || mpi_np != 2 ) + { + cout << "Usage: 
mpirun -np 2 " << argv[ 0 ] << " n_neurons\n"; ngpu.MpiFinalize(); return 0; } int arg1; - sscanf(argv[1], "%d", &arg1); - + sscanf( argv[ 1 ], "%d", &arg1 ); + int mpi_id = ngpu.MpiId(); - cout << "Building on host " << mpi_id << " ..." < +#include "nestgpu.h" +#include #include +#include #include -#include -#include "nestgpu.h" using namespace std; -int main(int argc, char *argv[]) +int +main( int argc, char* argv[] ) { - if (argc != 2) { - cout << "Usage: " << argv[0] << " n_neurons\n"; + if ( argc != 2 ) + { + cout << "Usage: " << argv[ 0 ] << " n_neurons\n"; return 0; } int arg1; - sscanf(argv[1], "%d", &arg1); + sscanf( argv[ 1 ], "%d", &arg1 ); NESTGPU ngpu; cout << "Building ...\n"; - ngpu.SetRandomSeed(1234ULL); // seed for GPU random numbers - + ngpu.SetRandomSeed( 1234ULL ); // seed for GPU random numbers + int n_receptors = 2; - int order = arg1/5; + int order = arg1 / 5; int NE = 4 * order; // number of excitatory neurons int NI = 1 * order; // number of inhibitory neurons int n_neurons = NE + NI; // number of neurons in total - int CE = 800; // number of excitatory synapses per neuron - int CI = CE/4; // number of inhibitory synapses per neuron + int CE = 800; // number of excitatory synapses per neuron + int CI = CE / 4; // number of inhibitory synapses per neuron float Wex = 0.05; float Win = 0.35; @@ -64,72 +62,70 @@ int main(int argc, char *argv[]) float poiss_delay = 0.2; // poisson signal delay in ms // create poisson generator - NodeSeq pg = ngpu.Create("poisson_generator"); - ngpu.SetNeuronParam(pg, "rate", poiss_rate); - std::vector pg_vect = pg.ToVector(); + NodeSeq pg = ngpu.Create( "poisson_generator" ); + ngpu.SetNeuronParam( pg, "rate", poiss_rate ); + std::vector< int > pg_vect = pg.ToVector(); // create n_neurons neurons with n_receptor receptor ports - NodeSeq neuron = ngpu.Create("aeif_cond_beta", n_neurons, - n_receptors); - std::vector neuron_vect = neuron.ToVector(); - NodeSeq exc_neuron = neuron.Subseq(0,NE-1); // excitatory 
neuron group - std::vector exc_neuron_vect = exc_neuron.ToVector(); - NodeSeq inh_neuron = neuron.Subseq(NE, n_neurons-1); //inhibitory neuron group - std::vector inh_neuron_vect = inh_neuron.ToVector(); + NodeSeq neuron = ngpu.Create( "aeif_cond_beta", n_neurons, n_receptors ); + std::vector< int > neuron_vect = neuron.ToVector(); + NodeSeq exc_neuron = neuron.Subseq( 0, NE - 1 ); // excitatory neuron group + std::vector< int > exc_neuron_vect = exc_neuron.ToVector(); + NodeSeq inh_neuron = neuron.Subseq( NE, n_neurons - 1 ); // inhibitory neuron group + std::vector< int > inh_neuron_vect = inh_neuron.ToVector(); // neuron parameters - float E_rev[] = {0.0, -85.0}; - float tau_decay[] = {1.0, 1.0}; - float tau_rise[] = {1.0, 1.0}; - ngpu.SetNeuronParam(neuron_vect, "E_rev", E_rev, 2); - ngpu.SetNeuronParam(neuron_vect, "tau_decay", tau_decay, 2); - ngpu.SetNeuronParam(neuron_vect, "tau_rise", tau_rise, 2); - + float E_rev[] = { 0.0, -85.0 }; + float tau_decay[] = { 1.0, 1.0 }; + float tau_rise[] = { 1.0, 1.0 }; + ngpu.SetNeuronParam( neuron_vect, "E_rev", E_rev, 2 ); + ngpu.SetNeuronParam( neuron_vect, "tau_decay", tau_decay, 2 ); + ngpu.SetNeuronParam( neuron_vect, "tau_rise", tau_rise, 2 ); + float mean_delay = 0.5; float std_delay = 0.25; float min_delay = 0.1; // Excitatory connections // connect excitatory neurons to port 0 of all neurons // normally distributed delays, weight Wex and CE connections per neuron - float *exc_delays = ngpu.RandomNormalClipped(CE*n_neurons, mean_delay, - std_delay, min_delay, - mean_delay+3*std_delay); - - ConnSpec conn_spec1(FIXED_INDEGREE, CE); + float* exc_delays = + ngpu.RandomNormalClipped( CE * n_neurons, mean_delay, std_delay, min_delay, mean_delay + 3 * std_delay ); + + ConnSpec conn_spec1( FIXED_INDEGREE, CE ); SynSpec syn_spec1; - syn_spec1.SetParam("receptor", 0); - syn_spec1.SetParam("weight", Wex); - syn_spec1.SetParam("delay_array", exc_delays); - ngpu.Connect(exc_neuron_vect, neuron, conn_spec1, syn_spec1); + 
syn_spec1.SetParam( "receptor", 0 ); + syn_spec1.SetParam( "weight", Wex ); + syn_spec1.SetParam( "delay_array", exc_delays ); + ngpu.Connect( exc_neuron_vect, neuron, conn_spec1, syn_spec1 ); delete[] exc_delays; // Inhibitory connections // connect inhibitory neurons to port 1 of all neurons // normally distributed delays, weight Win and CI connections per neuron - float *inh_delays = ngpu.RandomNormalClipped(CI*n_neurons, mean_delay, - std_delay, min_delay, - mean_delay+3*std_delay); + float* inh_delays = + ngpu.RandomNormalClipped( CI * n_neurons, mean_delay, std_delay, min_delay, mean_delay + 3 * std_delay ); - ConnSpec conn_spec2(FIXED_INDEGREE, CI); + ConnSpec conn_spec2( FIXED_INDEGREE, CI ); SynSpec syn_spec2; - syn_spec2.SetParam("receptor", 1); - syn_spec2.SetParam("weight", Win); - syn_spec2.SetParam("delay_array", inh_delays); - ngpu.Connect(inh_neuron, neuron_vect, conn_spec2, syn_spec2); + syn_spec2.SetParam( "receptor", 1 ); + syn_spec2.SetParam( "weight", Win ); + syn_spec2.SetParam( "delay_array", inh_delays ); + ngpu.Connect( inh_neuron, neuron_vect, conn_spec2, syn_spec2 ); delete[] inh_delays; - ConnSpec conn_spec3(ALL_TO_ALL); - SynSpec syn_spec3(STANDARD_SYNAPSE, poiss_weight, poiss_delay, 0); + ConnSpec conn_spec3( ALL_TO_ALL ); + SynSpec syn_spec3( STANDARD_SYNAPSE, poiss_weight, poiss_delay, 0 ); // connect poisson generator to port 0 of all neurons - ngpu.Connect(pg_vect, neuron_vect, conn_spec3, syn_spec3); + ngpu.Connect( pg_vect, neuron_vect, conn_spec3, syn_spec3 ); char filename[] = "test_brunel_vect.dat"; - - int i_neuron_arr[] = {neuron[0], neuron[rand()%n_neurons], - neuron[n_neurons-1]}; // any set of neuron indexes + + int i_neuron_arr[] = { + neuron[ 0 ], neuron[ rand() % n_neurons ], neuron[ n_neurons - 1 ] + }; // any set of neuron indexes // create multimeter record of V_m - std::string var_name_arr[] = {"V_m", "V_m", "V_m"}; - ngpu.CreateRecord(string(filename), var_name_arr, i_neuron_arr, 3); + std::string var_name_arr[] = 
{ "V_m", "V_m", "V_m" }; + ngpu.CreateRecord( string( filename ), var_name_arr, i_neuron_arr, 3 ); ngpu.Simulate(); diff --git a/c++/examples/test_aeif_cond_beta.cpp b/c++/examples/test_aeif_cond_beta.cpp index b5e2937e8..3fdf77779 100644 --- a/c++/examples/test_aeif_cond_beta.cpp +++ b/c++/examples/test_aeif_cond_beta.cpp @@ -20,65 +20,63 @@ * */ - - - - -#include +#include "nestgpu.h" +#include #include +#include #include -#include -#include "nestgpu.h" using namespace std; -int main(int argc, char *argv[]) +int +main( int argc, char* argv[] ) { NESTGPU ngpu; cout << "Building ...\n"; - - srand(12345); + + srand( 12345 ); int n_neurons = 10000; - + // create n_neurons neurons with 3 receptor ports - NodeSeq neuron = ngpu.Create("aeif_cond_beta", n_neurons, 3); + NodeSeq neuron = ngpu.Create( "aeif_cond_beta", n_neurons, 3 ); // neuron parameters - float E_rev[] = {20.0, 0.0, -85.0}; - float tau_decay[] = {40.0, 20.0, 30.0}; - float tau_rise[] = {20.0, 10.0, 5.0}; - ngpu.SetNeuronParam(neuron, "E_rev", E_rev, 3); - ngpu.SetNeuronParam(neuron, "tau_decay", tau_decay, 3); - ngpu.SetNeuronParam(neuron, "tau_rise", tau_rise, 3); - ngpu.SetNeuronParam(neuron, "a", 4.0); - ngpu.SetNeuronParam(neuron, "b", 80.5); - ngpu.SetNeuronParam(neuron, "E_L", -70.6); - ngpu.SetNeuronParam(neuron, "g_L", 300.0); + float E_rev[] = { 20.0, 0.0, -85.0 }; + float tau_decay[] = { 40.0, 20.0, 30.0 }; + float tau_rise[] = { 20.0, 10.0, 5.0 }; + ngpu.SetNeuronParam( neuron, "E_rev", E_rev, 3 ); + ngpu.SetNeuronParam( neuron, "tau_decay", tau_decay, 3 ); + ngpu.SetNeuronParam( neuron, "tau_rise", tau_rise, 3 ); + ngpu.SetNeuronParam( neuron, "a", 4.0 ); + ngpu.SetNeuronParam( neuron, "b", 80.5 ); + ngpu.SetNeuronParam( neuron, "E_L", -70.6 ); + ngpu.SetNeuronParam( neuron, "g_L", 300.0 ); - NodeSeq sg = ngpu.Create("spike_generator"); // create spike generator + NodeSeq sg = ngpu.Create( "spike_generator" ); // create spike generator - float spike_times[] = {10.0, 400.0}; - float 
spike_heights[] = {1.0, 0.5}; + float spike_times[] = { 10.0, 400.0 }; + float spike_heights[] = { 1.0, 0.5 }; int n_spikes = 2; // set spike times and height - ngpu.SetNeuronParam(sg, "spike_times", spike_times, n_spikes); - ngpu.SetNeuronParam(sg, "spike_heights", spike_heights, n_spikes); - - float delay[] = {1.0, 100.0, 130.0}; - float weight[] = {0.1, 0.2, 0.5}; + ngpu.SetNeuronParam( sg, "spike_times", spike_times, n_spikes ); + ngpu.SetNeuronParam( sg, "spike_heights", spike_heights, n_spikes ); + + float delay[] = { 1.0, 100.0, 130.0 }; + float weight[] = { 0.1, 0.2, 0.5 }; - for (int i_port=0; i_port<3; i_port++) { - ConnSpec conn_spec(ALL_TO_ALL); - SynSpec syn_spec(STANDARD_SYNAPSE, weight[i_port], delay[i_port], i_port); - ngpu.Connect(sg, neuron, conn_spec, syn_spec); + for ( int i_port = 0; i_port < 3; i_port++ ) + { + ConnSpec conn_spec( ALL_TO_ALL ); + SynSpec syn_spec( STANDARD_SYNAPSE, weight[ i_port ], delay[ i_port ], i_port ); + ngpu.Connect( sg, neuron, conn_spec, syn_spec ); } string filename = "test_aeif_cond_beta.dat"; - int i_neuron[] = {neuron[rand()%n_neurons]}; // any set of neuron indexes - string var_name[] = {"V_m"}; + int i_neuron[] = { neuron[ rand() % n_neurons ] }; // any set of neuron indexes + string var_name[] = { "V_m" }; // create multimeter record of V_m - ngpu.CreateRecord(filename, var_name, i_neuron, 1); + ngpu.CreateRecord( filename, var_name, i_neuron, 1 ); - ngpu.Simulate(800.0); + ngpu.Simulate( 800.0 ); return 0; } diff --git a/c++/examples/test_connect.cpp b/c++/examples/test_connect.cpp index d0f229758..359789e83 100644 --- a/c++/examples/test_connect.cpp +++ b/c++/examples/test_connect.cpp @@ -20,71 +20,70 @@ * */ - - - - -#include #include +#include #include #include "nestgpu.h" -int main(int argc, char *argv[]) +int +main( int argc, char* argv[] ) { const int N = 5; - + NESTGPU ngpu; - NodeSeq neuron = ngpu.Create("aeif_cond_beta", 2*N); - std::vector neuron_even; - std::vector neuron_odd; - for (int i=0; i 
neuron_even; + std::vector< int > neuron_odd; + for ( int i = 0; i < N; i++ ) + { + neuron_even.push_back( neuron[ 2 * i ] ); + neuron_odd.push_back( neuron[ 2 * i + 1 ] ); } - float even_to_odd_delay[N*N]; - float even_to_odd_weight[N*N]; - float odd_to_even_delay[N*N]; - float odd_to_even_weight[N*N]; - for (int is=0; is conn_id - = ngpu.GetConnections(neuron_even, neuron); - std::vector conn_stat_vect - = ngpu.GetConnectionStatus(conn_id); + std::vector< ConnectionId > conn_id = ngpu.GetConnections( neuron_even, neuron ); + std::vector< ConnectionStatus > conn_stat_vect = ngpu.GetConnectionStatus( conn_id ); std::cout << "########################################\n"; std::cout << "Even to all\n"; - for (unsigned int i=0; i +#include "nestgpu.h" +#include #include +#include #include -#include -#include "nestgpu.h" using namespace std; -int main(int argc, char *argv[]) +int +main( int argc, char* argv[] ) { NESTGPU ngpu; cout << "Building ...\n"; - - srand(12345); + + srand( 12345 ); int n_neurons = 10000; - + // create n_neurons neurons with 1 receptor ports - NodeSeq neuron = ngpu.Create("aeif_cond_beta", n_neurons, 1); + NodeSeq neuron = ngpu.Create( "aeif_cond_beta", n_neurons, 1 ); // neuron parameters - ngpu.SetNeuronParam(neuron, "a", 4.0); - ngpu.SetNeuronParam(neuron, "b", 80.5); - ngpu.SetNeuronParam(neuron, "E_L", -70.6); - ngpu.SetNeuronParam(neuron, "I_e", 800.0); + ngpu.SetNeuronParam( neuron, "a", 4.0 ); + ngpu.SetNeuronParam( neuron, "b", 80.5 ); + ngpu.SetNeuronParam( neuron, "E_L", -70.6 ); + ngpu.SetNeuronParam( neuron, "I_e", 800.0 ); string filename = "test_constcurr.dat"; - int i_neurons[] = {neuron[rand()%n_neurons]}; // any set of neuron indexes - string var_name[] = {"V_m"}; + int i_neurons[] = { neuron[ rand() % n_neurons ] }; // any set of neuron indexes + string var_name[] = { "V_m" }; // create multimeter record of V_m - ngpu.CreateRecord(filename, var_name, i_neurons, 1); + ngpu.CreateRecord( filename, var_name, i_neurons, 1 ); 
ngpu.Simulate(); diff --git a/c++/examples/test_error.cpp b/c++/examples/test_error.cpp index 32e2e9d5d..29ff5ba58 100644 --- a/c++/examples/test_error.cpp +++ b/c++/examples/test_error.cpp @@ -20,119 +20,117 @@ * */ - - - - -#include -#include -#include -#include #include "nestgpu.h" #include "ngpu_exception.h" +#include +#include +#include +#include using namespace std; -int main(int argc, char *argv[]) +int +main( int argc, char* argv[] ) { - BEGIN_TRY { - if (argc != 2) { - cout << "Usage: " << argv[0] << " n_neurons\n"; + BEGIN_TRY + { + if ( argc != 2 ) + { + cout << "Usage: " << argv[ 0 ] << " n_neurons\n"; + return 0; + } + int arg1; + sscanf( argv[ 1 ], "%d", &arg1 ); + NESTGPU ngpu; + cout << "Building ...\n"; + + ngpu.SetRandomSeed( 1234ULL ); // seed for GPU random numbers + + int n_receptors = 2; + + int order = arg1 / 5; + int NE = 4 * order; // number of excitatory neurons + int NI = 1 * order; // number of inhibitory neurons + int n_neurons = NE + NI; // number of neurons in total + + int CE = 800; // number of excitatory synapses per neuron + int CI = CE / 4; // number of inhibitory synapses per neuron + + float Wex = 0.05; + float Win = 0.35; + + // poisson generator parameters + float poiss_rate = 20000.0; // poisson signal rate in Hz + float poiss_weight = 0.37; + float poiss_delay = 0.2; // poisson signal delay in ms + int n_pg = n_neurons; // number of poisson generators + // create poisson generator + NodeSeq pg = ngpu.CreatePoissonGenerator( n_pg, poiss_rate ); + + // create n_neurons neurons with n_receptor receptor ports + NodeSeq neuron = ngpu.Create( "aeif_cond_beta", n_neurons, n_receptors ); + + NodeSeq exc_neuron = neuron.Subseq( 0, NE - 1 ); // excitatory neuron group + NodeSeq inh_neuron = neuron.Subseq( NE, n_neurons - 1 ); // inhibitory neuron group + + // neuron parameters + float E_rev[] = { 0.0, -85.0 }; + float tau_decay[] = { 1.0, 1.0 }; + float tau_rise[] = { 1.0, 1.0 }; + + ngpu.SetNeuronParam( neuron, "Non-existent", 
E_rev, 2 ); + ngpu.SetNeuronParam( neuron, "tau_decay", tau_decay, 2 ); + ngpu.SetNeuronParam( neuron, "tau_rise", tau_rise, 2 ); + + float mean_delay = 0.5; + float std_delay = 0.25; + float min_delay = 0.1; + // Excitatory connections + // connect excitatory neurons to port 0 of all neurons + // normally distributed delays, weight Wex and CE connections per neuron + float* exc_delays = + ngpu.RandomNormalClipped( CE * n_neurons, mean_delay, std_delay, min_delay, mean_delay + 3 * std_delay ); + + ConnSpec conn_spec1( FIXED_INDEGREE, CE ); + SynSpec syn_spec1; + syn_spec1.SetParam( "receptor", 0 ); + syn_spec1.SetParam( "weight", Wex ); + syn_spec1.SetParam( "delay_array", exc_delays ); + ngpu.Connect( exc_neuron, neuron, conn_spec1, syn_spec1 ); + delete[] exc_delays; + + // Inhibitory connections + // connect inhibitory neurons to port 1 of all neurons + // normally distributed delays, weight Win and CI connections per neuron + float* inh_delays = + ngpu.RandomNormalClipped( CI * n_neurons, mean_delay, std_delay, min_delay, mean_delay + 3 * std_delay ); + + ConnSpec conn_spec2( FIXED_INDEGREE, CI ); + SynSpec syn_spec2; + syn_spec2.SetParam( "receptor", 1 ); + syn_spec2.SetParam( "weight", Win ); + syn_spec2.SetParam( "delay_array", inh_delays ); + ngpu.Connect( inh_neuron, neuron, conn_spec2, syn_spec2 ); + + delete[] inh_delays; + + ConnSpec conn_spec3( ONE_TO_ONE ); + SynSpec syn_spec3( STANDARD_SYNAPSE, poiss_weight, poiss_delay, 0 ); + // connect poisson generator to port 0 of all neurons + ngpu.Connect( pg, neuron, conn_spec3, syn_spec3 ); + + char filename[] = "test_brunel_net.dat"; + int i_neuron_arr[] = { + neuron[ 0 ], neuron[ rand() % n_neurons ], neuron[ n_neurons - 1 ] + }; // any set of neuron indexes + // create multimeter record of V_m + std::string var_name_arr[] = { "V_m", "V_m", "V_m" }; + ngpu.CreateRecord( string( filename ), var_name_arr, i_neuron_arr, 3 ); + + ngpu.Simulate(); + return 0; } - int arg1; - sscanf(argv[1], "%d", &arg1); - 
NESTGPU ngpu; - cout << "Building ...\n"; - - ngpu.SetRandomSeed(1234ULL); // seed for GPU random numbers - - int n_receptors = 2; - - int order = arg1/5; - int NE = 4 * order; // number of excitatory neurons - int NI = 1 * order; // number of inhibitory neurons - int n_neurons = NE + NI; // number of neurons in total - - int CE = 800; // number of excitatory synapses per neuron - int CI = CE/4; // number of inhibitory synapses per neuron - - float Wex = 0.05; - float Win = 0.35; - - // poisson generator parameters - float poiss_rate = 20000.0; // poisson signal rate in Hz - float poiss_weight = 0.37; - float poiss_delay = 0.2; // poisson signal delay in ms - int n_pg = n_neurons; // number of poisson generators - // create poisson generator - NodeSeq pg = ngpu.CreatePoissonGenerator(n_pg, poiss_rate); - - // create n_neurons neurons with n_receptor receptor ports - NodeSeq neuron = ngpu.Create("aeif_cond_beta", n_neurons, - n_receptors); - - NodeSeq exc_neuron = neuron.Subseq(0,NE-1); // excitatory neuron group - NodeSeq inh_neuron = neuron.Subseq(NE, n_neurons-1); //inhibitory neuron group - - // neuron parameters - float E_rev[] = {0.0, -85.0}; - float tau_decay[] = {1.0, 1.0}; - float tau_rise[] = {1.0, 1.0}; - - ngpu.SetNeuronParam(neuron, "Non-existent", E_rev, 2); - ngpu.SetNeuronParam(neuron, "tau_decay", tau_decay, 2); - ngpu.SetNeuronParam(neuron, "tau_rise", tau_rise, 2); - - float mean_delay = 0.5; - float std_delay = 0.25; - float min_delay = 0.1; - // Excitatory connections - // connect excitatory neurons to port 0 of all neurons - // normally distributed delays, weight Wex and CE connections per neuron - float *exc_delays = ngpu.RandomNormalClipped(CE*n_neurons, mean_delay, - std_delay, min_delay, - mean_delay+3*std_delay); - - ConnSpec conn_spec1(FIXED_INDEGREE, CE); - SynSpec syn_spec1; - syn_spec1.SetParam("receptor", 0); - syn_spec1.SetParam("weight", Wex); - syn_spec1.SetParam("delay_array", exc_delays); - ngpu.Connect(exc_neuron, neuron, 
conn_spec1, syn_spec1); - delete[] exc_delays; - - // Inhibitory connections - // connect inhibitory neurons to port 1 of all neurons - // normally distributed delays, weight Win and CI connections per neuron - float *inh_delays = ngpu.RandomNormalClipped(CI*n_neurons, mean_delay, - std_delay, min_delay, - mean_delay+3*std_delay); - - ConnSpec conn_spec2(FIXED_INDEGREE, CI); - SynSpec syn_spec2; - syn_spec2.SetParam("receptor", 1); - syn_spec2.SetParam("weight", Win); - syn_spec2.SetParam("delay_array", inh_delays); - ngpu.Connect(inh_neuron, neuron, conn_spec2, syn_spec2); - - delete[] inh_delays; - - ConnSpec conn_spec3(ONE_TO_ONE); - SynSpec syn_spec3(STANDARD_SYNAPSE, poiss_weight, poiss_delay, 0); - // connect poisson generator to port 0 of all neurons - ngpu.Connect(pg, neuron, conn_spec3, syn_spec3); - - char filename[] = "test_brunel_net.dat"; - int i_neuron_arr[] = {neuron[0], neuron[rand()%n_neurons], - neuron[n_neurons-1]}; // any set of neuron indexes - // create multimeter record of V_m - std::string var_name_arr[] = {"V_m", "V_m", "V_m"}; - ngpu.CreateRecord(string(filename), var_name_arr, i_neuron_arr, 3); - - ngpu.Simulate(); - - return 0; - } END_TRY + END_TRY return -1; } diff --git a/c++/examples/test_setvar.cpp b/c++/examples/test_setvar.cpp index f161276c0..18a808c99 100644 --- a/c++/examples/test_setvar.cpp +++ b/c++/examples/test_setvar.cpp @@ -20,69 +20,68 @@ * */ - - - - -#include +#include "nestgpu.h" +#include #include +#include #include -#include -#include "nestgpu.h" using namespace std; -int main(int argc, char *argv[]) +int +main( int argc, char* argv[] ) { NESTGPU ngpu; cout << "Building ...\n"; - - srand(12345); + + srand( 12345 ); int n_neurons = 3; - + // create n_neurons neurons with 2 receptor ports - NodeSeq neuron = ngpu.Create("aeif_cond_beta", n_neurons, 2); - float tau_decay[] = {60.0, 10.0}; - float tau_rise[] = {40.0, 5.0}; - ngpu.SetNeuronParam(neuron, "tau_decay", tau_decay, 2); - ngpu.SetNeuronParam(neuron, "tau_rise", 
tau_rise, 2); - - NodeSeq neuron0 = neuron.Subseq(0,0); - NodeSeq neuron1 = neuron.Subseq(1,1); - NodeSeq neuron2 = neuron.Subseq(2,2); - float g11[] = {0.0, 0.1}; - float g12[] = {0.1, 0.0}; - + NodeSeq neuron = ngpu.Create( "aeif_cond_beta", n_neurons, 2 ); + float tau_decay[] = { 60.0, 10.0 }; + float tau_rise[] = { 40.0, 5.0 }; + ngpu.SetNeuronParam( neuron, "tau_decay", tau_decay, 2 ); + ngpu.SetNeuronParam( neuron, "tau_rise", tau_rise, 2 ); + + NodeSeq neuron0 = neuron.Subseq( 0, 0 ); + NodeSeq neuron1 = neuron.Subseq( 1, 1 ); + NodeSeq neuron2 = neuron.Subseq( 2, 2 ); + float g11[] = { 0.0, 0.1 }; + float g12[] = { 0.1, 0.0 }; + // neuron variables - ngpu.SetNeuronVar(neuron0, "V_m", -80.0); - ngpu.SetNeuronVar(neuron1, "g1", g11, 2); - ngpu.SetNeuronVar(neuron2, "g1", g12, 2); + ngpu.SetNeuronVar( neuron0, "V_m", -80.0 ); + ngpu.SetNeuronVar( neuron1, "g1", g11, 2 ); + ngpu.SetNeuronVar( neuron2, "g1", g12, 2 ); // reading parameters and variables test - float *read_td = ngpu.GetNeuronParam(neuron, "tau_decay"); - float *read_tr = ngpu.GetNeuronParam(neuron, "tau_rise"); - float *read_Vm = ngpu.GetNeuronVar(neuron, "V_m"); - float *read_Vth = ngpu.GetNeuronParam(neuron, "V_th"); - float *read_g1 = ngpu.GetNeuronVar(neuron, "g1"); + float* read_td = ngpu.GetNeuronParam( neuron, "tau_decay" ); + float* read_tr = ngpu.GetNeuronParam( neuron, "tau_rise" ); + float* read_Vm = ngpu.GetNeuronVar( neuron, "V_m" ); + float* read_Vth = ngpu.GetNeuronParam( neuron, "V_th" ); + float* read_g1 = ngpu.GetNeuronVar( neuron, "g1" ); - for (int in=0; in<3; in++) { - printf("Neuron n. %d\n", in); - printf("\tV_m: %f\n", read_Vm[in]); - printf("\tV_th: %f\n", read_Vth[in]); - for (int ip=0; ip<2; ip++) { - printf("\tg1: %f\n", read_g1[in*2+ip]); - printf("\ttau_rise: %f\n", read_tr[in*2+ip]); - printf("\ttau_decay: %f\n", read_td[in*2+ip]); + for ( int in = 0; in < 3; in++ ) + { + printf( "Neuron n. 
%d\n", in ); + printf( "\tV_m: %f\n", read_Vm[ in ] ); + printf( "\tV_th: %f\n", read_Vth[ in ] ); + for ( int ip = 0; ip < 2; ip++ ) + { + printf( "\tg1: %f\n", read_g1[ in * 2 + ip ] ); + printf( "\ttau_rise: %f\n", read_tr[ in * 2 + ip ] ); + printf( "\ttau_decay: %f\n", read_td[ in * 2 + ip ] ); } - printf("\n"); + printf( "\n" ); } string filename = "test_setvar.dat"; - int i_neurons[] = {neuron[0], neuron[1], neuron[2]}; - string var_name[] = {"V_m", "V_m", "V_m"}; + int i_neurons[] = { neuron[ 0 ], neuron[ 1 ], neuron[ 2 ] }; + string var_name[] = { "V_m", "V_m", "V_m" }; // create multimeter record of V_m - ngpu.CreateRecord(filename, var_name, i_neurons, 3); + ngpu.CreateRecord( filename, var_name, i_neurons, 3 ); ngpu.Simulate(); diff --git a/c++/tests/test_connections.cpp b/c++/tests/test_connections.cpp index 018b1fe11..e69de29bb 100644 --- a/c++/tests/test_connections.cpp +++ b/c++/tests/test_connections.cpp @@ -1,145 +0,0 @@ -/* - * test_connections.cpp - * - * This file is part of NEST GPU. - * - * Copyright (C) 2021 The NEST Initiative - * - * NEST GPU is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or - * (at your option) any later version. - * - * NEST GPU is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with NEST GPU. If not, see . 
- * - */ - - - - - -#include -#include -#include -#include -#include -#include "nestgpu.h" - -using namespace std; - -int main(int argc, char *argv[]) -{ - // Intializes C random number generator - // srand((unsigned) time(&t)); - - NESTGPU ngpu; - cout << "Building ...\n"; - - ngpu.SetRandomSeed(1234ULL); // seed for GPU random numbers - - // poisson generator parameters - float poiss_rate = 5000.0; // poisson signal rate in Hz - float poiss_weight = 1.0; - float poiss_delay = 0.2; // poisson signal delay in ms - - // create poisson generator - NodeSeq pg = ngpu.Create("poisson_generator"); - ngpu.SetNeuronParam(pg, "rate", poiss_rate); - - int n_recept = 3; // number of receptors - // create 3 neuron groups - int n_neur1 = 100; // number of neurons - int n_neur2 = 20; - int n_neur3 = 50; - int n_neurons = n_neur1 + n_neur2 + n_neur3; - - NodeSeq neur_group = ngpu.Create("aeif_cond_beta", n_neurons, n_recept); - NodeSeq neur_group1 = neur_group.Subseq(0, n_neur1 - 1); - NodeSeq neur_group2 = neur_group.Subseq(n_neur1, n_neur1 + n_neur2 - 1); - NodeSeq neur_group3 = neur_group.Subseq(n_neur1 + n_neur2, n_neurons - 1); - - // neuron parameters - float E_rev[] = {0.0, 0.0, 0.0}; - float tau_decay[] = {1.0, 1.0, 1.0}; - float tau_rise[] = {1.0, 1.0, 1.0}; - ngpu.SetNeuronParam(neur_group1, "E_rev", E_rev, 3); - ngpu.SetNeuronParam(neur_group1, "tau_decay", tau_decay, 3); - ngpu.SetNeuronParam(neur_group1, "tau_rise", tau_rise, 3); - ngpu.SetNeuronParam(neur_group2, "E_rev", E_rev, 3); - ngpu.SetNeuronParam(neur_group2, "tau_decay", tau_decay, 3); - ngpu.SetNeuronParam(neur_group2, "tau_rise", tau_rise, 3); - ngpu.SetNeuronParam(neur_group3, "E_rev", E_rev, 3); - ngpu.SetNeuronParam(neur_group3, "tau_decay", tau_decay, 3); - ngpu.SetNeuronParam(neur_group3, "tau_rise", tau_rise, 3); - - int i11 = neur_group1[rand()%n_neur1]; - int i12 = neur_group2[rand()%n_neur2]; - int i13 = neur_group2[rand()%n_neur2]; - int i14 = neur_group3[rand()%n_neur3]; - - int i21 = 
neur_group2[rand()%n_neur2]; - - int i31 = neur_group1[rand()%n_neur1]; - int i32 = neur_group3[rand()%n_neur3]; - - int it1 = neur_group1[rand()%n_neur1]; - int it2 = neur_group2[rand()%n_neur2]; - int it3 = neur_group3[rand()%n_neur3]; - - // connect poisson generator to port 0 of all neurons - ngpu.Connect(pg[0], i11, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i12, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i13, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i14, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i21, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i31, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i32, 0, 0, poiss_weight, poiss_delay); - - float weight = 0.01; // connection weight - float delay = 0.2; // connection delay in ms - - // connect neurons to target neuron n. 1 - ngpu.Connect(i11, it1, 0, 0, weight, delay); - ngpu.Connect(i12, it1, 1, 0, weight, delay); - ngpu.Connect(i13, it1, 1, 0, weight, delay); - ngpu.Connect(i14, it1, 2, 0, weight, delay); - - // connect neuron to target neuron n. 2 - ngpu.Connect(i21, it2, 0, 0, weight, delay); - - // connect neurons to target neuron n. 
3 - ngpu.Connect(i31, it3, 0, 0, weight, delay); - ngpu.Connect(i32, it3, 1, 0, weight, delay); - - // create multimeter record n.1 - string filename1 = "test_connections_voltage.dat"; - int i_neuron_arr1[] = {i11, i12, i13, i14, i21, i31, i32, it1, it2, it3}; - std::string var_name_arr1[] = {"V_m", "V_m", "V_m", "V_m", "V_m", "V_m", - "V_m", "V_m", "V_m", "V_m"}; - ngpu.CreateRecord(filename1, var_name_arr1, i_neuron_arr1, 10); - - // create multimeter record n.2 - string filename2 = "test_connections_g1.dat"; - int i_neuron_arr2[] = {it1, it1, it1, it2, it3, it3}; - int i_receptor_arr[] = {0, 1, 2, 0, 0, 1}; - std::string var_name_arr2[] = {"g1", "g1", "g1", "g1", "g1", "g1"}; - ngpu.CreateRecord(filename2, var_name_arr2, i_neuron_arr2, - i_receptor_arr, 6); - - // create multimeter record n.3 - string filename3 = "test_connections_spikes.dat"; - int i_neuron_arr3[] = {i11, i12, i13, i14, i21, i31, i32}; - std::string var_name_arr3[] = {"spike", "spike", "spike", "spike", "spike", - "spike", "spike"}; - ngpu.CreateRecord(filename3, var_name_arr3, i_neuron_arr3, 7); - - ngpu.Simulate(); - - return 0; -} diff --git a/c++/tests/test_neuron_groups.cpp b/c++/tests/test_neuron_groups.cpp index 5a824105b..e69de29bb 100644 --- a/c++/tests/test_neuron_groups.cpp +++ b/c++/tests/test_neuron_groups.cpp @@ -1,162 +0,0 @@ -/* - * test_neuron_groups.cpp - * - * This file is part of NEST GPU. - * - * Copyright (C) 2021 The NEST Initiative - * - * NEST GPU is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or - * (at your option) any later version. - * - * NEST GPU is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with NEST GPU. If not, see . - * - */ - - - - - -#include -#include -#include -#include -#include -#include -#include "nestgpu.h" - -using namespace std; - -int main(int argc, char *argv[]) -{ - // Intializes C random number generator - // srand((unsigned) time(&t)); - - NESTGPU ngpu; - cout << "Building ...\n"; - - ngpu.SetRandomSeed(1234ULL); // seed for GPU random numbers - - // poisson generator parameters - float poiss_rate = 5000.0; // poisson signal rate in Hz - float poiss_weight = 1.0; - float poiss_delay = 0.2; // poisson signal delay in ms - - // create poisson generator - NodeSeq pg = ngpu.Create("poisson_generator"); - ngpu.SetNeuronParam(pg, "rate", poiss_rate); - - // create 3 neuron groups - int n_neur1 = 100; // number of neurons - int n_recept1 = 3; // number of receptors - NodeSeq neur_group1 = ngpu.Create("aeif_cond_beta", n_neur1, n_recept1); - int n_neur2 = 20; // number of neurons - int n_recept2 = 1; // number of receptors - NodeSeq neur_group2 = ngpu.Create("aeif_cond_beta", n_neur2, n_recept2); - int n_neur3 = 50; // number of neurons - int n_recept3 = 2; // number of receptors - NodeSeq neur_group3 = ngpu.Create("aeif_cond_beta", n_neur3, n_recept3); - - // neuron parameters - float E_rev[] = {0.0, 0.0, 0.0}; - float tau_decay[] = {1.0, 1.0, 1.0}; - float tau_rise[] = {1.0, 1.0, 1.0}; - ngpu.SetNeuronParam(neur_group1, "E_rev", E_rev, 3); - ngpu.SetNeuronParam(neur_group1, "tau_decay", tau_decay, 3); - ngpu.SetNeuronParam(neur_group1, "tau_rise", tau_rise, 3); - ngpu.SetNeuronParam(neur_group2, "E_rev", E_rev, 1); - ngpu.SetNeuronParam(neur_group2, "tau_decay", tau_decay, 1); - ngpu.SetNeuronParam(neur_group2, "tau_rise", tau_rise, 1); - ngpu.SetNeuronParam(neur_group3, "E_rev", E_rev, 2); - ngpu.SetNeuronParam(neur_group3, "tau_decay", tau_decay, 2); - ngpu.SetNeuronParam(neur_group3, "tau_rise", tau_rise, 2); - - int i11 = neur_group1[rand()%n_neur1]; - 
int i12 = neur_group2[rand()%n_neur2]; - int i13 = neur_group2[rand()%n_neur2]; - int i14 = neur_group3[rand()%n_neur3]; - - int i21 = neur_group2[rand()%n_neur2]; - - int i31 = neur_group1[rand()%n_neur1]; - int i32 = neur_group3[rand()%n_neur3]; - - int it1 = neur_group1[rand()%n_neur1]; - int it2 = neur_group2[rand()%n_neur2]; - int it3 = neur_group3[rand()%n_neur3]; - - // connect poisson generator to port 0 of all neurons - ngpu.Connect(pg[0], i11, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i12, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i13, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i14, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i21, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i31, 0, 0, poiss_weight, poiss_delay); - ngpu.Connect(pg[0], i32, 0, 0, poiss_weight, poiss_delay); - - float weight = 0.01; // connection weight - float delay = 0.2; // connection delay in ms - - // connect neurons to target neuron n. 1 - ngpu.Connect(i11, it1, 0, 0, weight, delay); - ngpu.Connect(i12, it1, 1, 0, weight, delay); - ngpu.Connect(i13, it1, 1, 0, weight, delay); - ngpu.Connect(i14, it1, 2, 0, weight, delay); - - // connect neuron to target neuron n. 2 - ngpu.Connect(i21, it2, 0, 0, weight, delay); - - // connect neurons to target neuron n. 
3 - ngpu.Connect(i31, it3, 0, 0, weight, delay); - ngpu.Connect(i32, it3, 1, 0, weight, delay); - - // create multimeter record n.1 - string filename1 = "test_neuron_groups_voltage.dat"; - int i_neuron_arr1[] = {i11, i12, i13, i14, i21, i31, i32, it1, it2, it3}; - string var_name_arr1[] = {"V_m", "V_m", "V_m", "V_m", "V_m", "V_m", - "V_m", "V_m", "V_m", "V_m"}; - int record1 = ngpu.CreateRecord(filename1, var_name_arr1, - i_neuron_arr1, 10); - - // create multimeter record n.2 - string filename2 = "test_neuron_groups_g1.dat"; - int i_neuron_arr2[] = {it1, it1, it1, it2, it3, it3}; - int i_receptor_arr[] = {0, 1, 2, 0, 0, 1}; - string var_name_arr2[] = {"g1", "g1", "g1", "g1", "g1", "g1"}; - //int record2 = - ngpu.CreateRecord(filename2, var_name_arr2, - i_neuron_arr2, i_receptor_arr, 6); - - // create multimeter record n.3 - string filename3 = "test_neuron_groups_spikes.dat"; - int i_neuron_arr3[] = {i11, i12, i13, i14, i21, i31, i32}; - string var_name_arr3[] = {"spike", "spike", "spike", "spike", "spike", - "spike", "spike"}; - //int record3 = - ngpu.CreateRecord(filename3, var_name_arr3, - i_neuron_arr3, 7); - - ngpu.Simulate(); - - std::vector> data_vect1 = - *ngpu.GetRecordData(record1); - - FILE *fp=fopen("test_neuron_group_record.dat", "w"); - for (uint i=0; i vect = data_vect1[i]; - for (uint j=0; jfull_test.dat +:>full_req_mem.dat +:>full_out_of_mem.dat + +for T in 0 1; do + for P in $(cat n_mpi_list.txt); do + for N in $(cat n_neuron_list.txt); do + R=0 + id=P$P-N$N-R$R + for iP in $(seq 0 $(($P-1))); do + :> test_$iP.dat + :> req_mem_$iP.dat + done + mpirun -np $P python3 test.py --N=$N --R=$R --T=$T |& tee log_$id.txt + for iP in $(seq 0 $(($P-1))); do + cat test_$iP.dat >> full_test.dat + cat req_mem_$iP.dat >> full_req_mem.dat + done + l=$(cat test_0.dat | wc -l) + if [ $l -eq 0 ]; then + cat req_mem_0.dat >> full_out_of_mem.dat + fi + for C in $(cat n_conn_list.txt); do + for R in 1 2 3; do + id=P$P-N$N-C$C-R$R + for iP in $(seq 0 $(($P-1))); do + :> 
test_$iP.dat + :> req_mem_$iP.dat + done + mpirun -np $P python3 test.py --N=$N --C=$C --R=$R --T=$T |& tee log_$id.txt + for iP in $(seq 0 $(($P-1))); do + cat test_$iP.dat >> full_test.dat + cat req_mem_$iP.dat >> full_req_mem.dat + done + l=$(cat test_0.dat | wc -l) + if [ $l -eq 0 ]; then + cat req_mem_0.dat >> full_out_of_mem.dat + fi + done + done + + R=4 + id=P$P-N$N-R$R + for iP in $(seq 0 $(($P-1))); do + :> test_$iP.dat + :> req_mem_$iP.dat + done + mpirun -np $P python3 test.py --N=$N --R=$R --T=$T |& tee log_$id.txt + for iP in $(seq 0 $(($P-1))); do + cat test_$iP.dat >> full_test.dat + cat req_mem_$iP.dat >> full_req_mem.dat + done + l=$(cat test_0.dat | wc -l) + if [ $l -eq 0 ]; then + cat req_mem_0.dat >> full_out_of_mem.dat + fi + done + done +done + +./summary.sh | tee summary.txt diff --git a/python/mpi_mem_check/n_conn_list.txt b/python/mpi_mem_check/n_conn_list.txt new file mode 100644 index 000000000..7262e0bf9 --- /dev/null +++ b/python/mpi_mem_check/n_conn_list.txt @@ -0,0 +1 @@ +1 10 100 1000 10000 diff --git a/python/mpi_mem_check/n_mpi_list.txt b/python/mpi_mem_check/n_mpi_list.txt new file mode 100644 index 000000000..78d399bc3 --- /dev/null +++ b/python/mpi_mem_check/n_mpi_list.txt @@ -0,0 +1 @@ +2 3 4 5 6 diff --git a/python/mpi_mem_check/n_neuron_list.txt b/python/mpi_mem_check/n_neuron_list.txt new file mode 100644 index 000000000..78fa7cf7c --- /dev/null +++ b/python/mpi_mem_check/n_neuron_list.txt @@ -0,0 +1 @@ +1 10 100 1000 10000 diff --git a/python/mpi_mem_check/run_terminal.sh b/python/mpi_mem_check/run_terminal.sh new file mode 100644 index 000000000..f12529e4b --- /dev/null +++ b/python/mpi_mem_check/run_terminal.sh @@ -0,0 +1,8 @@ +#!/bin/bash -x + +NP=2 +if [ ! 
-z $1 ]; then + NP=$1 +fi + +mpirun -np $NP python3 test.py |& tee log.txt diff --git a/python/mpi_mem_check/summary.sh b/python/mpi_mem_check/summary.sh new file mode 100755 index 000000000..9b337025a --- /dev/null +++ b/python/mpi_mem_check/summary.sh @@ -0,0 +1,66 @@ +#evaluates total number of MPI processes launched by benchmark_terminal script +n_mpi=$(cat n_mpi_list.txt | head -1 | sed 's/^ *//;s/ *$//;s/ */+/g' | bc -l) +n_loop_neur=$(cat n_neuron_list.txt | head -1 | awk '{print NF}') +n_loop_conn=$(cat n_conn_list.txt | head -1 | awk '{print NF}') +Ntot_th=$(( 2 * ( $n_mpi * $n_loop_neur * ( 2 + 3 * $n_loop_conn ) ) )) # 3400 + +cat full_req_mem.dat | awk '{print $1, $2, $3, $4, $5}' | sort -n > list_all.dat +Ntot_proc=$(cat list_all.dat | wc -l) +echo "$Ntot_proc MPI processes out of $Ntot_th expected have been processed" +if [ $Ntot_proc -lt $Ntot_th ]; then + echo "Error: not all expected MPI processes have been processed" +elif [ $Ntot_proc -gt $Ntot_th ]; then + echo "Error: number of processed MPI processes is larger than expected" +fi + +N_complete=$(cat full_test.dat | wc -l) +echo "$N_complete MPI processes have been completed" +N_passed=$(cat full_test.dat | awk '{print $10}' | grep '^1' | wc -l) +echo "$N_passed MPI processes out of $N_complete completed have GPU memory usage in the predicted range" +N_notpassed=$(($N_complete - $N_passed)) +if [ $N_notpassed -ne 0 ]; then + cat full_test.dat | awk '{print $10}' | grep '^0' + echo "$N_notpassed MPI processes out of $N_complete completed do not have GPU memory usage in the predicted range" + echo "TEST NOT PASSED" + exit 1 +fi + +cat full_test.dat | awk '{print $1, $2, $3, $4, $5}' | sort -n > list_complete.dat +diff list_complete.dat list_all.dat | grep '>' | awk '{print $2, $4, $5, $6}' | sort -n | uniq > list_not_complete.dat +diff list_complete.dat list_all.dat | grep '>' | awk '{print $2, $3, $4, $5, $6}' | sort -n > list_not_complete_proc.dat + +N_not_complete_mpirun=$(cat 
list_not_complete.dat | wc -l) +echo "$N_not_complete_mpirun mpirun launches have not been completed" +N_not_complete=$(($Ntot_proc - $N_complete)) +echo "$N_not_complete MPI processes have not been completed" +N_not_complete_check=$(cat list_not_complete_proc.dat | wc -l) +if [ $N_not_complete_check -ne $N_not_complete ]; then + echo "Error: inconsistent number of MPI processes that have not been completed. Check this script" +fi + +cat full_req_mem.dat | while read a b c d e f g h i; do + out_of_mem=$(echo "($i * $a) > $g" | bc -l) + echo "$a $b $c $d $e $f $g $h $i $out_of_mem" +done | grep '1$' | awk '{print $1, $2, $3, $4, $5, $6, $7, $8, $9}' | sort -n > list_out_of_mem_proc.dat + +N_not_complete_expected=0 +N_not_complete_unexpected=0 +while read l; do + if grep -q "^$l" list_out_of_mem_proc.dat; then + N_not_complete_expected=$(( $N_not_complete_expected + 1 )) + else + N_not_complete_unexpected=$(( $N_not_complete_unexpected + 1 )) + fi +done <<< "$(cat list_not_complete_proc.dat)" + +echo -n "$N_not_complete_expected MPI processes out of $N_not_complete MPI processes that have not been completed" +echo " are in the list of the procesess that were predicted to go out of memory" +echo -n "$N_not_complete_unexpected MPI processes out of $N_not_complete MPI processes that have not been completed" +echo " are NOT in the list of the procesess that were predicted to go out of memory" +if [ $N_not_complete_unexpected -eq 0 ]; then + echo "TEST PASSED" + exit 0 +else + echo "TEST NOT PASSED" + exit 1 +fi diff --git a/python/mpi_mem_check/test.py b/python/mpi_mem_check/test.py index 4caaeab2d..5e5ab9084 100644 --- a/python/mpi_mem_check/test.py +++ b/python/mpi_mem_check/test.py @@ -18,10 +18,13 @@ 2 -> Fixed outdegree rule. 3 -> Fixed total number rule. 4 -> All to all connections. Argument C will be ignored. + T: Connection struct type. 
Intege + 0 -> 12 byte connection structure + 1 -> 16 byte connection structure """ - +from mpi4py import MPI import nestgpu as ngpu from argparse import ArgumentParser @@ -31,6 +34,7 @@ parser.add_argument("--N", type=int, default=1000) parser.add_argument("--C", type=int, default=10000) parser.add_argument("--R", type=int, default=1) +parser.add_argument("--T", type=int, default=0) args = parser.parse_args() rules_dict = { @@ -41,12 +45,13 @@ 4: [{"rule": "all_to_all"}], } -assert args.N > 0 and args.C > 0 and args.R in rules_dict +conn_struct_type = args.T +assert args.N > 0 and args.C > 0 and args.R in rules_dict and args.T >= 0 and args.T <= 1 +ngpu.SetKernelStatus({"verbosity_level": 5, "conn_struct_type": conn_struct_type}) ngpu.ConnectMpiInit() - mpi_id = ngpu.HostId() mpi_np = ngpu.HostNum() rank_list = list(range(mpi_np)) @@ -64,6 +69,60 @@ print(f"Creating {args.N} neurons per MPI rank") print(f"Connection rule: {rule}") +block_size = 10000000 +bytes_per_storage = 4 +bytes_per_node = 4 +if conn_struct_type==0: + bytes_per_conn = 12 +else: + bytes_per_conn = 16 + +margin = 10 # margin in MB + +if args.R==0: + cuda_mem_exp = 0 + cuda_mem_exp_woh = 0 +else: + if args.R==1 or args.R==2: + n_conn = int(args.C*args.N) + elif args.R==3: + n_conn = int(args.C) + elif args.R==4: + n_conn = int(args.N*args.N) + else: + n_conn = int(0) + + n_blocks = (n_conn*(mpi_np - 1) - 1) // block_size + 1 + + cuda_mem_exp = (n_blocks*block_size*bytes_per_conn \ + + block_size*bytes_per_storage)/1024/1024 + + cuda_mem_exp_oh = n_conn*bytes_per_node/1024/1024 + + cuda_mem_exp_woh = cuda_mem_exp + cuda_mem_exp_oh + +# Total CUDA memory (for all hosts) +cuda_mem_tot = ngpu.getCUDAMemTotal()/1024/1024 + +# Free CUDA memory (for all hosts) +cuda_mem_free = ngpu.getCUDAMemFree()/1024/1024 + + +req_mem_str = f"{mpi_np}\t{mpi_id}\t{args.N}\t{args.C}\t{args.R}\t" \ + f"{cuda_mem_tot:>9.3f}\t{cuda_mem_free:>9.3f}\t" \ + f"{cuda_mem_exp:>9.3f}\t{cuda_mem_exp_woh:>9.3f}\n" + +print(f"CUDA 
available and requested memory summary\n" + f"mpi_np\tmpi_id\tN\tC\tR\ttotal (MB)\tfree (MB)\t" + f"exp/hst(no OH)\texp/hst(+OH)\n" + req_mem_str) + +req_mem_file_name = f"req_mem_{mpi_id}.dat" +with open(req_mem_file_name, "w") as req_mem_file: + req_mem_file.write(req_mem_str) + + +comm = MPI.COMM_WORLD +comm.Barrier() neurons = [] for i in rank_list: @@ -76,6 +135,28 @@ if i != j: ngpu.RemoteConnect(i, neurons[i], j, neurons[j], rule[0], {}) +cuda_mem_used = ngpu.getCUDAMemHostUsed()/1024/1024 + +cuda_mem_max = ngpu.getCUDAMemHostPeak()/1024/1024 + +if cuda_mem_max>=cuda_mem_exp and cuda_mem_max<(cuda_mem_exp_woh+margin): + test_passed = 1 +else: + test_passed = 0 + +out_str = f"{mpi_np}\t{mpi_id}\t{args.N}\t{args.C}\t{args.R}\t" \ + f"{cuda_mem_used:>9.3f}\t{cuda_mem_max:>9.3f}\t" \ + f"{cuda_mem_exp:>9.3f}\t{cuda_mem_exp_woh:>9.3f}\t" \ + f"{test_passed}\n" + +print(f"CUDA memory usage summary\n" + f"mpi_np\tmpi_id\tN\tC\tR\tused (MB)\tmax (MB)\t" + f"exp/hst(no OH)\texp/hst(+OH)\t" + f"passed\n" + out_str) + +test_file_name = f"test_{mpi_id}.dat" +with open(test_file_name, "w") as test_file: + test_file.write(out_str) ngpu.MpiFinalize() diff --git a/python/test/log_remote_connect.txt b/python/test/log_remote_connect.txt new file mode 100644 index 000000000..fe87f23dd --- /dev/null +++ b/python/test/log_remote_connect.txt @@ -0,0 +1,36 @@ +CHECK 0 {'index': 0, 'source': 6, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 150.0, 'weight': 5.0} +CHECK 0 {'index': 10, 'source': 11, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 100.0, 'weight': 25.0} +CHECK 0 {'index': 11, 'source': 11, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 150.0, 'weight': 25.0} +CHECK 0 {'index': 1, 'source': 6, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 200.0, 'weight': 5.0} +CHECK 0 {'index': 2, 'source': 7, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 100.0, 'weight': 15.0} +CHECK 0 {'index': 3, 'source': 7, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 200.0, 
'weight': 15.0} +CHECK 0 {'index': 4, 'source': 8, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 100.0, 'weight': 25.0} +CHECK 0 {'index': 5, 'source': 8, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 150.0, 'weight': 25.0} +CHECK 0 {'index': 6, 'source': 9, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 150.0, 'weight': 5.0} +CHECK 0 {'index': 7, 'source': 9, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 200.0, 'weight': 5.0} +CHECK 0 {'index': 8, 'source': 10, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 100.0, 'weight': 15.0} +CHECK 0 {'index': 9, 'source': 10, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 200.0, 'weight': 15.0} +CHECK 1 {'index': 0, 'source': 6, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 250.0, 'weight': 5.0} +CHECK 1 {'index': 10, 'source': 11, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 200.0, 'weight': 25.0} +CHECK 1 {'index': 11, 'source': 11, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 250.0, 'weight': 25.0} +CHECK 1 {'index': 1, 'source': 6, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 300.0, 'weight': 5.0} +CHECK 1 {'index': 2, 'source': 7, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 200.0, 'weight': 15.0} +CHECK 1 {'index': 3, 'source': 7, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 300.0, 'weight': 15.0} +CHECK 1 {'index': 4, 'source': 8, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 200.0, 'weight': 25.0} +CHECK 1 {'index': 5, 'source': 8, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 250.0, 'weight': 25.0} +CHECK 1 {'index': 6, 'source': 9, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 250.0, 'weight': 5.0} +CHECK 1 {'index': 7, 'source': 9, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 300.0, 'weight': 5.0} +CHECK 1 {'index': 8, 'source': 10, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 200.0, 'weight': 15.0} +CHECK 1 {'index': 9, 'source': 10, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 300.0, 'weight': 15.0} +CHECK 2 {'index': 0, 'source': 6, 'target': 1, 'port': 
0, 'syn_group': 0, 'delay': 350.0, 'weight': 5.0} +CHECK 2 {'index': 10, 'source': 11, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 300.0, 'weight': 25.0} +CHECK 2 {'index': 11, 'source': 11, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 350.0, 'weight': 25.0} +CHECK 2 {'index': 1, 'source': 6, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 400.0, 'weight': 5.0} +CHECK 2 {'index': 2, 'source': 7, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 300.0, 'weight': 15.0} +CHECK 2 {'index': 3, 'source': 7, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 400.0, 'weight': 15.0} +CHECK 2 {'index': 4, 'source': 8, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 300.0, 'weight': 25.0} +CHECK 2 {'index': 5, 'source': 8, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 350.0, 'weight': 25.0} +CHECK 2 {'index': 6, 'source': 9, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 350.0, 'weight': 5.0} +CHECK 2 {'index': 7, 'source': 9, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 400.0, 'weight': 5.0} +CHECK 2 {'index': 8, 'source': 10, 'target': 0, 'port': 0, 'syn_group': 0, 'delay': 300.0, 'weight': 15.0} +CHECK 2 {'index': 9, 'source': 10, 'target': 2, 'port': 0, 'syn_group': 0, 'delay': 400.0, 'weight': 15.0} diff --git a/python/test/logp3_connect.txt b/python/test/logp3_connect.txt index 98e6af519..b0bd8aad5 100644 --- a/python/test/logp3_connect.txt +++ b/python/test/logp3_connect.txt @@ -10,31 +10,31 @@ ######################################## Even to all -{'index': 0, 'source': 0, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 1.0, 'weight': 100.0} -{'index': 1, 'source': 0, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 3.0, 'weight': 300.0} -{'index': 2, 'source': 0, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 5.0, 'weight': 500.0} -{'index': 3, 'source': 0, 'target': 7, 'port': 0, 'syn_group': b'\x00', 'delay': 7.0, 'weight': 700.0} -{'index': 4, 'source': 0, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 9.0, 'weight': 
900.0} -{'index': 10, 'source': 2, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 21.0, 'weight': 2100.0} -{'index': 11, 'source': 2, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 23.0, 'weight': 2300.0} -{'index': 12, 'source': 2, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 25.0, 'weight': 2500.0} -{'index': 13, 'source': 2, 'target': 7, 'port': 0, 'syn_group': b'\x00', 'delay': 27.0, 'weight': 2700.0} -{'index': 14, 'source': 2, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 29.0, 'weight': 2900.0} -{'index': 20, 'source': 4, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 41.0, 'weight': 4100.0} -{'index': 21, 'source': 4, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 43.0, 'weight': 4300.0} -{'index': 22, 'source': 4, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 45.0, 'weight': 4500.0} -{'index': 23, 'source': 4, 'target': 7, 'port': 0, 'syn_group': b'\x00', 'delay': 47.0, 'weight': 4700.0} -{'index': 24, 'source': 4, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 49.0, 'weight': 4900.0} -{'index': 30, 'source': 6, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 61.0, 'weight': 6100.0} -{'index': 31, 'source': 6, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 63.0, 'weight': 6300.0} -{'index': 32, 'source': 6, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 65.0, 'weight': 6500.0} -{'index': 33, 'source': 6, 'target': 7, 'port': 0, 'syn_group': b'\x00', 'delay': 67.0, 'weight': 6700.0} -{'index': 34, 'source': 6, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 69.0, 'weight': 6900.0} -{'index': 40, 'source': 8, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 81.0, 'weight': 8100.0} -{'index': 41, 'source': 8, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 83.0, 'weight': 8300.0} -{'index': 42, 'source': 8, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 85.0, 'weight': 8500.0} -{'index': 43, 'source': 8, 'target': 7, 'port': 0, 'syn_group': 
b'\x00', 'delay': 87.0, 'weight': 8700.0} -{'index': 44, 'source': 8, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 89.0, 'weight': 8900.0} +{'index': 0, 'source': 0, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 1.0, 'weight': 100.0} +{'index': 1, 'source': 0, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 3.0, 'weight': 300.0} +{'index': 2, 'source': 0, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 5.0, 'weight': 500.0} +{'index': 3, 'source': 0, 'target': 7, 'port': 0, 'syn_group': 0, 'delay': 7.0, 'weight': 700.0} +{'index': 4, 'source': 0, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 9.0, 'weight': 900.0} +{'index': 10, 'source': 2, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 21.0, 'weight': 2100.0} +{'index': 11, 'source': 2, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 23.0, 'weight': 2300.0} +{'index': 12, 'source': 2, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 25.0, 'weight': 2500.0} +{'index': 13, 'source': 2, 'target': 7, 'port': 0, 'syn_group': 0, 'delay': 27.0, 'weight': 2700.0} +{'index': 14, 'source': 2, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 29.0, 'weight': 2900.0} +{'index': 20, 'source': 4, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 41.0, 'weight': 4100.0} +{'index': 21, 'source': 4, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 43.0, 'weight': 4300.0} +{'index': 22, 'source': 4, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 45.0, 'weight': 4500.0} +{'index': 23, 'source': 4, 'target': 7, 'port': 0, 'syn_group': 0, 'delay': 47.0, 'weight': 4700.0} +{'index': 24, 'source': 4, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 49.0, 'weight': 4900.0} +{'index': 30, 'source': 6, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 61.0, 'weight': 6100.0} +{'index': 31, 'source': 6, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 63.0, 'weight': 6300.0} +{'index': 32, 'source': 6, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 65.0, 'weight': 6500.0} +{'index': 33, 'source': 6, 'target': 7, 
'port': 0, 'syn_group': 0, 'delay': 67.0, 'weight': 6700.0} +{'index': 34, 'source': 6, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 69.0, 'weight': 6900.0} +{'index': 40, 'source': 8, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 81.0, 'weight': 8100.0} +{'index': 41, 'source': 8, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 83.0, 'weight': 8300.0} +{'index': 42, 'source': 8, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 85.0, 'weight': 8500.0} +{'index': 43, 'source': 8, 'target': 7, 'port': 0, 'syn_group': 0, 'delay': 87.0, 'weight': 8700.0} +{'index': 44, 'source': 8, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 89.0, 'weight': 8900.0} ######################################## @@ -45,59 +45,59 @@ Even to all weight, delat ######################################## All to odd -{'index': 0, 'source': 0, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 1.0, 'weight': 100.0} -{'index': 1, 'source': 0, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 3.0, 'weight': 300.0} -{'index': 2, 'source': 0, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 5.0, 'weight': 500.0} -{'index': 3, 'source': 0, 'target': 7, 'port': 0, 'syn_group': b'\x00', 'delay': 7.0, 'weight': 700.0} -{'index': 4, 'source': 0, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 9.0, 'weight': 900.0} -{'index': 10, 'source': 2, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 21.0, 'weight': 2100.0} -{'index': 11, 'source': 2, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 23.0, 'weight': 2300.0} -{'index': 12, 'source': 2, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 25.0, 'weight': 2500.0} -{'index': 13, 'source': 2, 'target': 7, 'port': 0, 'syn_group': b'\x00', 'delay': 27.0, 'weight': 2700.0} -{'index': 14, 'source': 2, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 29.0, 'weight': 2900.0} -{'index': 20, 'source': 4, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 41.0, 'weight': 4100.0} -{'index': 21, 'source': 4, 
'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 43.0, 'weight': 4300.0} -{'index': 22, 'source': 4, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 45.0, 'weight': 4500.0} -{'index': 23, 'source': 4, 'target': 7, 'port': 0, 'syn_group': b'\x00', 'delay': 47.0, 'weight': 4700.0} -{'index': 24, 'source': 4, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 49.0, 'weight': 4900.0} -{'index': 30, 'source': 6, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 61.0, 'weight': 6100.0} -{'index': 31, 'source': 6, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 63.0, 'weight': 6300.0} -{'index': 32, 'source': 6, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 65.0, 'weight': 6500.0} -{'index': 33, 'source': 6, 'target': 7, 'port': 0, 'syn_group': b'\x00', 'delay': 67.0, 'weight': 6700.0} -{'index': 34, 'source': 6, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 69.0, 'weight': 6900.0} -{'index': 40, 'source': 8, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 81.0, 'weight': 8100.0} -{'index': 41, 'source': 8, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 83.0, 'weight': 8300.0} -{'index': 42, 'source': 8, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 85.0, 'weight': 8500.0} -{'index': 43, 'source': 8, 'target': 7, 'port': 0, 'syn_group': b'\x00', 'delay': 87.0, 'weight': 8700.0} -{'index': 44, 'source': 8, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 89.0, 'weight': 8900.0} +{'index': 0, 'source': 0, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 1.0, 'weight': 100.0} +{'index': 1, 'source': 0, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 3.0, 'weight': 300.0} +{'index': 2, 'source': 0, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 5.0, 'weight': 500.0} +{'index': 3, 'source': 0, 'target': 7, 'port': 0, 'syn_group': 0, 'delay': 7.0, 'weight': 700.0} +{'index': 4, 'source': 0, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 9.0, 'weight': 900.0} +{'index': 10, 'source': 2, 'target': 
1, 'port': 0, 'syn_group': 0, 'delay': 21.0, 'weight': 2100.0} +{'index': 11, 'source': 2, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 23.0, 'weight': 2300.0} +{'index': 12, 'source': 2, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 25.0, 'weight': 2500.0} +{'index': 13, 'source': 2, 'target': 7, 'port': 0, 'syn_group': 0, 'delay': 27.0, 'weight': 2700.0} +{'index': 14, 'source': 2, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 29.0, 'weight': 2900.0} +{'index': 20, 'source': 4, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 41.0, 'weight': 4100.0} +{'index': 21, 'source': 4, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 43.0, 'weight': 4300.0} +{'index': 22, 'source': 4, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 45.0, 'weight': 4500.0} +{'index': 23, 'source': 4, 'target': 7, 'port': 0, 'syn_group': 0, 'delay': 47.0, 'weight': 4700.0} +{'index': 24, 'source': 4, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 49.0, 'weight': 4900.0} +{'index': 30, 'source': 6, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 61.0, 'weight': 6100.0} +{'index': 31, 'source': 6, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 63.0, 'weight': 6300.0} +{'index': 32, 'source': 6, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 65.0, 'weight': 6500.0} +{'index': 33, 'source': 6, 'target': 7, 'port': 0, 'syn_group': 0, 'delay': 67.0, 'weight': 6700.0} +{'index': 34, 'source': 6, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 69.0, 'weight': 6900.0} +{'index': 40, 'source': 8, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 81.0, 'weight': 8100.0} +{'index': 41, 'source': 8, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 83.0, 'weight': 8300.0} +{'index': 42, 'source': 8, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 85.0, 'weight': 8500.0} +{'index': 43, 'source': 8, 'target': 7, 'port': 0, 'syn_group': 0, 'delay': 87.0, 'weight': 8700.0} +{'index': 44, 'source': 8, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 89.0, 'weight': 8900.0} 
######################################## Even to 3,4,5,6 -{'index': 1, 'source': 0, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 3.0, 'weight': 300.0} -{'index': 2, 'source': 0, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 5.0, 'weight': 500.0} -{'index': 11, 'source': 2, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 23.0, 'weight': 2300.0} -{'index': 12, 'source': 2, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 25.0, 'weight': 2500.0} -{'index': 21, 'source': 4, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 43.0, 'weight': 4300.0} -{'index': 22, 'source': 4, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 45.0, 'weight': 4500.0} -{'index': 31, 'source': 6, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 63.0, 'weight': 6300.0} -{'index': 32, 'source': 6, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 65.0, 'weight': 6500.0} -{'index': 41, 'source': 8, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 83.0, 'weight': 8300.0} -{'index': 42, 'source': 8, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 85.0, 'weight': 8500.0} +{'index': 1, 'source': 0, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 3.0, 'weight': 300.0} +{'index': 2, 'source': 0, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 5.0, 'weight': 500.0} +{'index': 11, 'source': 2, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 23.0, 'weight': 2300.0} +{'index': 12, 'source': 2, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 25.0, 'weight': 2500.0} +{'index': 21, 'source': 4, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 43.0, 'weight': 4300.0} +{'index': 22, 'source': 4, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 45.0, 'weight': 4500.0} +{'index': 31, 'source': 6, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 63.0, 'weight': 6300.0} +{'index': 32, 'source': 6, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 65.0, 'weight': 6500.0} +{'index': 41, 'source': 8, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 
83.0, 'weight': 8300.0} +{'index': 42, 'source': 8, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 85.0, 'weight': 8500.0} ######################################## 3,4,5,6 to odd -{'index': 20, 'source': 4, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 41.0, 'weight': 4100.0} -{'index': 21, 'source': 4, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 43.0, 'weight': 4300.0} -{'index': 22, 'source': 4, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 45.0, 'weight': 4500.0} -{'index': 23, 'source': 4, 'target': 7, 'port': 0, 'syn_group': b'\x00', 'delay': 47.0, 'weight': 4700.0} -{'index': 24, 'source': 4, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 49.0, 'weight': 4900.0} -{'index': 30, 'source': 6, 'target': 1, 'port': 0, 'syn_group': b'\x00', 'delay': 61.0, 'weight': 6100.0} -{'index': 31, 'source': 6, 'target': 3, 'port': 0, 'syn_group': b'\x00', 'delay': 63.0, 'weight': 6300.0} -{'index': 32, 'source': 6, 'target': 5, 'port': 0, 'syn_group': b'\x00', 'delay': 65.0, 'weight': 6500.0} -{'index': 33, 'source': 6, 'target': 7, 'port': 0, 'syn_group': b'\x00', 'delay': 67.0, 'weight': 6700.0} -{'index': 34, 'source': 6, 'target': 9, 'port': 0, 'syn_group': b'\x00', 'delay': 69.0, 'weight': 6900.0} +{'index': 20, 'source': 4, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 41.0, 'weight': 4100.0} +{'index': 21, 'source': 4, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 43.0, 'weight': 4300.0} +{'index': 22, 'source': 4, 'target': 5, 'port': 0, 'syn_group': 0, 'delay': 45.0, 'weight': 4500.0} +{'index': 23, 'source': 4, 'target': 7, 'port': 0, 'syn_group': 0, 'delay': 47.0, 'weight': 4700.0} +{'index': 24, 'source': 4, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 49.0, 'weight': 4900.0} +{'index': 30, 'source': 6, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 61.0, 'weight': 6100.0} +{'index': 31, 'source': 6, 'target': 3, 'port': 0, 'syn_group': 0, 'delay': 63.0, 'weight': 6300.0} +{'index': 32, 'source': 6, 'target': 5, 
'port': 0, 'syn_group': 0, 'delay': 65.0, 'weight': 6500.0} +{'index': 33, 'source': 6, 'target': 7, 'port': 0, 'syn_group': 0, 'delay': 67.0, 'weight': 6700.0} +{'index': 34, 'source': 6, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 69.0, 'weight': 6900.0} Calibrating ... diff --git a/python/test/test_mpi.sh b/python/test/test_mpi.sh index 9560bb926..56f3e7ba8 100755 --- a/python/test/test_mpi.sh +++ b/python/test/test_mpi.sh @@ -11,3 +11,4 @@ for fn in test_brunel_mpi.py test_brunel_outdegree_mpi.py test_izh_psc_exp_2s_mp fi echo $fn : ${mpi_pass_str[$res]} done +. test_remote_connect.sh diff --git a/python/test/test_remote_connect.py b/python/test/test_remote_connect.py new file mode 100644 index 000000000..e586560ef --- /dev/null +++ b/python/test/test_remote_connect.py @@ -0,0 +1,59 @@ +import sys +import math +import ctypes +import nestgpu as ngpu +from random import randrange +import numpy as np + + +ngpu.ConnectMpiInit(); +mpi_np = ngpu.HostNum() + +if mpi_np != 3: + print ("Usage: mpirun -np 3 python %s" % sys.argv[0]) + quit() + +mpi_id = ngpu.HostId() +print("Building on host ", mpi_id, " ...") + +ngpu.SetKernelStatus("rnd_seed", 1234) # seed for GPU random numbers + +neuron = ngpu.Create('iaf_psc_exp_g', 3) + +spike = ngpu.Create("spike_generator", 3) + +for i in range(3): + spike_times = [1.0*(mpi_id*20 + i*5 + 10), 50.0 + 1.0*(mpi_id*20 + i*5 + 10)] + n_spikes = 2 + # set spike times and height + ngpu.SetStatus([spike[i]], {"spike_times": spike_times}) + + +conn_spec = {"rule": "one_to_one"} + +for ish in range(3): + for ith in range(3): + if ish != ith: + for isn in range(3): + for itn in range(3): + if itn != isn: + delay = 100 + 100.0*ith + 50.0*itn + weight = 5.0 + 10.0*isn + syn_spec = {'weight': weight, 'delay': delay} + #print (ish, [spike[isn]], ith, [neuron[itn]]) + #ngpu.RemoteConnect(ish, spike[isn:isn+1], \ + # ith, neuron[itn:itn+1], \ + # conn_spec, syn_spec) + ngpu.RemoteConnect(ish, [spike[isn]], \ + ith, [neuron[itn]], \ + 
conn_spec, syn_spec) + + +ngpu.Simulate(1) +conn_id = ngpu.GetConnections() +conn_status_dict = ngpu.GetStatus(conn_id) +for i in range(len(conn_status_dict)): + print ("CHECK", mpi_id, conn_status_dict[i]) +print() +print() +ngpu.MpiFinalize() diff --git a/python/test/test_remote_connect.sh b/python/test/test_remote_connect.sh new file mode 100644 index 000000000..aebea1c6a --- /dev/null +++ b/python/test/test_remote_connect.sh @@ -0,0 +1,7 @@ +pass_str[0]="TEST PASSED" +pass_str[1]="TEST NOT PASSED" +fn=test_remote_connect.py +mpirun -np 3 python3 $fn | grep CHECK | sort -n > tmp +diff -qs tmp log_remote_connect.txt 2>&1 >> log.txt +res=$? +echo $fn : ${pass_str[$res]} diff --git a/python/test/test_stdp/cases/test_all.sh b/python/test/test_stdp/cases/test_all.sh index 3cf5c02d1..b87eeda89 100755 --- a/python/test/test_stdp/cases/test_all.sh +++ b/python/test/test_stdp/cases/test_all.sh @@ -1 +1 @@ -for i in $(seq 1 10); do python3 case$i.py | tail -1; done +for i in $(seq 1 10); do python3 case$i.py | grep '^dw'; done diff --git a/python/test/tmp.py b/python/test/tmp.py deleted file mode 100644 index d3760f533..000000000 --- a/python/test/tmp.py +++ /dev/null @@ -1,151 +0,0 @@ -import sys -import math -import ctypes -import nestgpu as ngpu -from random import randrange -import numpy as np - -order = 200 -n_test = 100 - -expected_rate = 30.78 -print("Building ...") - -ngpu.SetKernelStatus("rnd_seed", 1234) # seed for GPU random numbers - -n_receptors = 2 - -NE = 4 * order # number of excitatory neurons -NI = 1 * order # number of inhibitory neurons -n_neurons = NE + NI # number of neurons in total - -CE = 800 # number of excitatory synapses per neuron -CI = CE//4 # number of inhibitory synapses per neuron - -Wex = 0.05 -Win = 0.35 - -# poisson generator parameters -poiss_rate = 20000.0 # poisson signal rate in Hz -poiss_weight = 0.37 -poiss_delay = 0.2 # poisson signal delay in ms - -# create poisson generator -pg = ngpu.Create("poisson_generator") 
-ngpu.SetStatus(pg, "rate", poiss_rate) -pg_list = pg.ToList() - -# Create n_neurons neurons with n_receptor receptor ports -neuron = ngpu.Create("user_m1", n_neurons, n_receptors) -exc_neuron = neuron[0:NE] # excitatory neurons -inh_neuron = neuron[NE:n_neurons] # inhibitory neurons -neuron_list = neuron.ToList() -exc_neuron_list = exc_neuron.ToList() -inh_neuron_list = inh_neuron.ToList() - -# receptor parameters -E_rev = [0.0, -85.0] -tau_decay = [1.0, 1.0] -tau_rise = [1.0, 1.0] -ngpu.SetStatus(neuron, {"E_rev":E_rev, "tau_decay":tau_decay, - "tau_rise":tau_rise}) - - -mean_delay = 0.5 -std_delay = 0.25 -min_delay = 0.1 -# Excitatory connections -# connect excitatory neurons to port 0 of all neurons -# normally distributed delays, weight Wex and CE connections per neuron -exc_conn_dict={"rule": "fixed_indegree", "indegree": CE} -exc_syn_dict={"weight": Wex, "delay": {"distribution":"normal_clipped", - "mu":mean_delay, "low":min_delay, - "high":mean_delay+3*std_delay, - "sigma":std_delay}, "receptor":0} -ngpu.Connect(exc_neuron, neuron_list, exc_conn_dict, exc_syn_dict) - -# Inhibitory connections -# connect inhibitory neurons to port 1 of all neurons -# normally distributed delays, weight Win and CI connections per neuron -inh_conn_dict={"rule": "fixed_indegree", "indegree": CI} -inh_syn_dict={"weight": Win, "delay":{"distribution":"normal_clipped", - "mu":mean_delay, "low":min_delay, - "high":mean_delay+3*std_delay, - "sigma":std_delay}, "receptor":1} -ngpu.Connect(inh_neuron_list, exc_neuron_list, inh_conn_dict, inh_syn_dict) -ngpu.Connect(inh_neuron_list, inh_neuron, inh_conn_dict, inh_syn_dict) - -#connect poisson generator to port 0 of all neurons -pg_conn_dict={"rule": "all_to_all"} -pg_syn_dict={"weight": poiss_weight, "delay": poiss_delay, - "receptor":0} - -ngpu.Connect(pg_list, neuron_list, pg_conn_dict, pg_syn_dict) - -i_neuron_list = [neuron[0], neuron[n_neurons-1]] -i_receptor_list = [0, 0] -var_name_list = ["spike", "spike"] - -for i in 
range(n_test-2): - i_neuron_list.append(neuron[randrange(n_neurons)]) - i_receptor_list.append(0) - var_name_list.append("spike") - -# create multimeter record of spikes -record = ngpu.CreateRecord("", var_name_list, i_neuron_list, i_receptor_list) - -ngpu.Simulate() - -data_list = ngpu.GetRecordData(record) - -for i in range(1000): - conn_id = ngpu.GetConnections(i+1) - n_out_conn = len(conn_id) - if (n_out_conn!=NE+NI): - print("Expected number of out connections per neuron: ", NE+NI) - print("Number of out connections of neuron ", i + 1, ": ", \ - n_out_conn) - #sys.exit(1) - - -for i in range(10): - i_target = randrange(n_neurons) - conn_id = ngpu.GetConnections(target=i_target+1) - n_in_conn = len(conn_id) - if (n_in_conn!=NE+NI+1): - print("Expected number of in connections per neuron: ", NE+NI+1) - print("Number of in connections of neuron ", i_target, ": ", \ - n_in_conn) - #sys.exit(1) - - -row_sum = list(data_list[0]) -for row in data_list[1:len(data_list)]: - for i in range(len(row_sum)): - row_sum[i] = row_sum[i] + row[i] - -spike = row_sum[1:len(row_sum)] -spike_arr = np.array(spike) - -min_spike_num = np.min(spike_arr) -max_spike_num = np.max(spike_arr) -if (min_spike_num < expected_rate - 3.0*math.sqrt(expected_rate)): - print ("Expected rate: ", expected_rate) - print("Min rate :", min_spike_num) - sys.exit(1) - -if (max_spike_num > expected_rate + 3.0*math.sqrt(expected_rate)): - print ("Expected rate: ", expected_rate) - print("Max rate :", max_spike_num) - sys.exit(1) - -mean_spike_num = np.mean(spike_arr) -diff = abs(mean_spike_num - expected_rate) -max_diff = 3.0*np.sqrt(expected_rate)/np.sqrt(n_test) -print ("Expected rate: ", expected_rate) -print("Mean rate: ", mean_spike_num) -if diff > max_diff: - sys.exit(1) -else: - sys.exit(0) - diff --git a/python/test/tmp2.py b/python/test/tmp2.py deleted file mode 100644 index bab70d9c1..000000000 --- a/python/test/tmp2.py +++ /dev/null @@ -1,138 +0,0 @@ -import sys -import math -import ctypes 
-import nestgpu as ngpu -from random import randrange -import numpy as np - -order = 200 -n_test = 100 - -expected_rate = 30.78 -print("Building ...") - -ngpu.SetKernelStatus("rnd_seed", 1234) # seed for GPU random numbers - -n_receptors = 2 - -NE = 4 * order # number of excitatory neurons -NI = 1 * order # number of inhibitory neurons -n_neurons = NE + NI # number of neurons in total - -CPN = 1000 # number of connections per neuron - -Wex = 0.05 -Win = 0.35 - -# poisson generator parameters -poiss_rate = 20000.0 # poisson signal rate in Hz -poiss_weight = 0.37 -poiss_delay = 0.2 # poisson signal delay in ms - -# create poisson generator -pg = ngpu.Create("poisson_generator") -ngpu.SetStatus(pg, "rate", poiss_rate) -pg_list = pg.ToList() - -# Create n_neurons neurons with n_receptor receptor ports -neuron = ngpu.Create("aeif_cond_beta", n_neurons, n_receptors) -exc_neuron = neuron[0:NE] # excitatory neurons -inh_neuron = neuron[NE:n_neurons] # inhibitory neurons -neuron_list = neuron.ToList() -exc_neuron_list = exc_neuron.ToList() -inh_neuron_list = inh_neuron.ToList() - -# receptor parameters -E_rev = [0.0, -85.0] -tau_decay = [1.0, 1.0] -tau_rise = [1.0, 1.0] -ngpu.SetStatus(neuron, {"E_rev":E_rev, "tau_decay":tau_decay, - "tau_rise":tau_rise}) - - -mean_delay = 0.5 -std_delay = 0.25 -min_delay = 0.1 -# Excitatory connections -# connect excitatory neurons to port 0 of all neurons -# normally distributed delays, weight Wex and CPN connections per neuron -exc_conn_dict={"rule": "fixed_total_number", "total_num": CPN*NE} -exc_syn_dict={"weight": Wex, "delay": {"distribution":"normal_clipped", - "mu":mean_delay, "low":min_delay, - "high":mean_delay+3*std_delay, - "sigma":std_delay}, "receptor":0} -ngpu.Connect(exc_neuron, neuron_list, exc_conn_dict, exc_syn_dict) - -# Inhibitory connections -# connect inhibitory neurons to port 1 of all neurons -# normally distributed delays, weight Win and CPN connections per neuron -inh_conn_dict={"rule": "fixed_total_number", 
"total_num": CPN*NI} -inh_syn_dict={"weight": Win, "delay":{"distribution":"normal_clipped", - "mu":mean_delay, "low":min_delay, - "high":mean_delay+3*std_delay, - "sigma":std_delay}, "receptor":1} -ngpu.Connect(inh_neuron_list, neuron, inh_conn_dict, inh_syn_dict) - -#connect poisson generator to port 0 of all neurons -pg_conn_dict={"rule": "all_to_all"} -pg_syn_dict={"weight": poiss_weight, "delay": poiss_delay, "receptor":0} - -ngpu.Connect(pg_list, neuron_list, pg_conn_dict, pg_syn_dict) - -i_neuron_list = [neuron[0], neuron[n_neurons-1]] -i_receptor_list = [0, 0] -var_name_list = ["spike", "spike"] - -for i in range(n_test-2): - i_neuron_list.append(neuron[randrange(n_neurons)]) - i_receptor_list.append(0) - var_name_list.append("spike") - -# create multimeter record of spikes -record = ngpu.CreateRecord("", var_name_list, i_neuron_list, i_receptor_list) - -ngpu.Simulate() - -data_list = ngpu.GetRecordData(record) - -n_conn_tot = 0 -for i in range(10): - conn_id = ngpu.GetConnections(i+1) - n_out_conn = len(conn_id) - n_conn_tot = n_conn_tot + n_out_conn - -if (n_conn_tot!=(NE+NI)*CPN): - print("Expected total number of connections: ", (NE+NI)*CPN) - print("Total number of connections ", n_conn_tot) - #sys.exit(1) - -row_sum = list(data_list[0]) -for row in data_list[1:len(data_list)]: - for i in range(len(row_sum)): - row_sum[i] = row_sum[i] + row[i] - -spike = row_sum[1:len(row_sum)] -spike_arr = np.array(spike) - -min_spike_num = np.min(spike_arr) -max_spike_num = np.max(spike_arr) -if (min_spike_num < expected_rate - 3.0*math.sqrt(expected_rate)): - print ("Expected rate: ", expected_rate) - print("Min rate :", min_spike_num) - sys.exit(1) - -if (max_spike_num > expected_rate + 3.0*math.sqrt(expected_rate)): - print ("Expected rate: ", expected_rate) - print("Max rate :", max_spike_num) - sys.exit(1) - -mean_spike_num = np.mean(spike_arr) -diff = abs(mean_spike_num - expected_rate) -max_diff = 3.0*np.sqrt(expected_rate)/np.sqrt(n_test) -print ("Expected 
rate: ", expected_rate) -print("Mean rate: ", mean_spike_num) -if diff > max_diff: - sys.exit(1) -else: - sys.exit(0) - diff --git a/python/test/tmp3.py b/python/test/tmp3.py deleted file mode 100644 index 4e48a56e2..000000000 --- a/python/test/tmp3.py +++ /dev/null @@ -1,138 +0,0 @@ -import sys -import math -import ctypes -import nestgpu as ngpu -from random import randrange -import numpy as np - -order = 200 -n_test = 100 - -expected_rate = 30.78 -print("Building ...") - -ngpu.SetKernelStatus("rnd_seed", 1234) # seed for GPU random numbers - -n_receptors = 2 - -NE = 4 * order # number of excitatory neurons -NI = 1 * order # number of inhibitory neurons -n_neurons = NE + NI # number of neurons in total - -CPN = 1000 # number of connections per neuron - -Wex = 0.05 -Win = 0.35 - -# poisson generator parameters -poiss_rate = 20000.0 # poisson signal rate in Hz -poiss_weight = 0.37 -poiss_delay = 0.2 # poisson signal delay in ms - -# create poisson generator -pg = ngpu.Create("poisson_generator") -ngpu.SetStatus(pg, "rate", poiss_rate) -pg_list = pg.ToList() - -# Create n_neurons neurons with n_receptor receptor ports -neuron = ngpu.Create("aeif_cond_beta", n_neurons, n_receptors) -exc_neuron = neuron[0:NE] # excitatory neurons -inh_neuron = neuron[NE:n_neurons] # inhibitory neurons -neuron_list = neuron.ToList() -exc_neuron_list = exc_neuron.ToList() -inh_neuron_list = inh_neuron.ToList() - -# receptor parameters -E_rev = [0.0, -85.0] -tau_decay = [1.0, 1.0] -tau_rise = [1.0, 1.0] -ngpu.SetStatus(neuron, {"E_rev":E_rev, "tau_decay":tau_decay, - "tau_rise":tau_rise}) - - -mean_delay = 0.5 -std_delay = 0.25 -min_delay = 0.1 -# Excitatory connections -# connect excitatory neurons to port 0 of all neurons -# normally distributed delays, weight Wex and CPN connections per neuron -exc_conn_dict={"rule": "fixed_total_number", "total_num": CPN*NE} -exc_syn_dict={"weight": Wex, "delay": {"distribution":"normal_clipped", - "mu":mean_delay, "low":min_delay, - 
"high":mean_delay+3*std_delay, - "sigma":std_delay}, "receptor":0} -ngpu.Connect(exc_neuron, neuron_list, exc_conn_dict, exc_syn_dict) - -# Inhibitory connections -# connect inhibitory neurons to port 1 of all neurons -# normally distributed delays, weight Win and CPN connections per neuron -inh_conn_dict={"rule": "fixed_total_number", "total_num": CPN*NI} -inh_syn_dict={"weight": Win, "delay":{"distribution":"normal_clipped", - "mu":mean_delay, "low":min_delay, - "high":mean_delay+3*std_delay, - "sigma":std_delay}, "receptor":1} -ngpu.Connect(inh_neuron_list, neuron, inh_conn_dict, inh_syn_dict) - -#connect poisson generator to port 0 of all neurons -pg_conn_dict={"rule": "all_to_all"} -pg_syn_dict={"weight": poiss_weight, "delay": poiss_delay, "receptor":0} - -ngpu.Connect(pg_list, neuron_list, pg_conn_dict, pg_syn_dict) - -i_neuron_list = [neuron[0], neuron[n_neurons-1]] -i_receptor_list = [0, 0] -var_name_list = ["spike", "spike"] - -for i in range(n_test-2): - i_neuron_list.append(neuron[randrange(n_neurons)]) - i_receptor_list.append(0) - var_name_list.append("spike") - -# create multimeter record of spikes -record = ngpu.CreateRecord("", var_name_list, i_neuron_list, i_receptor_list) - -ngpu.Simulate() - -data_list = ngpu.GetRecordData(record) - -n_conn_tot = 0 -for i in range(1): - conn_id = ngpu.GetConnections(i+1) - n_out_conn = len(conn_id) - n_conn_tot = n_conn_tot + n_out_conn - -if (n_conn_tot!=(NE+NI)*CPN): - print("Expected total number of connections: ", (NE+NI)*CPN) - print("Total number of connections ", n_conn_tot) - #sys.exit(1) - -row_sum = list(data_list[0]) -for row in data_list[1:len(data_list)]: - for i in range(len(row_sum)): - row_sum[i] = row_sum[i] + row[i] - -spike = row_sum[1:len(row_sum)] -spike_arr = np.array(spike) - -min_spike_num = np.min(spike_arr) -max_spike_num = np.max(spike_arr) -if (min_spike_num < expected_rate - 3.0*math.sqrt(expected_rate)): - print ("Expected rate: ", expected_rate) - print("Min rate :", min_spike_num) 
- sys.exit(1) - -if (max_spike_num > expected_rate + 3.0*math.sqrt(expected_rate)): - print ("Expected rate: ", expected_rate) - print("Max rate :", max_spike_num) - sys.exit(1) - -mean_spike_num = np.mean(spike_arr) -diff = abs(mean_spike_num - expected_rate) -max_diff = 3.0*np.sqrt(expected_rate)/np.sqrt(n_test) -print ("Expected rate: ", expected_rate) -print("Mean rate: ", mean_spike_num) -if diff > max_diff: - sys.exit(1) -else: - sys.exit(0) - diff --git a/python/test/tmp4.py b/python/test/tmp4.py deleted file mode 100644 index 09ddc141a..000000000 --- a/python/test/tmp4.py +++ /dev/null @@ -1,61 +0,0 @@ -import sys -import nestgpu as ngpu - -tolerance = 1.0e-6 -dt_step = 0.1 -N = 5 -fact = 0.2 -offset = 0.03 - -syn_group = ngpu.CreateSynGroup("test_syn_model") -ngpu.SetSynGroupParam(syn_group, "fact", fact) -ngpu.SetSynGroupParam(syn_group, "offset", offset) - -sg = ngpu.Create("spike_generator", N) -neuron = ngpu.Create("aeif_cond_beta", 2*N) -ngpu.SetStatus(neuron, {"t_ref": 10.0}) -neuron0 = neuron[0:N] -neuron1 = neuron[N:2*N] -dt_list = [] -for i in range(N): - dt_list.append(dt_step*(-0.5*(N-1) + i)) - -spike_time = [50.0] -spike_height = [1.0] -n_spikes = 1 -time_diff = 10.0 - -# set spike times and height -ngpu.SetStatus(sg, {"spike_times": spike_time, "spike_heights":spike_height}) -delay0 = 1.0 -delay1 = delay0 + time_diff -weight_sg = 17.9 -weight_test = 0.0 - -conn_dict={"rule": "one_to_one"} -syn_dict0={"weight":weight_sg, "delay":delay0, "receptor":0, "synapse_group":0} -syn_dict1={"weight":weight_sg, "delay":delay1, "receptor":0, "synapse_group":0} - -ngpu.Connect(sg, neuron0, conn_dict, syn_dict0) -ngpu.Connect(sg, neuron1, conn_dict, syn_dict1) - -for i in range(N): - delay_test = time_diff - dt_list[i] - syn_dict_test={"weight":weight_test, "delay":delay_test, "receptor":0, \ - "synapse_group":syn_group} - ngpu.Connect([neuron0[i]], [neuron1[i]], conn_dict, syn_dict_test) - -ngpu.Simulate(200.0) - -conn_id = ngpu.GetConnections(neuron0, 
neuron1) -conn_status_dict = ngpu.GetStatus(conn_id, ["weight", "delay"]) -print (conn_status_dict) -for i in range(N): - print (dt_list[i], conn_status_dict[0][i]) - expect_w = dt_list[i]*fact + offset - if abs(expect_w - conn_status_dict[0][i])>tolerance: - print("Expected weight: ", expect_w, " simulated: ", \ - conn_status_dict[i][0]) - #sys.exit(1) - -sys.exit(0) diff --git a/python/test/tmp5.py b/python/test/tmp5.py deleted file mode 100644 index 54f6d3461..000000000 --- a/python/test/tmp5.py +++ /dev/null @@ -1,94 +0,0 @@ -import ctypes -import nestgpu as ngpu - -N = 5 - -neuron = ngpu.Create("aeif_cond_beta", 2*N) -neuron_even = [] -neuron_odd = [] -for i in range(N): - neuron_even.append(neuron[2*i]) - neuron_odd.append(neuron[2*i+1]) - -even_to_odd_delay = [] -even_to_odd_weight = [] -odd_to_even_delay = [] -odd_to_even_weight = [] - -for isrc in range(N): - ise = 2*isrc - iso = 2*isrc + 1 - - for itgt in range(N): - ite = 2*itgt - ito = 2*itgt + 1 - even_to_odd_delay.append(2.0*N*ise + ito) - even_to_odd_weight.append(100.0*(2.0*N*ise + ito)) - odd_to_even_delay.append(2.0*N*iso + ite) - odd_to_even_weight.append(100.0*(2.0*N*iso + ite)) - - -conn_dict={"rule": "all_to_all"} -even_to_odd_syn_dict={ - "weight_array":even_to_odd_weight, - "delay_array":even_to_odd_delay} - -odd_to_even_syn_dict={ - "weight_array":odd_to_even_weight, - "delay_array":odd_to_even_delay} - -ngpu.Connect(neuron_even, neuron_odd, conn_dict, even_to_odd_syn_dict); -ngpu.Connect(neuron_odd, neuron_even, conn_dict, odd_to_even_syn_dict); - -ngpu.Calibrate() -# Even to all -conn_id = ngpu.GetConnections(neuron_even, neuron) -conn_status_dict = ngpu.GetStatus(conn_id) -print("########################################") -print("Even to all") -for i in range(len(conn_status_dict)): - print (conn_status_dict[i]) -print() -print() - -# Even to all weight, delay -conn_status_dict = ngpu.GetStatus(conn_id, ["weight", "delay"]) -print("########################################") -print("Even 
to all weight, delat") -for i in range(len(conn_status_dict)): - print (conn_status_dict[i]) -print() -print() - -conn_id = ngpu.GetConnections(neuron, neuron_odd) -conn_status_dict = ngpu.GetStatus(conn_id) -print("########################################") -print("All to odd") -for i in range(len(conn_status_dict)): - print (conn_status_dict[i]) -print() -print() - -# Even to 3,4,5,6 -neuron_3_6 = neuron[3:7] -conn_id = ngpu.GetConnections(neuron_even, neuron_3_6) -conn_status_dict = ngpu.GetStatus(conn_id) -print("########################################") -print("Even to 3,4,5,6") -for i in range(len(conn_status_dict)): - print (conn_status_dict[i]) -print() -print() - - -# 3,4,5,6 to odd -conn_id = ngpu.GetConnections(neuron_3_6, neuron_odd) -conn_status_dict = ngpu.GetStatus(conn_id) -print("########################################") -print("3,4,5,6 to odd") -for i in range(len(conn_status_dict)): - print (conn_status_dict[i]) -print() -print() - - diff --git a/pythonlib/nestgpu.py b/pythonlib/nestgpu.py index 3570f2dcf..890027ee8 100644 --- a/pythonlib/nestgpu.py +++ b/pythonlib/nestgpu.py @@ -48,7 +48,7 @@ class NodeSeq(object): def __init__(self, i0, n=1): if i0 == None: i0 = 0 - n = -1 + n = 0 # -1 self.i0 = i0 self.n = n @@ -1394,7 +1394,6 @@ def SetNeuronStatus(nodes, var_name, val): if (type(nodes)!=list) & (type(nodes)!=tuple) & (type(nodes)!=NodeSeq): raise ValueError("Unknown node type") if (type(val)==dict): - # print("pok0") if ((type(nodes)==NodeSeq and (IsNeuronScalParam(nodes.i0, var_name) or IsNeuronScalVar(nodes.i0, var_name) @@ -1404,10 +1403,8 @@ def SetNeuronStatus(nodes, var_name, val): or IsNeuronScalVar(nodes[0], var_name) or IsNeuronPortParam(nodes[0], var_name) or IsNeuronPortVar(nodes[0], var_name)): - # print("pok1") for dict_param_name in val: pval = val[dict_param_name] - # print("pok2 ", dict_param_name, pval) if dict_param_name=="array": arr = (ctypes.c_float * len(pval))(*pval) array_pt = ctypes.cast(arr, ctypes.c_void_p) @@ 
-1417,7 +1414,6 @@ def SetNeuronStatus(nodes, var_name, val): elif dict_param_name=="distribution": distr_idx = distribution_dict[pval] SetDistributionIntParam("distr_idx", distr_idx) - # print("pok3 distr_idx", distr_idx) else: if IsDistributionFloatParam(dict_param_name): if ((type(nodes)==NodeSeq @@ -1425,11 +1421,8 @@ def SetNeuronStatus(nodes, var_name, val): or IsNeuronScalVar(nodes.i0, var_name))) or IsNeuronScalParam(nodes[0], var_name) or IsNeuronScalVar(nodes[0], var_name)): - # print("pok4") SetDistributionIntParam("vect_size", 1) - # print("pok5 ", dict_param_name, pval) SetDistributionScalParam(dict_param_name, pval) - # print("pok6 ", dict_param_name, pval) elif ((type(nodes)==NodeSeq and (IsNeuronPortParam(nodes.i0, var_name) or IsNeuronPortVar(nodes.i0, var_name))) @@ -1442,16 +1435,12 @@ def SetNeuronStatus(nodes, var_name, val): else: print("Parameter name: ", dict_param_name) raise ValueError("Unknown distribution parameter") - # print("pok7") # set values from array or from distribution if type(nodes)==NodeSeq: - # print("pok8") if IsNeuronScalParam(nodes.i0, var_name): SetNeuronScalParamDistr(nodes.i0, nodes.n, var_name) elif IsNeuronScalVar(nodes.i0, var_name): - # print("pok9") SetNeuronScalVarDistr(nodes.i0, nodes.n, var_name) - # print("pok10") elif IsNeuronPortParam(nodes.i0, var_name): SetNeuronPortParamDistr(nodes.i0, nodes.n, var_name) elif IsNeuronPortVar(nodes.i0, var_name): @@ -1622,6 +1611,42 @@ def HostNum(): return ret +NESTGPU_getCUDAMemHostUsed = _nestgpu.NESTGPU_getCUDAMemHostUsed +NESTGPU_getCUDAMemHostUsed.restype = ctypes.c_size_t +def getCUDAMemHostUsed(): + "Get CUDA memory currently used by this host" + ret = NESTGPU_getCUDAMemHostUsed() + if GetErrorCode() != 0: + raise ValueError(GetErrorMessage()) + return ret + +NESTGPU_getCUDAMemHostPeak = _nestgpu.NESTGPU_getCUDAMemHostPeak +NESTGPU_getCUDAMemHostPeak.restype = ctypes.c_size_t +def getCUDAMemHostPeak(): + "Get maximum CUDA memory used by this host" + ret = 
NESTGPU_getCUDAMemHostPeak() + if GetErrorCode() != 0: + raise ValueError(GetErrorMessage()) + return ret + +NESTGPU_getCUDAMemTotal = _nestgpu.NESTGPU_getCUDAMemTotal +NESTGPU_getCUDAMemTotal.restype = ctypes.c_size_t +def getCUDAMemTotal(): + "Get total CUDA memory" + ret = NESTGPU_getCUDAMemTotal() + if GetErrorCode() != 0: + raise ValueError(GetErrorMessage()) + return ret + +NESTGPU_getCUDAMemFree = _nestgpu.NESTGPU_getCUDAMemFree +NESTGPU_getCUDAMemFree.restype = ctypes.c_size_t +def getCUDAMemFree(): + "Get free CUDA memory" + ret = NESTGPU_getCUDAMemFree() + if GetErrorCode() != 0: + raise ValueError(GetErrorMessage()) + return ret + NESTGPU_MpiFinalize = _nestgpu.NESTGPU_MpiFinalize NESTGPU_MpiFinalize.restype = ctypes.c_int def MpiFinalize(): @@ -2220,7 +2245,7 @@ def GetConnections(source=None, target=None, syn_group=-1): NESTGPU_GetConnectionStatus = _nestgpu.NESTGPU_GetConnectionStatus NESTGPU_GetConnectionStatus.argtypes = (c_int64_p, ctypes.c_int64, c_int_p, c_int_p, - c_int_p, c_char_p, + c_int_p, c_int_p, c_float_p, c_float_p) NESTGPU_GetConnectionStatus.restype = ctypes.c_int def GetConnectionStatus(conn): @@ -2237,14 +2262,13 @@ def GetConnectionStatus(conn): i_source = (ctypes.c_int * n_conn)() i_target = (ctypes.c_int * n_conn)() i_port = (ctypes.c_int * n_conn)() - i_syn_group = (ctypes.c_char * n_conn)() + i_syn_group = (ctypes.c_int * n_conn)() delay = (ctypes.c_float * n_conn)() weight = (ctypes.c_float * n_conn)() NESTGPU_GetConnectionStatus(conn_arr, n_conn, i_source, i_target, i_port, i_syn_group, delay, weight) - status_list = [] for i in range(n_conn): status_dict = {} @@ -2257,9 +2281,10 @@ def GetConnectionStatus(conn): status_dict["weight"] = weight[i] status_list.append(status_dict) - + return status_list + NESTGPU_IsConnectionFloatParam = _nestgpu.NESTGPU_IsConnectionFloatParam NESTGPU_IsConnectionFloatParam.argtypes = (c_char_p,) NESTGPU_IsConnectionFloatParam.restype = ctypes.c_int @@ -2972,7 +2997,7 @@ def 
IsIntParam(param_name): c_param_name = ctypes.create_string_buffer(to_byte_str(param_name), len(param_name)+1) - ret = (NESTGPU_IsIntParam(c_param_name)!=0) + ret = (NESTGPU_IsIntParam(c_param_name)!=0) if GetErrorCode() != 0: raise ValueError(GetErrorMessage()) return ret diff --git a/src/.clang-format b/src/.clang-format new file mode 100644 index 000000000..c40becf81 --- /dev/null +++ b/src/.clang-format @@ -0,0 +1,124 @@ +Language: Cpp +AccessModifierOffset: -2 +AlignAfterOpenBracket: DontAlign +AlignConsecutiveMacros: false +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: false +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: All +AlwaysBreakAfterReturnType: AllDefinitions +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterCaseLabel: true + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: false + AfterExternBlock: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: NonAssignment +BreakBeforeBraces: Allman +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeComma +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true 
+ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 2 +ContinuationIndentWidth: 2 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: false +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertBraces: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 2 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: true +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: true +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: true +SpacesInParentheses: true +SpacesInSquareBrackets: true +Standard: Cpp03 +StatementMacros: + - Q_UNUSED + - 
QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never diff --git a/src/.clang-tidy b/src/.clang-tidy new file mode 100644 index 000000000..e02b1c455 --- /dev/null +++ b/src/.clang-tidy @@ -0,0 +1 @@ +Checks: '-*,modernize-use-nullptr,modernize-use-override,bugprone,modernize-redundant-void-arg' diff --git a/src/.clang-tidy-ignore b/src/.clang-tidy-ignore new file mode 100644 index 000000000..fbb2a9235 --- /dev/null +++ b/src/.clang-tidy-ignore @@ -0,0 +1 @@ +*.cuh diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3bae95d0e..ee83c35fe 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -45,6 +45,8 @@ base_neuron.h connect.h connect_rules.h connect_spec.h +conn12b.h +conn16b.h copass_kernels.h copass_sort.h cuda_error.h @@ -112,6 +114,8 @@ aeif_psc_exp_multisynapse.cu base_neuron.cu connect.cu connect_rules.cu +conn12b.cu +conn16b.cu copass_kernels.cu copass_sort.cu distribution.cu diff --git a/src/aeif_cond_alpha.cu b/src/aeif_cond_alpha.cu index 43509eb3e..058e80583 100644 --- a/src/aeif_cond_alpha.cu +++ b/src/aeif_cond_alpha.cu @@ -20,25 +20,20 @@ * */ - - - - -#include -#include -#include +#include "aeif_cond_alpha.h" #include "aeif_cond_alpha_kernel.h" #include "rk5.h" -#include "aeif_cond_alpha.h" +#include +#include +#include namespace aeif_cond_alpha_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - aeif_cond_alpha_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_cond_alpha_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; V_th = -50.4; Delta_T = 2.0; @@ -67,11 +62,10 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, g1_in = 0; } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_cond_alpha_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* 
param, aeif_cond_alpha_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; refractory_step = 0; // use normalization for alpha function @@ -79,28 +73,27 @@ void NodeCalibrate(int n_var, int n_param, double x, float *y, g0_in = M_E / tau_syn_in; } -} - -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, aeif_cond_alpha_rk5 data_struct) +} // namespace aeif_cond_alpha_ns + +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_cond_alpha_rk5 data_struct ) { - aeif_cond_alpha_ns::NodeInit(n_var, n_param, x, y, param, data_struct); + aeif_cond_alpha_ns::NodeInit( n_var, n_param, x, y, param, data_struct ); } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_cond_alpha_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_cond_alpha_rk5 data_struct ) { - aeif_cond_alpha_ns::NodeCalibrate(n_var, n_param, x, y, param, data_struct); + aeif_cond_alpha_ns::NodeCalibrate( n_var, n_param, x, y, param, data_struct ); } using namespace aeif_cond_alpha_ns; -int aeif_cond_alpha::Init(int i_node_0, int n_node, int n_port, - int i_group) { - BaseNeuron::Init(i_node_0, n_node, n_port, i_group); +int +aeif_cond_alpha::Init( int i_node_0, int n_node, int n_port, int i_group ) +{ + BaseNeuron::Init( i_node_0, n_node, n_port, i_group ); node_type_ = i_aeif_cond_alpha_model; n_scal_var_ = N_SCAL_VAR; n_var_ = n_scal_var_; @@ -108,45 +101,48 @@ int aeif_cond_alpha::Init(int i_node_0, int n_node, int n_port, n_param_ = n_scal_param_; n_group_param_ = N_GROUP_PARAM; - group_param_ = new float[N_GROUP_PARAM]; - + group_param_ = new float[ N_GROUP_PARAM ]; + scal_var_name_ = aeif_cond_alpha_scal_var_name; scal_param_name_ = aeif_cond_alpha_scal_param_name; group_param_name_ = aeif_cond_alpha_group_param_name; - 
//rk5_data_struct_.node_type_ = i_aeif_cond_alpha_model; + // rk5_data_struct_.node_type_ = i_aeif_cond_alpha_model; rk5_data_struct_.i_node_0_ = i_node_0_; - SetGroupParam("h_min_rel", 1.0e-3); - SetGroupParam("h0_rel", 1.0e-2); - h_ = h0_rel_* 0.1; - - rk5_.Init(n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_); + SetGroupParam( "h_min_rel", 1.0e-3 ); + SetGroupParam( "h0_rel", 1.0e-2 ); + h_ = h0_rel_ * 0.1; + + rk5_.Init( n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_ ); var_arr_ = rk5_.GetYArr(); param_arr_ = rk5_.GetParamArr(); - port_weight_arr_ = GetParamArr() + GetScalParamIdx("g0_ex"); + port_weight_arr_ = GetParamArr() + GetScalParamIdx( "g0_ex" ); port_weight_arr_step_ = n_param_; port_weight_port_step_ = 1; - port_input_arr_ = GetVarArr() + GetScalVarIdx("g1_ex"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "g1_ex" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 1; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); return 0; } -int aeif_cond_alpha::Calibrate(double time_min, float time_resolution) +int +aeif_cond_alpha::Calibrate( double time_min, float time_resolution ) { - h_min_ = h_min_rel_* time_resolution; - h_ = h0_rel_* time_resolution; - rk5_.Calibrate(time_min, h_, rk5_data_struct_); - + h_min_ = h_min_rel_ * time_resolution; + h_ = h0_rel_ * time_resolution; + rk5_.Calibrate( time_min, h_, rk5_data_struct_ ); + return 0; } -int aeif_cond_alpha::Update(long long it, double t1) { - rk5_.Update(t1, h_min_, rk5_data_struct_); +int +aeif_cond_alpha::Update( long long it, double t1 ) +{ + rk5_.Update< N_SCAL_VAR, N_SCAL_PARAM >( t1, h_min_, rk5_data_struct_ ); return 0; } diff --git a/src/aeif_cond_alpha.h b/src/aeif_cond_alpha.h index a89575a1f..46a14a52b 100644 --- a/src/aeif_cond_alpha.h +++ b/src/aeif_cond_alpha.h @@ -20,22 +20,19 @@ * */ - - - - #ifndef AEIFCONDALPHA_H #define AEIFCONDALPHA_H -#include -#include -#include "cuda_error.h" 
-#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include -/* BeginUserDocs: neuron, integrate-and-fire, adaptive threshold, conductance-based +/* BeginUserDocs: neuron, integrate-and-fire, adaptive threshold, +conductance-based Short description +++++++++++++++++ @@ -45,7 +42,7 @@ Conductance-based adaptive exponential integrate-and-fire neuron model Description +++++++++++ -``aeif_cond_alpha`` is a conductance-based adaptive exponential +``aeif_cond_alpha`` is a conductance-based adaptive exponential integrate-and-fire neuron model according to [1]_ with synaptic conductance modeled by an alpha function, as described in [2]_ @@ -56,7 +53,8 @@ The membrane potential is given by the following differential equation: .. math:: - C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T \exp\left(\frac{V-V_{th}}{\Delta_T}\right) + C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T +\exp\left(\frac{V-V_{th}}{\Delta_T}\right) + g_{ex}(t) (V - E_{rev\_ ex,i}) + g_{in}(t) (V - E_{rev\_ in,i}) - w + I_e The differential equation for the spike-adaptation current `w` is @@ -71,8 +69,9 @@ When the neuron fires a spike, the adaptation current :math:`w <- w + b`. Although this is not multisynapse, the port (excitatory or inhibitory) to be chosen must be specified using the synapse property ``receptor``. - The excitatory port has index 0, whereas the inhibitory one has index 1. Differently from - NEST, the connection weights related to the inhibitory port must be positive. + The excitatory port has index 0, whereas the inhibitory one has index 1. +Differently from NEST, the connection weights related to the inhibitory port +must be positive. Parameters ++++++++++ @@ -111,21 +110,23 @@ The following parameters can be set in the status dictionary. 
tau_w ms Adaptation time constant ======== ======= ================================== -=========== ============= ======================================================== +=========== ============= +======================================================== **Synaptic parameters** ---------------------------------------------------------------------------------- E_rev_ex mV Excitatory reversal potential E_rev_in mV Inhibitory reversal potential tau_syn_ex ms Time constant of excitatory synaptic conductance tau_syn_in ms Time constant of inhibitory synaptic conductance -=========== ============= ======================================================== +=========== ============= +======================================================== ============= ======= ========================================================= **Integration parameters** ------------------------------------------------------------------------------- -h0_rel real Starting step in ODE integration relative to time +h0_rel real Starting step in ODE integration relative to time resolution -h_min_rel real Minimum step in ODE integration relative to time +h_min_rel real Minimum step in ODE integration relative to time resolution ============= ======= ========================================================= @@ -148,7 +149,7 @@ aeif_cond_alpha_multisynapse, aeif_cond_beta EndUserDocs */ -//#define MAX_PORT_NUM 20 +// #define MAX_PORT_NUM 20 struct aeif_cond_alpha_rk5 { @@ -157,30 +158,32 @@ struct aeif_cond_alpha_rk5 class aeif_cond_alpha : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< aeif_cond_alpha_rk5 > rk5_; float h_min_; float h_; aeif_cond_alpha_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + + int Init( int i_node_0, int n_neuron, int 
n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/aeif_cond_alpha_kernel.h b/src/aeif_cond_alpha_kernel.h index 52a31ec17..e638b9fb0 100644 --- a/src/aeif_cond_alpha_kernel.h +++ b/src/aeif_cond_alpha_kernel.h @@ -20,26 +20,23 @@ * */ - - - - #ifndef AEIFCONDALPHAKERNEL_H #define AEIFCONDALPHAKERNEL_H #include - //#include -#include "spike_buffer.h" -#include "node_group.h" +// #include #include "aeif_cond_alpha.h" +#include "node_group.h" +#include "spike_buffer.h" -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace aeif_cond_alpha_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_g_ex = 0, i_g_in, i_g1_ex, @@ -49,7 +46,8 @@ enum ScalVarIndexes { N_SCAL_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_g0_ex = 0, i_g0_in, i_E_rev_ex, @@ -73,22 +71,16 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. 
relative to time resolution N_GROUP_PARAM }; -const std::string aeif_cond_alpha_scal_var_name[N_SCAL_VAR] = { - "g_ex", - "g_in", - "g1_ex", - "g1_in", - "V_m", - "w" -}; +const std::string aeif_cond_alpha_scal_var_name[ N_SCAL_VAR ] = { "g_ex", "g_in", "g1_ex", "g1_in", "V_m", "w" }; -const std::string aeif_cond_alpha_scal_param_name[N_SCAL_PARAM] = { +const std::string aeif_cond_alpha_scal_param_name[ N_SCAL_PARAM ] = { "g0_ex", "g0_in", "E_rev_ex", @@ -111,75 +103,69 @@ const std::string aeif_cond_alpha_scal_param_name[N_SCAL_PARAM] = { "den_delay", }; -const std::string aeif_cond_alpha_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string aeif_cond_alpha_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! // -#define g_ex y[i_g_ex] -#define g1_ex y[i_g1_ex] -#define g_in y[i_g_in] -#define g1_in y[i_g1_in] -#define V_m y[i_V_m] -#define w y[i_w] - -#define dg_exdt dydx[i_g_ex] -#define dg1_exdt dydx[i_g1_ex] -#define dg_indt dydx[i_g_in] -#define dg1_indt dydx[i_g1_in] -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] - -#define g0_ex param[i_g0_ex] -#define g0_in param[i_g0_in] -#define E_rev_ex param[i_E_rev_ex] -#define E_rev_in param[i_E_rev_in] -#define tau_syn_ex param[i_tau_syn_ex] -#define tau_syn_in param[i_tau_syn_in] -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> 
-__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - aeif_cond_alpha_rk5 data_struct) +#define g_ex y[ i_g_ex ] +#define g1_ex y[ i_g1_ex ] +#define g_in y[ i_g_in ] +#define g1_in y[ i_g1_in ] +#define V_m y[ i_V_m ] +#define w y[ i_w ] + +#define dg_exdt dydx[ i_g_ex ] +#define dg1_exdt dydx[ i_g1_ex ] +#define dg_indt dydx[ i_g_in ] +#define dg1_indt dydx[ i_g1_in ] +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] + +#define g0_ex param[ i_g0_ex ] +#define g0_in param[ i_g0_in ] +#define E_rev_ex param[ i_E_rev_ex ] +#define E_rev_in param[ i_E_rev_in ] +#define tau_syn_ex param[ i_tau_syn_ex ] +#define tau_syn_in param[ i_tau_syn_in ] +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_cond_alpha_rk5 data_struct ) { float I_syn_ex = 0.0; float I_syn_in = 0.0; - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); + float V = ( refractory_step > 0 ) ? V_reset : MIN( V_m, V_peak ); - I_syn_ex += g_ex*(E_rev_ex - V); - I_syn_in += g_in*(E_rev_in - V); + I_syn_ex += g_ex * ( E_rev_ex - V ); + I_syn_in += g_in * ( E_rev_in - V ); - float V_spike = Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn_ex + I_syn_in - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 
0 : ( -g_L * ( V - E_L - V_spike ) + I_syn_ex + I_syn_in - w + I_e ) / C_m; // Adaptation current w. - dwdt = (a*(V - E_L) - w) / tau_w; + dwdt = ( a * ( V - E_L ) - w ) / tau_w; // Synaptic conductance derivative dg1_exdt = -g1_ex / tau_syn_ex; dg_exdt = g1_ex - g_ex / tau_syn_ex; @@ -187,67 +173,65 @@ __device__ dg_indt = g1_in - g_in / tau_syn_in; } - template //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - aeif_cond_alpha_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_cond_alpha_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)::round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) ::round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } +}; // namespace aeif_cond_alpha_ns -}; - -int Update(long long it, double t1); +int Update( long long it, double t1 ); -template 
-__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_cond_alpha_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_cond_alpha_rk5 data_struct ) { - aeif_cond_alpha_ns::Derivatives(x, y, dydx, param, - data_struct); + aeif_cond_alpha_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_cond_alpha_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_cond_alpha_rk5 data_struct ) { - aeif_cond_alpha_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + aeif_cond_alpha_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/aeif_cond_alpha_multisynapse.cu b/src/aeif_cond_alpha_multisynapse.cu index 73d7ce656..095fbd20f 100644 --- a/src/aeif_cond_alpha_multisynapse.cu +++ b/src/aeif_cond_alpha_multisynapse.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include +#include "aeif_cond_alpha_multisynapse.h" #include "aeif_cond_alpha_multisynapse_kernel.h" #include "rk5.h" -#include "aeif_cond_alpha_multisynapse.h" +#include +#include +#include namespace aeif_cond_alpha_multisynapse_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - aeif_cond_alpha_multisynapse_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_cond_alpha_multisynapse_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,54 +49,54 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, 
V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; - for (int i = 0; i -int aeif_cond_alpha_multisynapse::UpdateNR<0>(long long it, double t1) +int +aeif_cond_alpha_multisynapse::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int aeif_cond_alpha_multisynapse::Update(long long it, double t1) { - UpdateNR(it, t1); +int +aeif_cond_alpha_multisynapse::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/aeif_cond_alpha_multisynapse.h b/src/aeif_cond_alpha_multisynapse.h index 0a4b2074f..fa3e856fb 100644 --- a/src/aeif_cond_alpha_multisynapse.h +++ b/src/aeif_cond_alpha_multisynapse.h @@ -20,23 +20,19 @@ * */ - - - - #ifndef AEIFCONDALPHAMULTISYNAPSE_H #define AEIFCONDALPHAMULTISYNAPSE_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include - -/* BeginUserDocs: neuron, integrate-and-fire, adaptive threshold, conductance-based +/* BeginUserDocs: neuron, integrate-and-fire, adaptive threshold, +conductance-based Short description +++++++++++++++++ @@ -46,7 +42,7 @@ Conductance-based adaptive exponential integrate-and-fire neuron model Description +++++++++++ -``aeif_cond_alpha_multisynapse`` is a conductance-based adaptive exponential +``aeif_cond_alpha_multisynapse`` is a conductance-based adaptive exponential integrate-and-fire neuron model according to [1]_ with multiple synaptic time constants, and synaptic conductance modeled by an alpha function. @@ -61,7 +57,8 @@ The membrane potential is given by the following differential equation: .. 
math:: - C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T \exp\left(\frac{V-V_{th}}{\Delta_T}\right) + C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T +\exp\left(\frac{V-V_{th}}{\Delta_T}\right) + I_{syn_{tot}}(V, t)- w + I_e where @@ -82,12 +79,13 @@ When the neuron fires a spike, the adaptation current :math:`w <- w + b`. .. note:: - The number of receptor ports must be specified at neuron creation (default value is 1) and - the receptor index starts from 0 (and not from 1 as in NEST multisynapse models). - The time constants are supplied by an array, ``tau_syn``, and the pertaining - synaptic reversal potentials are supplied by the array ``E_rev``. Port numbers - are automatically assigned in the range 0 to ``n_receptors-1``. - During connection, the ports are selected with the synapse property ``receptor``. + The number of receptor ports must be specified at neuron creation (default +value is 1) and the receptor index starts from 0 (and not from 1 as in NEST +multisynapse models). The time constants are supplied by an array, ``tau_syn``, +and the pertaining synaptic reversal potentials are supplied by the array +``E_rev``. Port numbers are automatically assigned in the range 0 to +``n_receptors-1``. During connection, the ports are selected with the synapse +property ``receptor``. 
Parameters ++++++++++ @@ -134,9 +132,9 @@ tau_syn list of ms Time constant of synaptic conductance ============= ======= ========================================================= **Integration parameters** ------------------------------------------------------------------------------- -h0_rel real Starting step in ODE integration relative to time +h0_rel real Starting step in ODE integration relative to time resolution -h_min_rel real Minimum step in ODE integration relative to time +h_min_rel real Minimum step in ODE integration relative to time resolution ============= ======= ========================================================= @@ -159,7 +157,6 @@ aeif_cond_beta_multisynapse EndUserDocs */ - #define MAX_PORT_NUM 20 struct aeif_cond_alpha_multisynapse_rk5 @@ -169,29 +166,32 @@ struct aeif_cond_alpha_multisynapse_rk5 class aeif_cond_alpha_multisynapse : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< aeif_cond_alpha_multisynapse_rk5 > rk5_; float h_min_; float h_; aeif_cond_alpha_multisynapse_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git 
a/src/aeif_cond_alpha_multisynapse_kernel.h b/src/aeif_cond_alpha_multisynapse_kernel.h index b4d206f85..7c80e1888 100644 --- a/src/aeif_cond_alpha_multisynapse_kernel.h +++ b/src/aeif_cond_alpha_multisynapse_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef AEIFCONDALPHAMULTISYNAPSEKERNEL_H #define AEIFCONDALPHAMULTISYNAPSEKERNEL_H #include - //#include -#include "spike_buffer.h" -#include "node_group.h" +// #include #include "aeif_cond_alpha_multisynapse.h" +#include "node_group.h" +#include "spike_buffer.h" -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace aeif_cond_alpha_multisynapse_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_g = 0, i_g1, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -69,31 +68,26 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_E_rev = 0, i_tau_syn, i_g0, N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. 
relative to time resolution N_GROUP_PARAM }; -const std::string aeif_cond_alpha_multisynapse_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; +const std::string aeif_cond_alpha_multisynapse_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string aeif_cond_alpha_multisynapse_port_var_name[N_PORT_VAR] = { - "g", - "g1" -}; +const std::string aeif_cond_alpha_multisynapse_port_var_name[ N_PORT_VAR ] = { "g", "g1" }; -const std::string aeif_cond_alpha_multisynapse_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string aeif_cond_alpha_multisynapse_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -106,164 +100,158 @@ const std::string aeif_cond_alpha_multisynapse_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string aeif_cond_alpha_multisynapse_port_param_name[N_PORT_PARAM] = { - "E_rev", - "tau_syn", - "g0" -}; +const std::string aeif_cond_alpha_multisynapse_port_param_name[ N_PORT_PARAM ] = { "E_rev", "tau_syn", "g0" }; -const std::string aeif_cond_alpha_multisynapse_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string aeif_cond_alpha_multisynapse_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define w y[i_w] -#define g(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define g1(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dgdt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define dg1dt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define E_rev(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_E_rev] -#define tau_syn(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_syn] -#define g0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_g0] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - aeif_cond_alpha_multisynapse_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define g( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define g1( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dgdt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define dg1dt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define E_rev( i ) param[ 
N_SCAL_PARAM + N_PORT_PARAM * i + i_E_rev ] +#define tau_syn( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_syn ] +#define g0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_g0 ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_cond_alpha_multisynapse_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn = 0.0; - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn += g( i ) * ( E_rev( i ) - V ); } - float V_spike = Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - aeif_cond_alpha_multisynapse_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_cond_alpha_multisynapse_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)::round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) ::round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace aeif_cond_alpha_multisynapse_ns template <> -int aeif_cond_alpha_multisynapse::UpdateNR<0>(long long it, double t1); +int aeif_cond_alpha_multisynapse::UpdateNR< 0 >( long long it, double t1 ); -template -int aeif_cond_alpha_multisynapse::UpdateNR(long long it, double t1) +template < int N_PORT > +int +aeif_cond_alpha_multisynapse::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - 
const int NVAR = aeif_cond_alpha_multisynapse_ns::N_SCAL_VAR - + aeif_cond_alpha_multisynapse_ns::N_PORT_VAR*N_PORT; - const int NPARAM = aeif_cond_alpha_multisynapse_ns::N_SCAL_PARAM - + aeif_cond_alpha_multisynapse_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = aeif_cond_alpha_multisynapse_ns::N_SCAL_VAR + aeif_cond_alpha_multisynapse_ns::N_PORT_VAR * N_PORT; + const int NPARAM = + aeif_cond_alpha_multisynapse_ns::N_SCAL_PARAM + aeif_cond_alpha_multisynapse_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_cond_alpha_multisynapse_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_cond_alpha_multisynapse_rk5 data_struct ) { - aeif_cond_alpha_multisynapse_ns::Derivatives(x, y, dydx, param, - data_struct); + aeif_cond_alpha_multisynapse_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_cond_alpha_multisynapse_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_cond_alpha_multisynapse_rk5 data_struct ) { - aeif_cond_alpha_multisynapse_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + aeif_cond_alpha_multisynapse_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/aeif_cond_alpha_multisynapse_rk5.h b/src/aeif_cond_alpha_multisynapse_rk5.h index 159a18eca..96ee690e7 100644 --- a/src/aeif_cond_alpha_multisynapse_rk5.h +++ b/src/aeif_cond_alpha_multisynapse_rk5.h @@ -20,32 +20,23 @@ * */ - - - - #ifndef 
AEIFCONDALPHAMULTISYNAPSERK5_H #define AEIFCONDALPHAMULTISYNAPSERK5_H struct aeif_cond_alpha_multisynapse_rk5; +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_cond_alpha_multisynapse_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_cond_alpha_multisynapse_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_cond_alpha_multisynapse_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_cond_alpha_multisynapse_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, aeif_cond_alpha_multisynapse_rk5 data_struct); +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_cond_alpha_multisynapse_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_cond_alpha_multisynapse_rk5 data_struct); +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_cond_alpha_multisynapse_rk5 data_struct ); #endif diff --git a/src/aeif_cond_beta.cu b/src/aeif_cond_beta.cu index 5208c99a0..559a5ded9 100644 --- a/src/aeif_cond_beta.cu +++ b/src/aeif_cond_beta.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include +#include "aeif_cond_beta.h" #include "aeif_cond_beta_kernel.h" #include "rk5.h" -#include "aeif_cond_beta.h" +#include +#include +#include namespace aeif_cond_beta_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - aeif_cond_beta_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_cond_beta_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - + // int array_idx = threadIdx.x + 
blockIdx.x * blockDim.x; + V_th = -50.4; Delta_T = 2.0; g_L = 30.0; @@ -59,7 +54,7 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, tau_decay_in = 20.0; tau_rise_ex = 2.0; tau_rise_in = 2.0; - + V_m = E_L; w = 0; refractory_step = 0; @@ -69,71 +64,75 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, g1_in = 0; } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_cond_beta_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_cond_beta_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + refractory_step = 0; - + // denominator is computed here to check that it is != 0 float denom1 = tau_decay_ex - tau_rise_ex; float denom2 = 0; - if (denom1 != 0) { + if ( denom1 != 0 ) + { // peak time - float t_p = tau_decay_ex*tau_rise_ex*log(tau_decay_ex/tau_rise_ex) / denom1; + float t_p = tau_decay_ex * tau_rise_ex * log( tau_decay_ex / tau_rise_ex ) / denom1; // another denominator is computed here to check that it is != 0 - denom2 = exp(-t_p / tau_decay_ex) - exp(-t_p / tau_rise_ex); + denom2 = exp( -t_p / tau_decay_ex ) - exp( -t_p / tau_rise_ex ); } - if (denom2 == 0) { // if rise time == decay time use alpha function + if ( denom2 == 0 ) + { // if rise time == decay time use alpha function // use normalization for alpha function in this case g0_ex = M_E / tau_decay_ex; } - else { // if rise time != decay time use beta function + else + { // if rise time != decay time use beta function // normalization factor for conductance g0_ex = ( 1. / tau_rise_ex - 1. 
/ tau_decay_ex ) / denom2; } denom1 = tau_decay_in - tau_rise_in; denom2 = 0; - if (denom1 != 0) { + if ( denom1 != 0 ) + { // peak time - float t_p = tau_decay_in*tau_rise_in*log(tau_decay_in/tau_rise_in) / denom1; + float t_p = tau_decay_in * tau_rise_in * log( tau_decay_in / tau_rise_in ) / denom1; // another denominator is computed here to check that it is != 0 - denom2 = exp(-t_p / tau_decay_in) - exp(-t_p / tau_rise_in); + denom2 = exp( -t_p / tau_decay_in ) - exp( -t_p / tau_rise_in ); } - if (denom2 == 0) { // if rise time == decay time use alpha function + if ( denom2 == 0 ) + { // if rise time == decay time use alpha function // use normalization for alpha function in this case g0_in = M_E / tau_decay_in; } - else { // if rise time != decay time use beta function + else + { // if rise time != decay time use beta function // normalization factor for conductance g0_in = ( 1. / tau_rise_in - 1. / tau_decay_in ) / denom2; } } -} - -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, aeif_cond_beta_rk5 data_struct) +} // namespace aeif_cond_beta_ns + +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_cond_beta_rk5 data_struct ) { - aeif_cond_beta_ns::NodeInit(n_var, n_param, x, y, param, data_struct); + aeif_cond_beta_ns::NodeInit( n_var, n_param, x, y, param, data_struct ); } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_cond_beta_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_cond_beta_rk5 data_struct ) { - aeif_cond_beta_ns::NodeCalibrate(n_var, n_param, x, y, param, data_struct); + aeif_cond_beta_ns::NodeCalibrate( n_var, n_param, x, y, param, data_struct ); } using namespace aeif_cond_beta_ns; -int aeif_cond_beta::Init(int i_node_0, int n_node, int n_port, - int i_group) { - BaseNeuron::Init(i_node_0, n_node, n_port, i_group); +int +aeif_cond_beta::Init( int i_node_0, 
int n_node, int n_port, int i_group ) +{ + BaseNeuron::Init( i_node_0, n_node, n_port, i_group ); node_type_ = i_aeif_cond_beta_model; n_scal_var_ = N_SCAL_VAR; n_scal_param_ = N_SCAL_PARAM; @@ -142,45 +141,48 @@ int aeif_cond_beta::Init(int i_node_0, int n_node, int n_port, n_var_ = n_scal_var_; n_param_ = n_scal_param_; - group_param_ = new float[N_GROUP_PARAM]; + group_param_ = new float[ N_GROUP_PARAM ]; scal_var_name_ = aeif_cond_beta_scal_var_name; scal_param_name_ = aeif_cond_beta_scal_param_name; group_param_name_ = aeif_cond_beta_group_param_name; - //rk5_data_struct_.node_type_ = i_aeif_cond_beta_model; + // rk5_data_struct_.node_type_ = i_aeif_cond_beta_model; rk5_data_struct_.i_node_0_ = i_node_0_; - SetGroupParam("h_min_rel", 1.0e-3); - SetGroupParam("h0_rel", 1.0e-2); - h_ = h0_rel_* 0.1; - - rk5_.Init(n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_); + SetGroupParam( "h_min_rel", 1.0e-3 ); + SetGroupParam( "h0_rel", 1.0e-2 ); + h_ = h0_rel_ * 0.1; + + rk5_.Init( n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_ ); var_arr_ = rk5_.GetYArr(); param_arr_ = rk5_.GetParamArr(); - port_weight_arr_ = GetParamArr() + GetScalParamIdx("g0_ex"); + port_weight_arr_ = GetParamArr() + GetScalParamIdx( "g0_ex" ); port_weight_arr_step_ = n_param_; port_weight_port_step_ = 1; - port_input_arr_ = GetVarArr() + GetScalVarIdx("g1_ex"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "g1_ex" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 1; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); return 0; } -int aeif_cond_beta::Calibrate(double time_min, float time_resolution) +int +aeif_cond_beta::Calibrate( double time_min, float time_resolution ) { - h_min_ = h_min_rel_* time_resolution; - h_ = h0_rel_* time_resolution; - rk5_.Calibrate(time_min, h_, rk5_data_struct_); - + h_min_ = h_min_rel_ * time_resolution; + h_ = h0_rel_ * time_resolution; + rk5_.Calibrate( time_min, h_, 
rk5_data_struct_ ); + return 0; } -int aeif_cond_beta::Update(long long it, double t1) { - rk5_.Update(t1, h_min_, rk5_data_struct_); +int +aeif_cond_beta::Update( long long it, double t1 ) +{ + rk5_.Update< N_SCAL_VAR, N_SCAL_PARAM >( t1, h_min_, rk5_data_struct_ ); return 0; } diff --git a/src/aeif_cond_beta.h b/src/aeif_cond_beta.h index 8721be54a..4f0c00815 100644 --- a/src/aeif_cond_beta.h +++ b/src/aeif_cond_beta.h @@ -20,23 +20,19 @@ * */ - - - - #ifndef AEIFCONDBETA_H #define AEIFCONDBETA_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include - -/* BeginUserDocs: neuron, adaptive threshold, integrate-and-fire, conductance-based +/* BeginUserDocs: neuron, adaptive threshold, integrate-and-fire, +conductance-based Short description +++++++++++++++++ @@ -46,7 +42,7 @@ Conductance-based adaptive exponential integrate-and-fire neuron model Description +++++++++++ -``aeif_cond_beta`` is a conductance-based adaptive exponential +``aeif_cond_beta`` is a conductance-based adaptive exponential integrate-and-fire neuron model according to [1]_ with synaptic conductance modeled by a beta function, as described in [2]_. @@ -57,7 +53,8 @@ The membrane potential is given by the following differential equation: .. math:: - C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T \exp\left(\frac{V-V_{th}}{\Delta_T}\right) + C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T +\exp\left(\frac{V-V_{th}}{\Delta_T}\right) + g_{ex}(t) (V - E_{rev\_ ex,i}) + g_{in}(t) (V - E_{rev\_ in,i}) - w + I_e The differential equation for the spike-adaptation current `w` is @@ -72,8 +69,9 @@ When the neuron fires a spike, the adaptation current `w <- w + b`. Although this model is not multisynapse, the port (excitatory or inhibitory) to be chosen must be specified using the synapse property ``receptor``. 
- The excitatory port has index 0, whereas the inhibitory one has index 1. Differently from - NEST, the connection weights related to the inhibitory port must be positive. + The excitatory port has index 0, whereas the inhibitory one has index 1. +Differently from NEST, the connection weights related to the inhibitory port +must be positive. Parameters ++++++++++ @@ -112,23 +110,26 @@ The following parameters can be set in the status dictionary. tau_w ms Adaptation time constant ======== ======= ================================== -============ ============= ====================================================== +============ ============= +====================================================== **Synaptic parameters** --------------------------------------------------------------------------------- E_rev_ex mV Excitatory reversal potential E_rev_in mV Inhibitory reversal potential tau_rise_ex ms Rise time constant of excitatory synaptic conductance tau_rise_in ms Rise time constant of inhibitory synaptic conductance -tau_decay_ex ms Decay time constant of excitatory synaptic conductance -tau_decay_in ms Decay time constant of inhibitory synaptic conductance -============ ============= ====================================================== +tau_decay_ex ms Decay time constant of excitatory synaptic +conductance tau_decay_in ms Decay time constant of inhibitory +synaptic conductance +============ ============= +====================================================== ========= ======= ========================================================= **Integration parameters** --------------------------------------------------------------------------- -h0_rel real Starting step in ODE integration relative to time +h0_rel real Starting step in ODE integration relative to time resolution -h_min_rel real Minimum step in ODE integration relative to time +h_min_rel real Minimum step in ODE integration relative to time resolution ========= ======= 
========================================================= @@ -151,8 +152,7 @@ aeif_cond_beta_multisynapse, aeif_cond_alpha EndUserDocs */ - -//#define MAX_PORT_NUM 20 +// #define MAX_PORT_NUM 20 struct aeif_cond_beta_rk5 { @@ -161,30 +161,32 @@ struct aeif_cond_beta_rk5 class aeif_cond_beta : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< aeif_cond_beta_rk5 > rk5_; float h_min_; float h_; aeif_cond_beta_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/aeif_cond_beta_kernel.h b/src/aeif_cond_beta_kernel.h index f324342c5..5eafb29ef 100644 --- a/src/aeif_cond_beta_kernel.h +++ b/src/aeif_cond_beta_kernel.h @@ -20,26 +20,23 @@ * */ - - - - #ifndef AEIFCONDBETAKERNEL_H #define AEIFCONDBETAKERNEL_H -#include -#include -#include "spike_buffer.h" -#include "node_group.h" #include "aeif_cond_beta.h" +#include "node_group.h" +#include "spike_buffer.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? 
( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace aeif_cond_beta_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_g_ex = 0, i_g_in, i_g1_ex, @@ -49,7 +46,8 @@ enum ScalVarIndexes { N_SCAL_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_g0_ex = 0, i_g0_in, i_E_rev_ex, @@ -75,24 +73,16 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. relative to time resolution N_GROUP_PARAM }; +const std::string aeif_cond_beta_scal_var_name[ N_SCAL_VAR ] = { "g_ex", "g_in", "g1_ex", "g1_in", "V_m", "w" }; -const std::string aeif_cond_beta_scal_var_name[N_SCAL_VAR] = { - "g_ex", - "g_in", - "g1_ex", - "g1_in", - "V_m", - "w" -}; - -const std::string aeif_cond_beta_scal_param_name[N_SCAL_PARAM] = { - "g0_ex", +const std::string aeif_cond_beta_scal_param_name[ N_SCAL_PARAM ] = { "g0_ex", "g0_in", "E_rev_ex", "E_rev_in", @@ -113,145 +103,136 @@ const std::string aeif_cond_beta_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string aeif_cond_beta_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string aeif_cond_beta_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define g_ex y[i_g_ex] -#define g1_ex y[i_g1_ex] -#define g_in y[i_g_in] -#define g1_in y[i_g1_in] -#define V_m y[i_V_m] -#define w y[i_w] - -#define dg_exdt dydx[i_g_ex] -#define dg1_exdt dydx[i_g1_ex] -#define dg_indt dydx[i_g_in] -#define dg1_indt dydx[i_g1_in] -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] - -#define g0_ex param[i_g0_ex] -#define g0_in param[i_g0_in] -#define E_rev_ex param[i_E_rev_ex] -#define E_rev_in param[i_E_rev_in] -#define tau_rise_ex param[i_tau_rise_ex] -#define tau_rise_in param[i_tau_rise_in] -#define tau_decay_ex param[i_tau_decay_ex] -#define tau_decay_in param[i_tau_decay_in] -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - aeif_cond_beta_rk5 data_struct) +#define g_ex y[ i_g_ex ] +#define g1_ex y[ i_g1_ex ] +#define g_in y[ i_g_in ] +#define g1_in y[ i_g1_in ] +#define V_m y[ i_V_m ] +#define w y[ i_w ] + +#define dg_exdt dydx[ i_g_ex ] +#define dg1_exdt dydx[ i_g1_ex ] +#define dg_indt dydx[ i_g_in ] +#define dg1_indt dydx[ i_g1_in ] +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] + +#define g0_ex param[ i_g0_ex ] +#define g0_in param[ i_g0_in ] +#define E_rev_ex param[ i_E_rev_ex ] +#define E_rev_in param[ i_E_rev_in ] +#define tau_rise_ex param[ i_tau_rise_ex ] +#define tau_rise_in param[ i_tau_rise_in ] +#define tau_decay_ex param[ i_tau_decay_ex ] +#define tau_decay_in param[ i_tau_decay_in ] +#define V_th param[ i_V_th ] +#define Delta_T 
param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_cond_beta_rk5 data_struct ) { float I_syn_in = 0.0; float I_syn_ex = 0.0; - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - I_syn_ex += g_ex*(E_rev_ex - V); - I_syn_in += g_in*(E_rev_in - V); + float V = ( refractory_step > 0 ) ? V_reset : MIN( V_m, V_peak ); + I_syn_ex += g_ex * ( E_rev_ex - V ); + I_syn_in += g_in * ( E_rev_in - V ); - float V_spike = Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn_ex + I_syn_in - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn_ex + I_syn_in - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; + dwdt = ( a * ( V - E_L ) - w ) / tau_w; dg1_exdt = -g1_ex / tau_rise_ex; dg_exdt = g1_ex - g_ex / tau_decay_ex; dg1_indt = -g1_in / tau_rise_in; dg_indt = g1_in - g_in / tau_decay_in; } - template //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - aeif_cond_beta_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_cond_beta_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } +}; // namespace aeif_cond_beta_ns -}; - -int Update(long long it, double t1); +int Update( long long it, double t1 ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_cond_beta_rk5 data_struct) +template < int NVAR, int NPARAM > 
+__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_cond_beta_rk5 data_struct ) { - aeif_cond_beta_ns::Derivatives(x, y, dydx, param, - data_struct); + aeif_cond_beta_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_cond_beta_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_cond_beta_rk5 data_struct ) { - aeif_cond_beta_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + aeif_cond_beta_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/aeif_cond_beta_multisynapse.cu b/src/aeif_cond_beta_multisynapse.cu index 93a83e070..e4aaa42fa 100644 --- a/src/aeif_cond_beta_multisynapse.cu +++ b/src/aeif_cond_beta_multisynapse.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include +#include "aeif_cond_beta_multisynapse.h" #include "aeif_cond_beta_multisynapse_kernel.h" #include "rk5.h" -#include "aeif_cond_beta_multisynapse.h" +#include +#include +#include namespace aeif_cond_beta_multisynapse_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - aeif_cond_beta_multisynapse_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_cond_beta_multisynapse_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,72 +49,73 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; - for (int i = 0; i -int aeif_cond_beta_multisynapse::UpdateNR<0>(long 
long it, double t1) +int +aeif_cond_beta_multisynapse::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int aeif_cond_beta_multisynapse::Update(long long it, double t1) { - UpdateNR(it, t1); +int +aeif_cond_beta_multisynapse::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/aeif_cond_beta_multisynapse.h b/src/aeif_cond_beta_multisynapse.h index 5c9d01ae1..3749616d1 100644 --- a/src/aeif_cond_beta_multisynapse.h +++ b/src/aeif_cond_beta_multisynapse.h @@ -20,23 +20,19 @@ * */ - - - - #ifndef AEIFCONDBETAMULTISYNAPSE_H #define AEIFCONDBETAMULTISYNAPSE_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include - -/* BeginUserDocs: neuron, adaptive threshold, integrate-and-fire, conductance-based +/* BeginUserDocs: neuron, adaptive threshold, integrate-and-fire, +conductance-based Short description +++++++++++++++++ @@ -46,7 +42,7 @@ Conductance-based adaptive exponential integrate-and-fire neuron model Description +++++++++++ -``aeif_cond_beta_multisynapse`` is a conductance-based adaptive exponential +``aeif_cond_beta_multisynapse`` is a conductance-based adaptive exponential integrate-and-fire neuron model according to [1]_ with multiple synaptic rise time and decay time constants, and synaptic conductance modeled by a beta function. @@ -61,7 +57,8 @@ The membrane potential is given by the following differential equation: .. math:: - C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T \exp\left(\frac{V-V_{th}}{\Delta_T}\right) + C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T +\exp\left(\frac{V-V_{th}}{\Delta_T}\right) + I_{syn_{tot}}(V, t)- w + I_e where: @@ -82,13 +79,14 @@ When the neuron fires a spike, the adaptation current `w <- w + b`. .. 
note:: - The number of receptor ports must be specified at neuron creation (default value is 1) and - the receptor index starts from 0 (and not from 1 as in NEST multisynapse models). - The time constants are supplied by by two arrays, ``tau_rise`` and ``tau_decay`` for - the synaptic rise time and decay time, respectively. The synaptic - reversal potentials are supplied by the array ``E_rev``. Port numbers - are automatically assigned in the range 0 to ``n_receptors-1``. - During connection, the ports are selected with the synapse property ``receptor``. + The number of receptor ports must be specified at neuron creation (default +value is 1) and the receptor index starts from 0 (and not from 1 as in NEST +multisynapse models). The time constants are supplied by two arrays, +``tau_rise`` and ``tau_decay`` for the synaptic rise time and decay time, +respectively. The synaptic reversal potentials are supplied by the array +``E_rev``. Port numbers are automatically assigned in the range 0 to +``n_receptors-1``. During connection, the ports are selected with the synapse +property ``receptor``. 
Parameters ++++++++++ @@ -136,9 +134,9 @@ tau_decay list of ms Decay time constant of synaptic conductance ========= ======= ========================================================= **Integration parameters** --------------------------------------------------------------------------- -h0_rel real Starting step in ODE integration relative to time +h0_rel real Starting step in ODE integration relative to time resolution -h_min_rel real Minimum step in ODE integration relative to time +h_min_rel real Minimum step in ODE integration relative to time resolution ========= ======= ========================================================= @@ -161,7 +159,6 @@ aeif_cond_alpha_multisynapse EndUserDocs */ - #define MAX_PORT_NUM 20 struct aeif_cond_beta_multisynapse_rk5 @@ -171,29 +168,32 @@ struct aeif_cond_beta_multisynapse_rk5 class aeif_cond_beta_multisynapse : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< aeif_cond_beta_multisynapse_rk5 > rk5_; float h_min_; float h_; aeif_cond_beta_multisynapse_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git 
a/src/aeif_cond_beta_multisynapse_kernel.h b/src/aeif_cond_beta_multisynapse_kernel.h index 798cfa871..6d0d181b9 100644 --- a/src/aeif_cond_beta_multisynapse_kernel.h +++ b/src/aeif_cond_beta_multisynapse_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef AEIFCONDBETAMULTISYNAPSEKERNEL_H #define AEIFCONDBETAMULTISYNAPSEKERNEL_H -#include -#include -#include "spike_buffer.h" -#include "node_group.h" #include "aeif_cond_beta_multisynapse.h" +#include "node_group.h" +#include "spike_buffer.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace aeif_cond_beta_multisynapse_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_g = 0, i_g1, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -69,7 +68,8 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_E_rev = 0, i_tau_rise, i_tau_decay, @@ -77,25 +77,18 @@ enum PortParamIndexes { N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. 
relative to time resolution N_GROUP_PARAM }; +const std::string aeif_cond_beta_multisynapse_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string aeif_cond_beta_multisynapse_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; - -const std::string aeif_cond_beta_multisynapse_port_var_name[N_PORT_VAR] = { - "g", - "g1" -}; +const std::string aeif_cond_beta_multisynapse_port_var_name[ N_PORT_VAR ] = { "g", "g1" }; -const std::string aeif_cond_beta_multisynapse_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string aeif_cond_beta_multisynapse_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -108,165 +101,161 @@ const std::string aeif_cond_beta_multisynapse_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string aeif_cond_beta_multisynapse_port_param_name[N_PORT_PARAM] = { - "E_rev", +const std::string aeif_cond_beta_multisynapse_port_param_name[ N_PORT_PARAM ] = { "E_rev", "tau_rise", "tau_decay", - "g0" -}; + "g0" }; -const std::string aeif_cond_beta_multisynapse_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string aeif_cond_beta_multisynapse_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define w y[i_w] -#define g(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define g1(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dgdt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define dg1dt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define E_rev(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_E_rev] -#define tau_rise(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_rise] -#define tau_decay(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_decay] -#define g0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_g0] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - aeif_cond_beta_multisynapse_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define g( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define g1( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dgdt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define dg1dt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ 
i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define E_rev( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_E_rev ] +#define tau_rise( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_rise ] +#define tau_decay( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_decay ] +#define g0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_g0 ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_cond_beta_multisynapse_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn = 0.0; - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn += g( i ) * ( E_rev( i ) - V ); } - float V_spike = Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - aeif_cond_beta_multisynapse_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_cond_beta_multisynapse_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace aeif_cond_beta_multisynapse_ns template <> -int aeif_cond_beta_multisynapse::UpdateNR<0>(long long it, double t1); +int aeif_cond_beta_multisynapse::UpdateNR< 0 >( long long it, double t1 ); -template -int aeif_cond_beta_multisynapse::UpdateNR(long long it, double t1) +template < int N_PORT > +int +aeif_cond_beta_multisynapse::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int 
NVAR = aeif_cond_beta_multisynapse_ns::N_SCAL_VAR - + aeif_cond_beta_multisynapse_ns::N_PORT_VAR*N_PORT; - const int NPARAM = aeif_cond_beta_multisynapse_ns::N_SCAL_PARAM - + aeif_cond_beta_multisynapse_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = aeif_cond_beta_multisynapse_ns::N_SCAL_VAR + aeif_cond_beta_multisynapse_ns::N_PORT_VAR * N_PORT; + const int NPARAM = + aeif_cond_beta_multisynapse_ns::N_SCAL_PARAM + aeif_cond_beta_multisynapse_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_cond_beta_multisynapse_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_cond_beta_multisynapse_rk5 data_struct ) { - aeif_cond_beta_multisynapse_ns::Derivatives(x, y, dydx, param, - data_struct); + aeif_cond_beta_multisynapse_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_cond_beta_multisynapse_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_cond_beta_multisynapse_rk5 data_struct ) { - aeif_cond_beta_multisynapse_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + aeif_cond_beta_multisynapse_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/aeif_cond_beta_multisynapse_rk5.h b/src/aeif_cond_beta_multisynapse_rk5.h index 543ed5879..9bb14f932 100644 --- a/src/aeif_cond_beta_multisynapse_rk5.h +++ b/src/aeif_cond_beta_multisynapse_rk5.h @@ -20,32 +20,23 @@ * */ - - - - #ifndef AEIFCONDBETAMULTISYNAPSERK5_H #define 
AEIFCONDBETAMULTISYNAPSERK5_H struct aeif_cond_beta_multisynapse_rk5; +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_cond_beta_multisynapse_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_cond_beta_multisynapse_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_cond_beta_multisynapse_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_cond_beta_multisynapse_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, aeif_cond_beta_multisynapse_rk5 data_struct); +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_cond_beta_multisynapse_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_cond_beta_multisynapse_rk5 data_struct); +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_cond_beta_multisynapse_rk5 data_struct ); #endif diff --git a/src/aeif_psc_alpha.cu b/src/aeif_psc_alpha.cu index 1cf89bc48..d2a51cf7d 100644 --- a/src/aeif_psc_alpha.cu +++ b/src/aeif_psc_alpha.cu @@ -20,25 +20,20 @@ * */ - - - - -#include -#include -#include +#include "aeif_psc_alpha.h" #include "aeif_psc_alpha_kernel.h" #include "rk5.h" -#include "aeif_psc_alpha.h" +#include +#include +#include namespace aeif_psc_alpha_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - aeif_psc_alpha_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_psc_alpha_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; V_th = -50.4; Delta_T = 
2.0; @@ -53,7 +48,7 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0.0; refractory_step = 0; @@ -65,89 +60,91 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, tau_syn_in = 2.0; } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_alpha_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_psc_alpha_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; refractory_step = 0; // set the right threshold depending on Delta_T - if (Delta_T <= 0.0) { + if ( Delta_T <= 0.0 ) + { V_peak = V_th; // same as IAF dynamics for spikes if Delta_T == 0. } I0_ex = M_E / tau_syn_ex; I0_in = M_E / tau_syn_in; } -} +} // namespace aeif_psc_alpha_ns -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_alpha_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_psc_alpha_rk5 data_struct ) { - aeif_psc_alpha_ns::NodeInit(n_var, n_param, x, y, param, data_struct); + aeif_psc_alpha_ns::NodeInit( n_var, n_param, x, y, param, data_struct ); } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_alpha_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_psc_alpha_rk5 data_struct ) { - aeif_psc_alpha_ns::NodeCalibrate(n_var, n_param, x, y, param, data_struct); + aeif_psc_alpha_ns::NodeCalibrate( n_var, n_param, x, y, param, data_struct ); } using namespace aeif_psc_alpha_ns; -int aeif_psc_alpha::Init(int i_node_0, int n_node, int n_port, - int i_group) { - BaseNeuron::Init(i_node_0, n_node, n_port, i_group); +int +aeif_psc_alpha::Init( int i_node_0, int n_node, int n_port, int i_group ) 
+{ + BaseNeuron::Init( i_node_0, n_node, n_port, i_group ); node_type_ = i_aeif_psc_alpha_model; n_scal_var_ = N_SCAL_VAR; n_scal_param_ = N_SCAL_PARAM; - n_group_param_ = N_GROUP_PARAM; + n_group_param_ = N_GROUP_PARAM; n_var_ = n_scal_var_; n_param_ = n_scal_param_; - group_param_ = new float[N_GROUP_PARAM]; + group_param_ = new float[ N_GROUP_PARAM ]; scal_var_name_ = aeif_psc_alpha_scal_var_name; scal_param_name_ = aeif_psc_alpha_scal_param_name; group_param_name_ = aeif_psc_alpha_group_param_name; - //rk5_data_struct_.node_type_ = i_aeif_psc_alpha_model; + // rk5_data_struct_.node_type_ = i_aeif_psc_alpha_model; rk5_data_struct_.i_node_0_ = i_node_0_; - SetGroupParam("h_min_rel", 1.0e-3); - SetGroupParam("h0_rel", 1.0e-2); - h_ = h0_rel_* 0.1; + SetGroupParam( "h_min_rel", 1.0e-3 ); + SetGroupParam( "h0_rel", 1.0e-2 ); + h_ = h0_rel_ * 0.1; - rk5_.Init(n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_); + rk5_.Init( n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_ ); var_arr_ = rk5_.GetYArr(); param_arr_ = rk5_.GetParamArr(); - port_weight_arr_ = GetParamArr() + GetScalParamIdx("I0_ex"); + port_weight_arr_ = GetParamArr() + GetScalParamIdx( "I0_ex" ); port_weight_arr_step_ = n_param_; port_weight_port_step_ = 1; - - port_input_arr_ = GetVarArr() + GetScalVarIdx("I1_syn_ex"); + + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I1_syn_ex" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 1; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); return 0; } -int aeif_psc_alpha::Calibrate(double time_min, float time_resolution) +int +aeif_psc_alpha::Calibrate( double time_min, float time_resolution ) { - h_min_ = h_min_rel_* time_resolution; - h_ = h0_rel_* time_resolution; - rk5_.Calibrate(time_min, h_, rk5_data_struct_); - + h_min_ = h_min_rel_ * time_resolution; + h_ = h0_rel_ * time_resolution; + rk5_.Calibrate( time_min, h_, rk5_data_struct_ ); + return 0; } -int 
aeif_psc_alpha::Update(long long it, double t1) { - rk5_.Update(t1, h_min_, rk5_data_struct_); +int +aeif_psc_alpha::Update( long long it, double t1 ) +{ + rk5_.Update< N_SCAL_VAR, N_SCAL_PARAM >( t1, h_min_, rk5_data_struct_ ); return 0; } diff --git a/src/aeif_psc_alpha.h b/src/aeif_psc_alpha.h index 7e2152697..52c845c68 100644 --- a/src/aeif_psc_alpha.h +++ b/src/aeif_psc_alpha.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef AEIFPSCALPHA_H #define AEIFPSCALPHA_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include /* BeginUserDocs: neuron, adaptive threshold, integrate-and-fire, current-based @@ -45,8 +41,8 @@ Current-based exponential integrate-and-fire neuron model Description +++++++++++ -``aeif_psc_alpha`` is the adaptive exponential integrate and fire neuron according -to [1]_. Synaptic currents are modeled as alpha functions. +``aeif_psc_alpha`` is the adaptive exponential integrate and fire neuron +according to [1]_. Synaptic currents are modeled as alpha functions. This implementation uses the 5th order Runge-Kutta solver with adaptive step size to integrate the differential equation. @@ -55,11 +51,12 @@ The membrane potential is given by the following differential equation: .. math:: - C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T \exp\left(\frac{V-V_{th}}{\Delta_T}\right) + C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T +\exp\left(\frac{V-V_{th}}{\Delta_T}\right) + I_{syn\_ ex}(V, t) - I_{syn\_ in}(V, t) - w + I_e -where `I_syn_ex` and `I_syn_in` are the excitatory and inhibitory synaptic currents -modeled as alpha functions. +where `I_syn_ex` and `I_syn_in` are the excitatory and inhibitory synaptic +currents modeled as alpha functions. 
The differential equation for the spike-adaptation current `w` is: @@ -71,8 +68,9 @@ The differential equation for the spike-adaptation current `w` is: Although this model is not multisynapse, the port (excitatory or inhibitory) to be chosen must be specified using the synapse property ``receptor``. - The excitatory port has index 0, whereas the inhibitory one has index 1. Differently from - NEST, the connection weights related to the inhibitory port must be positive. + The excitatory port has index 0, whereas the inhibitory one has index 1. +Differently from NEST, the connection weights related to the inhibitory port +must be positive. Parameters ++++++++++ @@ -121,9 +119,9 @@ The following parameters can be set in the status dictionary. ============= ======= ========================================================= **Integration parameters** ------------------------------------------------------------------------------- -h0_rel real Starting step in ODE integration relative to time +h0_rel real Starting step in ODE integration relative to time resolution -h_min_rel real Minimum step in ODE integration relative to time +h_min_rel real Minimum step in ODE integration relative to time resolution ============= ======= ========================================================= @@ -142,7 +140,7 @@ aeif_psc_alpha_multisynapse, iaf_psc_alpha, aeif_cond_alpha EndUserDocs */ -//#define MAX_PORT_NUM 20 +// #define MAX_PORT_NUM 20 struct aeif_psc_alpha_rk5 { @@ -151,30 +149,32 @@ struct aeif_psc_alpha_rk5 class aeif_psc_alpha : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< aeif_psc_alpha_rk5 > rk5_; float h_min_; float h_; aeif_psc_alpha_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + + int Init( int i_node_0, int 
n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/aeif_psc_alpha_kernel.h b/src/aeif_psc_alpha_kernel.h index c396c02b1..8fd503731 100644 --- a/src/aeif_psc_alpha_kernel.h +++ b/src/aeif_psc_alpha_kernel.h @@ -20,26 +20,23 @@ * */ - - - - #ifndef AEIFPSCALPHAKERNEL_H #define AEIFPSCALPHAKERNEL_H -#include -#include -#include "spike_buffer.h" -#include "node_group.h" #include "aeif_psc_alpha.h" +#include "node_group.h" +#include "spike_buffer.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace aeif_psc_alpha_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_I_syn_ex = 0, i_I_syn_in, i_I1_syn_ex, @@ -49,7 +46,8 @@ enum ScalVarIndexes { N_SCAL_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_tau_syn_ex = 0, i_tau_syn_in, i_I0_ex, @@ -71,24 +69,21 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. 
relative to time resolution N_GROUP_PARAM }; - -const std::string aeif_psc_alpha_scal_var_name[N_SCAL_VAR] = { - "I_syn_ex", +const std::string aeif_psc_alpha_scal_var_name[ N_SCAL_VAR ] = { "I_syn_ex", "I_syn_in", "I1_syn_ex", "I1_syn_in", "V_m", - "w" -}; + "w" }; -const std::string aeif_psc_alpha_scal_param_name[N_SCAL_PARAM] = { - "tau_syn_ex", +const std::string aeif_psc_alpha_scal_param_name[ N_SCAL_PARAM ] = { "tau_syn_ex", "tau_syn_in", "I0_ex", "I0_in", @@ -105,140 +100,130 @@ const std::string aeif_psc_alpha_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string aeif_psc_alpha_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string aeif_psc_alpha_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! // -#define I_syn_ex y[i_I_syn_ex] -#define I_syn_in y[i_I_syn_in] -#define I1_syn_ex y[i_I1_syn_ex] -#define I1_syn_in y[i_I1_syn_in] -#define V_m y[i_V_m] -#define w y[i_w] - -#define dI_syn_exdt dydx[i_I_syn_ex] -#define dI_syn_indt dydx[i_I_syn_in] -#define dI1_syn_exdt dydx[i_I1_syn_ex] -#define dI1_syn_indt dydx[i_I1_syn_in] -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] - -#define I0_ex param[i_I0_ex] -#define I0_in param[i_I0_in] -#define tau_syn_ex param[i_tau_syn_ex] -#define tau_syn_in param[i_tau_syn_in] -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ 
group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_alpha_rk5 data_struct) +#define I_syn_ex y[ i_I_syn_ex ] +#define I_syn_in y[ i_I_syn_in ] +#define I1_syn_ex y[ i_I1_syn_ex ] +#define I1_syn_in y[ i_I1_syn_in ] +#define V_m y[ i_V_m ] +#define w y[ i_w ] + +#define dI_syn_exdt dydx[ i_I_syn_ex ] +#define dI_syn_indt dydx[ i_I_syn_in ] +#define dI1_syn_exdt dydx[ i_I1_syn_ex ] +#define dI1_syn_indt dydx[ i_I1_syn_in ] +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] + +#define I0_ex param[ i_I0_ex ] +#define I0_in param[ i_I0_in ] +#define tau_syn_ex param[ i_tau_syn_ex ] +#define tau_syn_in param[ i_tau_syn_in ] +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_alpha_rk5 data_struct ) { float I_syn_tot = 0.0; - - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); + float V = ( refractory_step > 0 ) ? V_reset : MIN( V_m, V_peak ); I_syn_tot += I_syn_ex - I_syn_in; - float V_spike = Delta_T == 0. ? 0. : Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T == 0. ? 0. : Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn_tot - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 
0 : ( -g_L * ( V - E_L - V_spike ) + I_syn_tot - w + I_e ) / C_m; // Adaptation current w. - dwdt = (a*(V - E_L) - w) / tau_w; - dI1_syn_exdt = -I1_syn_ex/tau_syn_ex; - dI1_syn_indt = -I1_syn_in/tau_syn_in; - dI_syn_exdt = I1_syn_ex - I_syn_ex/tau_syn_ex; - dI_syn_indt = I1_syn_in - I_syn_in/tau_syn_in; + dwdt = ( a * ( V - E_L ) - w ) / tau_w; + dI1_syn_exdt = -I1_syn_ex / tau_syn_ex; + dI1_syn_indt = -I1_syn_in / tau_syn_in; + dI_syn_exdt = I1_syn_ex - I_syn_ex / tau_syn_ex; + dI_syn_indt = I1_syn_in - I_syn_in / tau_syn_in; } - template //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - aeif_psc_alpha_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_alpha_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + 
{ + refractory_step = 0; } } } } +}; // namespace aeif_psc_alpha_ns -}; +int Update( long long it, double t1 ); -int Update(long long it, double t1); - -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_alpha_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_alpha_rk5 data_struct ) { - aeif_psc_alpha_ns::Derivatives(x, y, dydx, param, - data_struct); + aeif_psc_alpha_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_psc_alpha_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_alpha_rk5 data_struct ) { - aeif_psc_alpha_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + aeif_psc_alpha_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/aeif_psc_alpha_multisynapse.cu b/src/aeif_psc_alpha_multisynapse.cu index c9c068275..09398a794 100644 --- a/src/aeif_psc_alpha_multisynapse.cu +++ b/src/aeif_psc_alpha_multisynapse.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include +#include "aeif_psc_alpha_multisynapse.h" #include "aeif_psc_alpha_multisynapse_kernel.h" #include "rk5.h" -#include "aeif_psc_alpha_multisynapse.h" +#include +#include +#include namespace aeif_psc_alpha_multisynapse_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - aeif_psc_alpha_multisynapse_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_psc_alpha_multisynapse_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - 
N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,56 +49,57 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0.0; refractory_step = 0; - for (int i = 0; i -int aeif_psc_alpha_multisynapse::UpdateNR<0>(long long it, double t1) +int +aeif_psc_alpha_multisynapse::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int aeif_psc_alpha_multisynapse::Update(long long it, double t1) { - UpdateNR(it, t1); +int +aeif_psc_alpha_multisynapse::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/aeif_psc_alpha_multisynapse.h b/src/aeif_psc_alpha_multisynapse.h index fa085d776..820dec825 100644 --- a/src/aeif_psc_alpha_multisynapse.h +++ b/src/aeif_psc_alpha_multisynapse.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef AEIFPSCALPHAMULTISYNAPSE_H #define AEIFPSCALPHAMULTISYNAPSE_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include /* BeginUserDocs: neuron, adaptive threshold, integrate-and-fire, current-based @@ -45,8 +41,8 @@ Current-based exponential integrate-and-fire neuron model Description +++++++++++ -``aeif_psc_alpha_multisynapse`` is the adaptive exponential integrate and fire neuron according -to [1]_. Synaptic currents are modeled as alpha functions. +``aeif_psc_alpha_multisynapse`` is the adaptive exponential integrate and fire +neuron according to [1]_. Synaptic currents are modeled as alpha functions. This implementation uses the 5th order Runge-Kutta solver with adaptive step size to integrate the differential equation. @@ -55,7 +51,8 @@ The membrane potential is given by the following differential equation: .. 
math:: - C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T \exp\left(\frac{V-V_{th}}{\Delta_T}\right) + C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T +\exp\left(\frac{V-V_{th}}{\Delta_T}\right) + I_{syn}(V, t)- w + I_e where ``I_syn (V,t)`` is the sum of excitatory and inhibitory synaptic currents @@ -69,11 +66,12 @@ The differential equation for the spike-adaptation current `w` is: .. note:: - The number of receptor ports must be specified at neuron creation (default value is 1) and - the receptor index starts from 0 (and not from 1 as in NEST multisynapse models). - The time constants are supplied by an array, ``tau_syn``. Port numbers - are automatically assigned in the range 0 to ``n_receptors-1``. - During connection, the ports are selected with the synapse property ``receptor``. + The number of receptor ports must be specified at neuron creation (default +value is 1) and the receptor index starts from 0 (and not from 1 as in NEST +multisynapse models). The time constants are supplied by an array, ``tau_syn``. +Port numbers are automatically assigned in the range 0 to ``n_receptors-1``. + During connection, the ports are selected with the synapse property +``receptor``. Parameters ++++++++++ @@ -120,9 +118,9 @@ The following parameters can be set in the status dictionary. 
============= ======= ========================================================= **Integration parameters** ------------------------------------------------------------------------------- -h0_rel real Starting step in ODE integration relative to time +h0_rel real Starting step in ODE integration relative to time resolution -h_min_rel real Minimum step in ODE integration relative to time +h_min_rel real Minimum step in ODE integration relative to time resolution ============= ======= ========================================================= @@ -150,29 +148,32 @@ struct aeif_psc_alpha_multisynapse_rk5 class aeif_psc_alpha_multisynapse : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< aeif_psc_alpha_multisynapse_rk5 > rk5_; float h_min_; float h_; aeif_psc_alpha_multisynapse_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/aeif_psc_alpha_multisynapse_kernel.h b/src/aeif_psc_alpha_multisynapse_kernel.h index 61a34e895..b7adae578 100644 --- a/src/aeif_psc_alpha_multisynapse_kernel.h +++ b/src/aeif_psc_alpha_multisynapse_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef 
AEIFPSCALPHAMULTISYNAPSEKERNEL_H #define AEIFPSCALPHAMULTISYNAPSEKERNEL_H -#include -#include -#include "spike_buffer.h" -#include "node_group.h" #include "aeif_psc_alpha_multisynapse.h" +#include "node_group.h" +#include "spike_buffer.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace aeif_psc_alpha_multisynapse_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_I_syn = 0, i_I1_syn, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -69,31 +68,25 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_tau_syn = 0, i_I0, N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. 
relative to time resolution N_GROUP_PARAM }; +const std::string aeif_psc_alpha_multisynapse_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string aeif_psc_alpha_multisynapse_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; - -const std::string aeif_psc_alpha_multisynapse_port_var_name[N_PORT_VAR] = { - "I_syn", - "I1_syn" -}; +const std::string aeif_psc_alpha_multisynapse_port_var_name[ N_PORT_VAR ] = { "I_syn", "I1_syn" }; -const std::string aeif_psc_alpha_multisynapse_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string aeif_psc_alpha_multisynapse_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -106,162 +99,156 @@ const std::string aeif_psc_alpha_multisynapse_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string aeif_psc_alpha_multisynapse_port_param_name[N_PORT_PARAM] = { - "tau_syn", - "I0" -}; +const std::string aeif_psc_alpha_multisynapse_port_param_name[ N_PORT_PARAM ] = { "tau_syn", "I0" }; -const std::string aeif_psc_alpha_multisynapse_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string aeif_psc_alpha_multisynapse_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define w y[i_w] -#define I_syn(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] -#define I1_syn(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_I1_syn] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dI_syndt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] -#define dI1_syndt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_I1_syn] -#define I0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_I0] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define tau_syn(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_syn] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_alpha_multisynapse_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define I_syn( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] +#define I1_syn( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_I1_syn ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dI_syndt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] +#define dI1_syndt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_I1_syn ] +#define I0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_I0 ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] 
+#define den_delay param[ i_den_delay ] + +#define tau_syn( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_syn ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_alpha_multisynapse_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn_tot = 0.0; - - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn_tot += I_syn( i ); } - float V_spike = Delta_T == 0. ? 0. : Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T == 0. ? 0. : Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn_tot - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn_tot - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - aeif_psc_alpha_multisynapse_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_alpha_multisynapse_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace aeif_psc_alpha_multisynapse_ns template <> -int aeif_psc_alpha_multisynapse::UpdateNR<0>(long long it, double t1); +int aeif_psc_alpha_multisynapse::UpdateNR< 0 >( long long it, double t1 ); -template -int aeif_psc_alpha_multisynapse::UpdateNR(long long it, double t1) +template < int N_PORT > +int +aeif_psc_alpha_multisynapse::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int 
NVAR = aeif_psc_alpha_multisynapse_ns::N_SCAL_VAR - + aeif_psc_alpha_multisynapse_ns::N_PORT_VAR*N_PORT; - const int NPARAM = aeif_psc_alpha_multisynapse_ns::N_SCAL_PARAM - + aeif_psc_alpha_multisynapse_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = aeif_psc_alpha_multisynapse_ns::N_SCAL_VAR + aeif_psc_alpha_multisynapse_ns::N_PORT_VAR * N_PORT; + const int NPARAM = + aeif_psc_alpha_multisynapse_ns::N_SCAL_PARAM + aeif_psc_alpha_multisynapse_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_alpha_multisynapse_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_alpha_multisynapse_rk5 data_struct ) { - aeif_psc_alpha_multisynapse_ns::Derivatives(x, y, dydx, param, - data_struct); + aeif_psc_alpha_multisynapse_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_psc_alpha_multisynapse_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_alpha_multisynapse_rk5 data_struct ) { - aeif_psc_alpha_multisynapse_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + aeif_psc_alpha_multisynapse_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/aeif_psc_alpha_multisynapse_rk5.h b/src/aeif_psc_alpha_multisynapse_rk5.h index 3f0c82cad..bb670abde 100644 --- a/src/aeif_psc_alpha_multisynapse_rk5.h +++ b/src/aeif_psc_alpha_multisynapse_rk5.h @@ -20,32 +20,23 @@ * */ - - - - #ifndef AEIFPSCALPHAMULTISYNAPSERK5_H #define 
AEIFPSCALPHAMULTISYNAPSERK5_H struct aeif_psc_alpha_multisynapse_rk5; +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_alpha_multisynapse_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_alpha_multisynapse_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_psc_alpha_multisynapse_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_alpha_multisynapse_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_alpha_multisynapse_rk5 data_struct); +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_psc_alpha_multisynapse_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_alpha_multisynapse_rk5 data_struct); +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_psc_alpha_multisynapse_rk5 data_struct ); #endif diff --git a/src/aeif_psc_delta.cu b/src/aeif_psc_delta.cu index 82e472cef..29e9538f1 100644 --- a/src/aeif_psc_delta.cu +++ b/src/aeif_psc_delta.cu @@ -20,25 +20,20 @@ * */ - - - - -#include -#include -#include +#include "aeif_psc_delta.h" #include "aeif_psc_delta_kernel.h" #include "rk5.h" -#include "aeif_psc_delta.h" +#include +#include +#include namespace aeif_psc_delta_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - aeif_psc_delta_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_psc_delta_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; V_th = -50.4; Delta_T = 
2.0; @@ -53,100 +48,100 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_delta_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_psc_delta_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - //int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; refractory_step = 0; // set the right threshold depending on Delta_T - if (Delta_T <= 0.0) { + if ( Delta_T <= 0.0 ) + { V_peak = V_th; // same as IAF dynamics for spikes if Delta_T == 0. } } -} +} // namespace aeif_psc_delta_ns -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_delta_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_psc_delta_rk5 data_struct ) { - aeif_psc_delta_ns::NodeInit(n_var, n_param, x, y, param, data_struct); + aeif_psc_delta_ns::NodeInit( n_var, n_param, x, y, param, data_struct ); } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_delta_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_psc_delta_rk5 data_struct ) { - aeif_psc_delta_ns::NodeCalibrate(n_var, n_param, x, y, param, data_struct); + aeif_psc_delta_ns::NodeCalibrate( n_var, n_param, x, y, param, data_struct ); } using namespace aeif_psc_delta_ns; -int aeif_psc_delta::Init(int i_node_0, int n_node, int n_port, - int i_group) { - BaseNeuron::Init(i_node_0, n_node, n_port, i_group); +int +aeif_psc_delta::Init( int i_node_0, int n_node, int n_port, int i_group ) +{ + BaseNeuron::Init( i_node_0, n_node, n_port, i_group ); 
node_type_ = i_aeif_psc_delta_model; n_scal_var_ = N_SCAL_VAR; n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; - n_var_ = n_scal_var_ + n_port_var_*n_port; - n_param_ = n_scal_param_ + n_port_param_*n_port; + n_var_ = n_scal_var_ + n_port_var_ * n_port; + n_param_ = n_scal_param_ + n_port_param_ * n_port; + + group_param_ = new float[ N_GROUP_PARAM ]; - group_param_ = new float[N_GROUP_PARAM]; - scal_var_name_ = aeif_psc_delta_scal_var_name; scal_param_name_ = aeif_psc_delta_scal_param_name; group_param_name_ = aeif_psc_delta_group_param_name; - //rk5_data_struct_.node_type_ = i_aeif_psc_delta_model; + // rk5_data_struct_.node_type_ = i_aeif_psc_delta_model; rk5_data_struct_.i_node_0_ = i_node_0_; - SetGroupParam("h_min_rel", 1.0e-3); - SetGroupParam("h0_rel", 1.0e-2); - h_ = h0_rel_* 0.1; - - rk5_.Init(n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_); + SetGroupParam( "h_min_rel", 1.0e-3 ); + SetGroupParam( "h0_rel", 1.0e-2 ); + h_ = h0_rel_ * 0.1; + + rk5_.Init( n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_ ); var_arr_ = rk5_.GetYArr(); param_arr_ = rk5_.GetParamArr(); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - port_input_arr_ = GetVarArr() + GetScalVarIdx("V_m"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "V_m" ); port_input_arr_step_ = n_var_; port_input_port_step_ = n_port_var_; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); return 0; } -int aeif_psc_delta::Calibrate(double time_min, float 
time_resolution) +int +aeif_psc_delta::Calibrate( double time_min, float time_resolution ) { - h_min_ = h_min_rel_* time_resolution; - h_ = h0_rel_* time_resolution; - rk5_.Calibrate(time_min, h_, rk5_data_struct_); - + h_min_ = h_min_rel_ * time_resolution; + h_ = h0_rel_ * time_resolution; + rk5_.Calibrate( time_min, h_, rk5_data_struct_ ); + return 0; } -int aeif_psc_delta::Update(long long it, double t1) +int +aeif_psc_delta::Update( long long it, double t1 ) { - rk5_.Update(t1, h_min_, rk5_data_struct_); - + rk5_.Update< N_SCAL_VAR, N_SCAL_PARAM >( t1, h_min_, rk5_data_struct_ ); + return 0; } diff --git a/src/aeif_psc_delta.h b/src/aeif_psc_delta.h index 3eb3724db..ee986cf09 100644 --- a/src/aeif_psc_delta.h +++ b/src/aeif_psc_delta.h @@ -20,28 +20,24 @@ * */ - - - - #ifndef AEIFPSCDELTA_H #define AEIFPSCDELTA_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include "rk5.h" +#include +#include /* BeginUserDocs: neuron, adaptive threshold, integrate-and-fire, current-based Short description +++++++++++++++++ -Current-based adaptive exponential integrate-and-fire neuron model with delta synapse +Current-based adaptive exponential integrate-and-fire neuron model with delta +synapse Description +++++++++++ @@ -56,7 +52,8 @@ The membrane potential is given by the following differential equation: .. math:: - C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T \exp\left(\frac{V-V_{th}}{\Delta_T}\right) + C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T +\exp\left(\frac{V-V_{th}}{\Delta_T}\right) + I(t)- w + I_e and @@ -75,11 +72,12 @@ the value of J after a spike. .. note:: - The number of receptor ports must be specified at neuron creation (default value is 1) and - the receptor index starts from 0 (and not from 1 as in NEST models). - The time constants are supplied by an array, ``tau_syn``. 
Port numbers + The number of receptor ports must be specified at neuron creation (default +value is 1) and the receptor index starts from 0 (and not from 1 as in NEST +models). The time constants are supplied by an array, ``tau_syn``. Port numbers are automatically assigned in the range 0 to ``n_receptors-1``. - During connection, the ports are selected with the synapse property ``receptor``. + During connection, the ports are selected with the synapse property +``receptor``. Parameters ++++++++++ @@ -119,9 +117,9 @@ The following parameters can be set in the status dictionary. ============= ======= ========================================================= **Integration parameters** ------------------------------------------------------------------------------- -h0_rel real Starting step in ODE integration relative to time +h0_rel real Starting step in ODE integration relative to time resolution -h_min_rel real Minimum step in ODE integration relative to time +h_min_rel real Minimum step in ODE integration relative to time resolution ============= ======= ========================================================= @@ -140,7 +138,6 @@ aeif_psc_exp EndUserDocs */ - #define MAX_PORT_NUM 20 struct aeif_psc_delta_rk5 @@ -150,27 +147,29 @@ struct aeif_psc_delta_rk5 class aeif_psc_delta : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< aeif_psc_delta_rk5 > rk5_; float h_min_; float h_; aeif_psc_delta_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); 
} - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - }; #endif diff --git a/src/aeif_psc_delta_kernel.h b/src/aeif_psc_delta_kernel.h index c193fd76a..1a1aaff09 100644 --- a/src/aeif_psc_delta_kernel.h +++ b/src/aeif_psc_delta_kernel.h @@ -20,36 +20,35 @@ * */ - - - - #ifndef AEIFPSCDELTAKERNEL_H #define AEIFPSCDELTAKERNEL_H -#include -#include -#include "spike_buffer.h" -#include "node_group.h" #include "aeif_psc_delta.h" +#include "node_group.h" +#include "spike_buffer.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace aeif_psc_delta_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ N_PORT_VAR = 0 }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -67,20 +66,16 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. 
relative to time resolution N_GROUP_PARAM }; +const std::string aeif_psc_delta_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string aeif_psc_delta_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; - -const std::string aeif_psc_delta_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string aeif_psc_delta_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -93,121 +88,110 @@ const std::string aeif_psc_delta_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; - -const std::string aeif_psc_delta_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; + "den_delay" }; +const std::string aeif_psc_delta_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! // -#define V_m y[i_V_m] -#define w y[i_w] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_delta_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] 
+#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_delta_rk5 data_struct ) { - - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - float V_spike = Delta_T == 0. ? 0. : Delta_T*exp((V - V_th)/Delta_T); + float V = ( refractory_step > 0 ) ? V_reset : MIN( V_m, V_peak ); + + float V_spike = Delta_T == 0. ? 0. : Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) - w + I_e ) / C_m; // Adaptation current w. - dwdt = (a*(V - E_L) - w) / tau_w; + dwdt = ( a * ( V - E_L ) - w ) / tau_w; } - template //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - aeif_psc_delta_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_delta_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - 
else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } +}; // namespace aeif_psc_delta_ns -}; - - -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_delta_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_delta_rk5 data_struct ) { - aeif_psc_delta_ns::Derivatives(x, y, dydx, param, - data_struct); + aeif_psc_delta_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_psc_delta_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_delta_rk5 data_struct ) { - aeif_psc_delta_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + aeif_psc_delta_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/aeif_psc_delta_rk5.h b/src/aeif_psc_delta_rk5.h index 281634952..3ac5ea37d 100644 --- a/src/aeif_psc_delta_rk5.h +++ b/src/aeif_psc_delta_rk5.h @@ -20,32 +20,23 @@ * */ - - - - #ifndef AEIFPSCDELTAMULTISYNAPSERK5_H #define AEIFPSCDELTAMULTISYNAPSERK5_H struct aeif_psc_delta_multisynapse_rk5; +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_delta_multisynapse_rk5 data_struct ); -template -__device__ -void 
Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_delta_multisynapse_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_psc_delta_multisynapse_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_delta_multisynapse_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_delta_multisynapse_rk5 data_struct); +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_psc_delta_multisynapse_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_delta_multisynapse_rk5 data_struct); +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_psc_delta_multisynapse_rk5 data_struct ); #endif diff --git a/src/aeif_psc_exp.cu b/src/aeif_psc_exp.cu index efd199272..48c32257e 100644 --- a/src/aeif_psc_exp.cu +++ b/src/aeif_psc_exp.cu @@ -20,25 +20,20 @@ * */ - - - - -#include -#include -#include +#include "aeif_psc_exp.h" #include "aeif_psc_exp_kernel.h" #include "rk5.h" -#include "aeif_psc_exp.h" +#include +#include +#include namespace aeif_psc_exp_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - aeif_psc_exp_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_psc_exp_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; V_th = -50.4; Delta_T = 2.0; @@ -53,7 +48,7 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + I_syn_ex = 0; I_syn_in = 0; V_m = E_L; @@ -61,95 +56,95 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, 
tau_syn_ex = 0.2; tau_syn_in = 2.0; refractory_step = 0; - } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_exp_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_psc_exp_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; refractory_step = 0; // set the right threshold depending on Delta_T - if (Delta_T <= 0.0) { + if ( Delta_T <= 0.0 ) + { V_peak = V_th; // same as IAF dynamics for spikes if Delta_T == 0. } } -} +} // namespace aeif_psc_exp_ns -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_exp_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_psc_exp_rk5 data_struct ) { - aeif_psc_exp_ns::NodeInit(n_var, n_param, x, y, param, data_struct); + aeif_psc_exp_ns::NodeInit( n_var, n_param, x, y, param, data_struct ); } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_exp_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_psc_exp_rk5 data_struct ) { - aeif_psc_exp_ns::NodeCalibrate(n_var, n_param, x, y, param, data_struct); + aeif_psc_exp_ns::NodeCalibrate( n_var, n_param, x, y, param, data_struct ); } using namespace aeif_psc_exp_ns; -int aeif_psc_exp::Init(int i_node_0, int n_node, int n_port, - int i_group) { - BaseNeuron::Init(i_node_0, n_node, n_port, i_group); +int +aeif_psc_exp::Init( int i_node_0, int n_node, int n_port, int i_group ) +{ + BaseNeuron::Init( i_node_0, n_node, n_port, i_group ); node_type_ = i_aeif_psc_exp_model; n_scal_var_ = N_SCAL_VAR; n_scal_param_ = N_SCAL_PARAM; - n_group_param_ = N_GROUP_PARAM; + n_group_param_ = N_GROUP_PARAM; n_var_ = n_scal_var_; n_param_ = n_scal_param_; - group_param_ = new float[N_GROUP_PARAM]; - + 
group_param_ = new float[ N_GROUP_PARAM ]; + scal_var_name_ = aeif_psc_exp_scal_var_name; scal_param_name_ = aeif_psc_exp_scal_param_name; group_param_name_ = aeif_psc_exp_group_param_name; - //rk5_data_struct_.node_type_ = i_aeif_psc_exp_model; + // rk5_data_struct_.node_type_ = i_aeif_psc_exp_model; rk5_data_struct_.i_node_0_ = i_node_0_; - SetGroupParam("h_min_rel", 1.0e-3); - SetGroupParam("h0_rel", 1.0e-2); - h_ = h0_rel_* 0.1; - - rk5_.Init(n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_); + SetGroupParam( "h_min_rel", 1.0e-3 ); + SetGroupParam( "h0_rel", 1.0e-2 ); + h_ = h0_rel_ * 0.1; + + rk5_.Init( n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_ ); var_arr_ = rk5_.GetYArr(); param_arr_ = rk5_.GetParamArr(); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn_ex"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn_ex" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 1; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); return 0; } -int aeif_psc_exp::Calibrate(double time_min, float time_resolution) +int +aeif_psc_exp::Calibrate( double time_min, float time_resolution ) { - h_min_ = h_min_rel_* time_resolution; - h_ = h0_rel_* time_resolution; - rk5_.Calibrate(time_min, h_, rk5_data_struct_); - + h_min_ = h_min_rel_ * time_resolution; + h_ = h0_rel_ * time_resolution; + rk5_.Calibrate( time_min, h_, rk5_data_struct_ ); + return 0; } -int aeif_psc_exp::Update(long 
long it, double t1) { - rk5_.Update(t1, h_min_, rk5_data_struct_); +int +aeif_psc_exp::Update( long long it, double t1 ) +{ + rk5_.Update< N_SCAL_VAR, N_SCAL_PARAM >( t1, h_min_, rk5_data_struct_ ); return 0; } diff --git a/src/aeif_psc_exp.h b/src/aeif_psc_exp.h index cd53b7eb6..71168293b 100644 --- a/src/aeif_psc_exp.h +++ b/src/aeif_psc_exp.h @@ -20,21 +20,16 @@ * */ - - - - #ifndef AEIFPSCEXP_H #define AEIFPSCEXP_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include "rk5.h" +#include +#include /* BeginUserDocs: neuron, integrate-and-fire, adaptive threshold, current-based @@ -47,7 +42,7 @@ Description +++++++++++ aeif_psc_exp is the adaptive exponential integrate and fire neuron -according to [1]_, with postsynaptic currents in the form of +according to [1]_, with postsynaptic currents in the form of truncated exponentials. This implementation uses the embedded 5th order Runge-Kutta @@ -57,11 +52,13 @@ The membrane potential is given by the following differential equation: .. math:: - C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T \exp\left(\frac{V-V_{th}}{\Delta_T}\right) + C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T +\exp\left(\frac{V-V_{th}}{\Delta_T}\right) + I_{syn\_ex}(t) - I_{syn\_in}(t) - w + I_e -where ``I_syn_ex`` and ``I_syn_in`` are the synaptic currents modeled as truncated exponentials -with time constants ``tau_syn_ex`` and ``tau_syn_in`` respectively. +where ``I_syn_ex`` and ``I_syn_in`` are the synaptic currents modeled as +truncated exponentials with time constants ``tau_syn_ex`` and ``tau_syn_in`` +respectively. 
The differential equation for the spike-adaptation current `w` is: @@ -73,8 +70,9 @@ The differential equation for the spike-adaptation current `w` is: Although this model is not multisynapse, the port (excitatory or inhibitory) to be chosen must be specified using the synapse property ``receptor``. - The excitatory port has index 0, whereas the inhibitory one has index 1. Differently from - NEST, the connection weights related to the inhibitory port must be positive. + The excitatory port has index 0, whereas the inhibitory one has index 1. +Differently from NEST, the connection weights related to the inhibitory port +must be positive. Parameters ++++++++++ @@ -125,9 +123,9 @@ The following parameters can be set in the status dictionary. ============= ======= ========================================================= **Integration parameters** ------------------------------------------------------------------------------- -h0_rel real Starting step in ODE integration relative to time +h0_rel real Starting step in ODE integration relative to time resolution -h_min_rel real Minimum step in ODE integration relative to time +h_min_rel real Minimum step in ODE integration relative to time resolution ============= ======= ========================================================= @@ -146,8 +144,7 @@ aeif_psc_exp_multisynapse, iaf_psc_exp, aeif_psc_alpha EndUserDocs */ - -//#define MAX_PORT_NUM 20 +// #define MAX_PORT_NUM 20 struct aeif_psc_exp_rk5 { @@ -156,30 +153,32 @@ struct aeif_psc_exp_rk5 class aeif_psc_exp : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< aeif_psc_exp_rk5 > rk5_; float h_min_; float h_; aeif_psc_exp_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + + int Init( int i_node_0, int n_neuron, int 
n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/aeif_psc_exp_kernel.h b/src/aeif_psc_exp_kernel.h index 4314266ed..21d5a70b5 100644 --- a/src/aeif_psc_exp_kernel.h +++ b/src/aeif_psc_exp_kernel.h @@ -20,26 +20,23 @@ * */ - - - - #ifndef AEIFPSCEXPKERNEL_H #define AEIFPSCEXPKERNEL_H -#include -#include -#include "spike_buffer.h" -#include "node_group.h" #include "aeif_psc_exp.h" +#include "node_group.h" +#include "spike_buffer.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace aeif_psc_exp_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_I_syn_ex = 0, i_I_syn_in, i_V_m, @@ -47,7 +44,8 @@ enum ScalVarIndexes { N_SCAL_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_tau_syn_ex = 0, i_tau_syn_in, i_V_th, @@ -67,21 +65,16 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. 
relative to time resolution N_GROUP_PARAM }; -const std::string aeif_psc_exp_scal_var_name[N_SCAL_VAR] = { - "I_syn_ex", - "I_syn_in", - "V_m", - "w" -}; +const std::string aeif_psc_exp_scal_var_name[ N_SCAL_VAR ] = { "I_syn_ex", "I_syn_in", "V_m", "w" }; -const std::string aeif_psc_exp_scal_param_name[N_SCAL_PARAM] = { - "tau_syn_ex", +const std::string aeif_psc_exp_scal_param_name[ N_SCAL_PARAM ] = { "tau_syn_ex", "tau_syn_in", "V_th", "Delta_T", @@ -96,132 +89,122 @@ const std::string aeif_psc_exp_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string aeif_psc_exp_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string aeif_psc_exp_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! // -#define I_syn_ex y[i_I_syn_ex] -#define I_syn_in y[i_I_syn_in] -#define V_m y[i_V_m] -#define w y[i_w] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dI_syn_exdt dydx[i_I_syn_ex] -#define dI_syn_indt dydx[i_I_syn_in] - -#define tau_syn_ex param[i_tau_syn_ex] -#define tau_syn_in param[i_tau_syn_in] -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_exp_rk5 data_struct) +#define I_syn_ex y[ i_I_syn_ex ] +#define I_syn_in y[ 
i_I_syn_in ] +#define V_m y[ i_V_m ] +#define w y[ i_w ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dI_syn_exdt dydx[ i_I_syn_ex ] +#define dI_syn_indt dydx[ i_I_syn_in ] + +#define tau_syn_ex param[ i_tau_syn_ex ] +#define tau_syn_in param[ i_tau_syn_in ] +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_exp_rk5 data_struct ) { float I_syn_tot = 0.0; I_syn_tot += I_syn_ex - I_syn_in; - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); + float V = ( refractory_step > 0 ) ? V_reset : MIN( V_m, V_peak ); - float V_spike = Delta_T == 0. ? 0. : Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T == 0. ? 0. : Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn_tot - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn_tot - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; + dwdt = ( a * ( V - E_L ) - w ) / tau_w; dI_syn_exdt = -I_syn_ex / tau_syn_ex; dI_syn_indt = -I_syn_in / tau_syn_in; } - template //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - aeif_psc_exp_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_exp_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } +}; // namespace aeif_psc_exp_ns -}; - +int Update( long long it, double t1 ); -int Update(long long it, double t1); - -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_exp_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* 
param, aeif_psc_exp_rk5 data_struct ) { - aeif_psc_exp_ns::Derivatives(x, y, dydx, param, - data_struct); + aeif_psc_exp_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_psc_exp_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_exp_rk5 data_struct ) { - aeif_psc_exp_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + aeif_psc_exp_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/aeif_psc_exp_multisynapse.cu b/src/aeif_psc_exp_multisynapse.cu index ea2556ed4..d1597558b 100644 --- a/src/aeif_psc_exp_multisynapse.cu +++ b/src/aeif_psc_exp_multisynapse.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include +#include "aeif_psc_exp_multisynapse.h" #include "aeif_psc_exp_multisynapse_kernel.h" #include "rk5.h" -#include "aeif_psc_exp_multisynapse.h" +#include +#include +#include namespace aeif_psc_exp_multisynapse_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - aeif_psc_exp_multisynapse_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_psc_exp_multisynapse_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,52 +49,52 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; - for (int i = 0; i -int aeif_psc_exp_multisynapse::UpdateNR<0>(long long it, double t1) +int +aeif_psc_exp_multisynapse::UpdateNR< 0 >( long long it, double t1 ) { return 0; } 
-int aeif_psc_exp_multisynapse::Update(long long it, double t1) { - UpdateNR(it, t1); +int +aeif_psc_exp_multisynapse::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/aeif_psc_exp_multisynapse.h b/src/aeif_psc_exp_multisynapse.h index 56de5c7f9..38ea389dd 100644 --- a/src/aeif_psc_exp_multisynapse.h +++ b/src/aeif_psc_exp_multisynapse.h @@ -20,21 +20,16 @@ * */ - - - - #ifndef AEIFPSCEXPMULTISYNAPSE_H #define AEIFPSCEXPMULTISYNAPSE_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include "rk5.h" +#include +#include /* BeginUserDocs: neuron, integrate-and-fire, adaptive threshold, current-based @@ -47,7 +42,7 @@ Description +++++++++++ aeif_psc_exp_multisynapse is the adaptive exponential integrate and fire neuron -according to [1]_, with postsynaptic currents in the form of +according to [1]_, with postsynaptic currents in the form of truncated exponentials. This implementation uses the embedded 5th order Runge-Kutta @@ -57,11 +52,12 @@ The membrane potential is given by the following differential equation: .. math:: - C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T \exp\left(\frac{V-V_{th}}{\Delta_T}\right) + C_m \frac{dV}{dt} = -g_L(V-E_L) + g_L\Delta_T +\exp\left(\frac{V-V_{th}}{\Delta_T}\right) + I_{syn}(t)- w + I_e -where ``I_syn (t)`` is the sum of the synaptic currents modeled as truncated exponentials -with time constant ``tau_syn``. +where ``I_syn (t)`` is the sum of the synaptic currents modeled as truncated +exponentials with time constant ``tau_syn``. The differential equation for the spike-adaptation current `w` is: @@ -71,11 +67,12 @@ The differential equation for the spike-adaptation current `w` is: .. 
note:: - The number of receptor ports must be specified at neuron creation (default value is 1) and - the receptor index starts from 0 (and not from 1 as in NEST multisynapse models). - The time constants are supplied by an array, ``tau_syn``. Port numbers - are automatically assigned in the range 0 to ``n_receptors-1``. - During connection, the ports are selected with the synapse property ``receptor``. + The number of receptor ports must be specified at neuron creation (default +value is 1) and the receptor index starts from 0 (and not from 1 as in NEST +multisynapse models). The time constants are supplied by an array, ``tau_syn``. +Port numbers are automatically assigned in the range 0 to ``n_receptors-1``. + During connection, the ports are selected with the synapse property +``receptor``. Parameters ++++++++++ @@ -123,9 +120,9 @@ The following parameters can be set in the status dictionary. ============= ======= ========================================================= **Integration parameters** ------------------------------------------------------------------------------- -h0_rel real Starting step in ODE integration relative to time +h0_rel real Starting step in ODE integration relative to time resolution -h_min_rel real Minimum step in ODE integration relative to time +h_min_rel real Minimum step in ODE integration relative to time resolution ============= ======= ========================================================= @@ -144,7 +141,6 @@ iaf_psc_exp EndUserDocs */ - #define MAX_PORT_NUM 20 struct aeif_psc_exp_multisynapse_rk5 @@ -154,29 +150,32 @@ struct aeif_psc_exp_multisynapse_rk5 class aeif_psc_exp_multisynapse : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< aeif_psc_exp_multisynapse_rk5 > rk5_; float h_min_; float h_; aeif_psc_exp_multisynapse_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, 
double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/aeif_psc_exp_multisynapse_kernel.h b/src/aeif_psc_exp_multisynapse_kernel.h index 7b735dc63..b22862bd0 100644 --- a/src/aeif_psc_exp_multisynapse_kernel.h +++ b/src/aeif_psc_exp_multisynapse_kernel.h @@ -20,37 +20,36 @@ * */ - - - - #ifndef AEIFPSCEXPMULTISYNAPSEKERNEL_H #define AEIFPSCEXPMULTISYNAPSEKERNEL_H -#include -#include -#include "spike_buffer.h" -#include "node_group.h" #include "aeif_psc_exp_multisynapse.h" +#include "node_group.h" +#include "spike_buffer.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace aeif_psc_exp_multisynapse_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_I_syn = 0, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -68,28 +67,24 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_tau_syn = 0, N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. 
relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. relative to time resolution N_GROUP_PARAM }; -const std::string aeif_psc_exp_multisynapse_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; +const std::string aeif_psc_exp_multisynapse_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string aeif_psc_exp_multisynapse_port_var_name[N_PORT_VAR] = { - "I_syn" -}; +const std::string aeif_psc_exp_multisynapse_port_var_name[ N_PORT_VAR ] = { "I_syn" }; -const std::string aeif_psc_exp_multisynapse_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string aeif_psc_exp_multisynapse_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -102,157 +97,153 @@ const std::string aeif_psc_exp_multisynapse_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string aeif_psc_exp_multisynapse_port_param_name[N_PORT_PARAM] = { +const std::string aeif_psc_exp_multisynapse_port_param_name[ N_PORT_PARAM ] = { "tau_syn", }; -const std::string aeif_psc_exp_multisynapse_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string aeif_psc_exp_multisynapse_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define w y[i_w] -#define I_syn(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dI_syndt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define tau_syn(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_syn] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_exp_multisynapse_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define I_syn( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dI_syndt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define tau_syn( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_syn ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_exp_multisynapse_rk5 data_struct ) { - 
enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn_tot = 0.0; - - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn_tot += I_syn( i ); } - float V_spike = Delta_T == 0. ? 0. : Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T == 0. ? 0. : Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn_tot - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn_tot - w + I_e ) / C_m; // Adaptation current w. - dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - aeif_psc_exp_multisynapse_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_exp_multisynapse_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // 
spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace aeif_psc_exp_multisynapse_ns template <> -int aeif_psc_exp_multisynapse::UpdateNR<0>(long long it, double t1); +int aeif_psc_exp_multisynapse::UpdateNR< 0 >( long long it, double t1 ); -template -int aeif_psc_exp_multisynapse::UpdateNR(long long it, double t1) +template < int N_PORT > +int +aeif_psc_exp_multisynapse::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = aeif_psc_exp_multisynapse_ns::N_SCAL_VAR - + aeif_psc_exp_multisynapse_ns::N_PORT_VAR*N_PORT; - const int NPARAM = aeif_psc_exp_multisynapse_ns::N_SCAL_PARAM - + aeif_psc_exp_multisynapse_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = aeif_psc_exp_multisynapse_ns::N_SCAL_VAR + aeif_psc_exp_multisynapse_ns::N_PORT_VAR * N_PORT; + const int NPARAM = aeif_psc_exp_multisynapse_ns::N_SCAL_PARAM + aeif_psc_exp_multisynapse_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_exp_multisynapse_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_exp_multisynapse_rk5 data_struct ) { - aeif_psc_exp_multisynapse_ns::Derivatives(x, y, dydx, param, - data_struct); + aeif_psc_exp_multisynapse_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_psc_exp_multisynapse_rk5 data_struct) +template < int NVAR, 
int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_exp_multisynapse_rk5 data_struct ) { - aeif_psc_exp_multisynapse_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + aeif_psc_exp_multisynapse_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/aeif_psc_exp_multisynapse_rk5.h b/src/aeif_psc_exp_multisynapse_rk5.h index 211b05a11..f8eb7aec7 100644 --- a/src/aeif_psc_exp_multisynapse_rk5.h +++ b/src/aeif_psc_exp_multisynapse_rk5.h @@ -20,32 +20,22 @@ * */ - - - - #ifndef AEIFPSCEXPMULTISYNAPSERK5_H #define AEIFPSCEXPMULTISYNAPSERK5_H struct aeif_psc_exp_multisynapse_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, aeif_psc_exp_multisynapse_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - aeif_psc_exp_multisynapse_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - aeif_psc_exp_multisynapse_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, aeif_psc_exp_multisynapse_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_exp_multisynapse_rk5 data_struct); +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, aeif_psc_exp_multisynapse_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, aeif_psc_exp_multisynapse_rk5 data_struct); +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, aeif_psc_exp_multisynapse_rk5 data_struct ); #endif diff --git a/src/base_neuron.cu b/src/base_neuron.cu index d2ed1cfb2..951054041 100644 --- a/src/base_neuron.cu +++ b/src/base_neuron.cu @@ -20,210 
+20,226 @@ * */ -#include #include +#include #include -#include -#include "utilities.h" -#include "ngpu_exception.h" +#include "base_neuron.h" #include "cuda_error.h" #include "distribution.h" -#include "base_neuron.h" -#include "spike_buffer.h" +#include "ngpu_exception.h" #include "scan.h" +#include "spike_buffer.h" +#include "utilities.h" +#include // set equally spaced (index i*step) elements of array arr to value val -__global__ void BaseNeuronSetIntArray(int *arr, int n_elem, int step, - int val) +__global__ void +BaseNeuronSetIntArray( int* arr, int n_elem, int step, int val ) { int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - if (array_idx>> - (param_pt, n_neuron, n_param_, val); +int +BaseNeuron::SetScalParam( int i_neuron, int n_neuron, std::string param_name, float val ) +{ + if ( !IsScalParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized scalar parameter " ) + param_name ); + } + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); + float* param_pt = GetParamPt( i_neuron, param_name ); + BaseNeuronSetFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( param_pt, n_neuron, n_param_, val ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - + return 0; } // set scalar parameter param_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] // to value val -int BaseNeuron::SetScalParam(int *i_neuron, int n_neuron, - std::string param_name, float val) +int +BaseNeuron::SetScalParam( int* i_neuron, int n_neuron, std::string param_name, float val ) { - if (!IsScalParam(param_name)) { - throw ngpu_exception(std::string("Unrecognized scalar parameter ") - + param_name); + if ( !IsScalParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized scalar parameter " ) + param_name ); } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); // 
Memcopy will be synchronized with BaseNeuronSetFloatPtArray kernel - gpuErrchk(cudaMemcpyAsync(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - float *param_pt = GetParamPt(0, param_name); - BaseNeuronSetFloatPtArray<<<(n_neuron+1023)/1024, 1024>>> - (param_pt, d_i_neuron, n_neuron, n_param_, val); + gpuErrchk( cudaMemcpyAsync( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + float* param_pt = GetParamPt( 0, param_name ); + BaseNeuronSetFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( param_pt, d_i_neuron, n_neuron, n_param_, val ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_i_neuron",d_i_neuron); - + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); + return 0; } // set receptor-port parameter param_name of neurons // i_neuron, ..., i_neuron + n_neuron -1 // to value val -int BaseNeuron::SetPortParam(int i_neuron, int n_neuron, - std::string param_name, float *param, - int vect_size) -{ - if (!IsPortParam(param_name)) { - throw ngpu_exception(std::string("Unrecognized port parameter ") - + param_name); - } - CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + n_neuron - 1); - if (vect_size != n_port_) { - throw ngpu_exception("Parameter array size must be equal " - "to the number of ports."); - } - float *param_pt; - - for (int i_vect=0; i_vect>> - (param_pt, n_neuron, n_param_, param[i_vect]); +int +BaseNeuron::SetPortParam( int i_neuron, int n_neuron, std::string param_name, float* param, int vect_size ) +{ + if ( !IsPortParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port parameter " ) + param_name ); + } + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); + if ( vect_size != n_port_ ) + { + throw ngpu_exception( + "Parameter array size must be equal " + "to the number of ports." 
); + } + float* param_pt; + + for ( int i_vect = 0; i_vect < vect_size; i_vect++ ) + { + param_pt = GetParamPt( i_neuron, param_name, i_vect ); + BaseNeuronSetFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( param_pt, n_neuron, n_param_, param[ i_vect ] ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } @@ -306,31 +320,32 @@ int BaseNeuron::SetPortParam(int i_neuron, int n_neuron, // set receptor-port parameter param_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] // to value val -int BaseNeuron::SetPortParam(int *i_neuron, int n_neuron, - std::string param_name, float *param, - int vect_size) +int +BaseNeuron::SetPortParam( int* i_neuron, int n_neuron, std::string param_name, float* param, int vect_size ) { - if (!IsPortParam(param_name)) { - throw ngpu_exception(std::string("Unrecognized port parameter ") - + param_name); + if ( !IsPortParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port parameter " ) + param_name ); } - if (vect_size != n_port_) { - throw ngpu_exception("Parameter array size must be equal " - "to the number of ports."); + if ( vect_size != n_port_ ) + { + throw ngpu_exception( + "Parameter array size must be equal " + "to the number of ports." 
); } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); // Memcopy will be synchronized with BaseNeuronSetFloatPtArray kernel - gpuErrchk(cudaMemcpyAsync(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - for (int i_vect=0; i_vect>> - (param_pt, d_i_neuron, n_neuron, n_param_, param[i_vect]); + gpuErrchk( cudaMemcpyAsync( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + for ( int i_vect = 0; i_vect < vect_size; i_vect++ ) + { + float* param_pt = GetParamPt( 0, param_name, i_vect ); + BaseNeuronSetFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( + param_pt, d_i_neuron, n_neuron, n_param_, param[ i_vect ] ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } - CUDAFREECTRL("d_i_neuron",d_i_neuron); + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); return 0; } @@ -339,155 +354,148 @@ int BaseNeuron::SetPortParam(int *i_neuron, int n_neuron, // i_neuron, ..., i_neuron + n_neuron -1 // to values array[0], ... , array[array_size-1] // Must be defined in derived classes -int BaseNeuron::SetArrayParam(int i_neuron, int n_neuron, - std::string param_name, float *array, - int array_size) +int +BaseNeuron::SetArrayParam( int i_neuron, int n_neuron, std::string param_name, float* array, int array_size ) { - throw ngpu_exception(std::string("Unrecognized parameter ") - + param_name); + throw ngpu_exception( std::string( "Unrecognized parameter " ) + param_name ); } // set array parameter param_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] // to values array[0], ... 
, array[array_size-1] // Must be defined in derived classes -int BaseNeuron::SetArrayParam(int *i_neuron, int n_neuron, - std::string param_name, float *array, - int array_size) +int +BaseNeuron::SetArrayParam( int* i_neuron, int n_neuron, std::string param_name, float* array, int array_size ) { - throw ngpu_exception(std::string("Unrecognized parameter ") - + param_name); + throw ngpu_exception( std::string( "Unrecognized parameter " ) + param_name ); } // set neuron-group parameter param_name to value val -int BaseNeuron::SetGroupParam(std::string param_name, float val) +int +BaseNeuron::SetGroupParam( std::string param_name, float val ) { int i_param; - for (i_param=0; i_param>> - (var_pt, n_neuron, 1, val); +int +BaseNeuron::SetIntVar( int i_neuron, int n_neuron, std::string var_name, int val ) +{ + if ( !IsIntVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized integer variable " ) + var_name ); + } + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); + int* var_pt = GetIntVarPt( i_neuron, var_name ); + BaseNeuronSetIntArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, n_neuron, 1, val ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - + return 0; } // set integer variable var_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] // to value val -int BaseNeuron::SetIntVar(int *i_neuron, int n_neuron, - std::string var_name, int val) +int +BaseNeuron::SetIntVar( int* i_neuron, int n_neuron, std::string var_name, int val ) { - if (!IsIntVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized integer variable ") - + var_name); + if ( !IsIntVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized integer variable " ) + var_name ); } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); // Memcopy will be synchronized with 
BaseNeuronSetIntPtArray kernel - gpuErrchk(cudaMemcpyAsync(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - int *var_pt = GetIntVarPt(0, var_name); - BaseNeuronSetIntPtArray<<<(n_neuron+1023)/1024, 1024>>> - (var_pt, d_i_neuron, n_neuron, 1, val); + gpuErrchk( cudaMemcpyAsync( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + int* var_pt = GetIntVarPt( 0, var_name ); + BaseNeuronSetIntPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, d_i_neuron, n_neuron, 1, val ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_i_neuron",d_i_neuron); - + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); + return 0; } // set scalar state-variable var_name of neurons // i_neuron, ..., i_neuron + n_neuron -1 // to value val -int BaseNeuron::SetScalVar(int i_neuron, int n_neuron, - std::string var_name, float val) -{ - if (!IsScalVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized scalar variable ") - + var_name); - } - CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + n_neuron - 1); - float *var_pt = GetVarPt(i_neuron, var_name); - BaseNeuronSetFloatArray<<<(n_neuron+1023)/1024, 1024>>> - (var_pt, n_neuron, n_var_, val); +int +BaseNeuron::SetScalVar( int i_neuron, int n_neuron, std::string var_name, float val ) +{ + if ( !IsScalVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized scalar variable " ) + var_name ); + } + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); + float* var_pt = GetVarPt( i_neuron, var_name ); + BaseNeuronSetFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, n_neuron, n_var_, val ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - + return 0; } // set scalar state-variable var_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] // to value val -int BaseNeuron::SetScalVar(int *i_neuron, int n_neuron, - std::string var_name, float val) +int +BaseNeuron::SetScalVar( 
int* i_neuron, int n_neuron, std::string var_name, float val ) { - if (!IsScalVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized scalar variable ") - + var_name); + if ( !IsScalVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized scalar variable " ) + var_name ); } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); // Memcopy will be synchronized with BaseNeuronSetFloatPtArray kernel - gpuErrchk(cudaMemcpyAsync(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - float *var_pt = GetVarPt(0, var_name); - BaseNeuronSetFloatPtArray<<<(n_neuron+1023)/1024, 1024>>> - (var_pt, d_i_neuron, n_neuron, n_var_, val); + gpuErrchk( cudaMemcpyAsync( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + float* var_pt = GetVarPt( 0, var_name ); + BaseNeuronSetFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, d_i_neuron, n_neuron, n_var_, val ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_i_neuron",d_i_neuron); - + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); + return 0; } // set receptor-port state-variable var_name of neurons // i_neuron, ..., i_neuron + n_neuron -1 // to value val -int BaseNeuron::SetPortVar(int i_neuron, int n_neuron, - std::string var_name, float *var, - int vect_size) -{ - if (!IsPortVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized port variable ") - + var_name); - } - CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + n_neuron - 1); - if (vect_size != n_port_) { - throw ngpu_exception("Variable array size must be equal " - "to the number of ports."); - } - float *var_pt; - - for (int i_vect=0; i_vect>> - (var_pt, n_neuron, n_var_, var[i_vect]); +int +BaseNeuron::SetPortVar( int i_neuron, int n_neuron, std::string var_name, float* var, int vect_size ) +{ + if ( !IsPortVar( 
var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port variable " ) + var_name ); + } + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); + if ( vect_size != n_port_ ) + { + throw ngpu_exception( + "Variable array size must be equal " + "to the number of ports." ); + } + float* var_pt; + + for ( int i_vect = 0; i_vect < vect_size; i_vect++ ) + { + var_pt = GetVarPt( i_neuron, var_name, i_vect ); + BaseNeuronSetFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, n_neuron, n_var_, var[ i_vect ] ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } @@ -497,31 +505,32 @@ int BaseNeuron::SetPortVar(int i_neuron, int n_neuron, // set receptor-port state-variable var_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] // to value val -int BaseNeuron::SetPortVar(int *i_neuron, int n_neuron, - std::string var_name, float *var, - int vect_size) +int +BaseNeuron::SetPortVar( int* i_neuron, int n_neuron, std::string var_name, float* var, int vect_size ) { - if (!IsPortVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized port variable ") - + var_name); + if ( !IsPortVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port variable " ) + var_name ); } - if (vect_size != n_port_) { - throw ngpu_exception("Variable array size must be equal " - "to the number of ports."); + if ( vect_size != n_port_ ) + { + throw ngpu_exception( + "Variable array size must be equal " + "to the number of ports." 
); } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); // Memcopy will be synchronized with BaseNeuronSetFloatPtArray kernel - gpuErrchk(cudaMemcpyAsync(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - for (int i_vect=0; i_vect>> - (var_pt, d_i_neuron, n_neuron, n_var_, var[i_vect]); + gpuErrchk( cudaMemcpyAsync( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + for ( int i_vect = 0; i_vect < vect_size; i_vect++ ) + { + float* var_pt = GetVarPt( 0, var_name, i_vect ); + BaseNeuronSetFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( + var_pt, d_i_neuron, n_neuron, n_var_, var[ i_vect ] ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } - CUDAFREECTRL("d_i_neuron",d_i_neuron); + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); return 0; } @@ -530,166 +539,145 @@ int BaseNeuron::SetPortVar(int *i_neuron, int n_neuron, // i_neuron, ..., i_neuron + n_neuron -1 // to values array[0], ... , array[array_size-1] // Must be defined in derived classes -int BaseNeuron::SetArrayVar(int i_neuron, int n_neuron, - std::string var_name, float *array, - int array_size) +int +BaseNeuron::SetArrayVar( int i_neuron, int n_neuron, std::string var_name, float* array, int array_size ) { - throw ngpu_exception(std::string("Unrecognized variable ") - + var_name); + throw ngpu_exception( std::string( "Unrecognized variable " ) + var_name ); } // set array variable var_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] // to values array[0], ... 
, array[array_size-1] // Must be defined in derived classes -int BaseNeuron::SetArrayVar(int *i_neuron, int n_neuron, - std::string var_name, float *array, - int array_size) +int +BaseNeuron::SetArrayVar( int* i_neuron, int n_neuron, std::string var_name, float* array, int array_size ) { - throw ngpu_exception(std::string("Unrecognized variable ") - + var_name); + throw ngpu_exception( std::string( "Unrecognized variable " ) + var_name ); } - - - ////////////////////////////////////////////////////////////////////// // set scalar parameter param_name of neurons // i_neuron, ..., i_neuron + n_neuron -1 // using distribution or array -int BaseNeuron::SetScalParamDistr(int i_neuron, int n_neuron, - std::string param_name, - Distribution *distribution) -{ - if (!IsScalParam(param_name)) { - throw ngpu_exception(std::string("Unrecognized scalar parameter ") - + param_name); - } - CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + n_neuron - 1); - float *param_pt = GetParamPt(i_neuron, param_name); - float *d_arr = distribution->getArray(*random_generator_, n_neuron); - BaseNeuronCopyFloatArray<<<(n_neuron+1023)/1024, 1024>>> - (param_pt, n_neuron, n_param_, d_arr); +int +BaseNeuron::SetScalParamDistr( int i_neuron, int n_neuron, std::string param_name, Distribution* distribution ) +{ + if ( !IsScalParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized scalar parameter " ) + param_name ); + } + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); + float* param_pt = GetParamPt( i_neuron, param_name ); + float* d_arr = distribution->getArray( *random_generator_, n_neuron ); + BaseNeuronCopyFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( param_pt, n_neuron, n_param_, d_arr ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_arr",d_arr); - + CUDAFREECTRL( "d_arr", d_arr ); + return 0; } // set scalar parameter param_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] // using 
distribution or array -int BaseNeuron::SetScalParamDistr(int *i_neuron, int n_neuron, - std::string param_name, - Distribution *distribution) -{ - if (!IsScalParam(param_name)) { - throw ngpu_exception(std::string("Unrecognized scalar parameter ") - + param_name); - } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); - gpuErrchk(cudaMemcpy(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - float *param_pt = GetParamPt(0, param_name); - float *d_arr = distribution->getArray(*random_generator_, n_neuron); - BaseNeuronCopyFloatPtArray<<<(n_neuron+1023)/1024, 1024>>> - (param_pt, d_i_neuron, n_neuron, n_param_, d_arr); +int +BaseNeuron::SetScalParamDistr( int* i_neuron, int n_neuron, std::string param_name, Distribution* distribution ) +{ + if ( !IsScalParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized scalar parameter " ) + param_name ); + } + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); + gpuErrchk( cudaMemcpy( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + float* param_pt = GetParamPt( 0, param_name ); + float* d_arr = distribution->getArray( *random_generator_, n_neuron ); + BaseNeuronCopyFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( param_pt, d_i_neuron, n_neuron, n_param_, d_arr ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_i_neuron",d_i_neuron); - CUDAFREECTRL("d_arr",d_arr); - + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); + CUDAFREECTRL( "d_arr", d_arr ); + return 0; } // set scalar variable var_name of neurons // i_neuron, ..., i_neuron + n_neuron -1 // using distribution or array -int BaseNeuron::SetScalVarDistr(int i_neuron, int n_neuron, - std::string var_name, - Distribution *distribution) -{ - //printf("okk0\n"); - if (!IsScalVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized scalar variable ") - + var_name); - } - 
CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + n_neuron - 1); - float *var_pt = GetVarPt(i_neuron, var_name); - //printf("okk1\n"); - float *d_arr = distribution->getArray(*random_generator_, n_neuron); - //printf("okk2\n"); - BaseNeuronCopyFloatArray<<<(n_neuron+1023)/1024, 1024>>> - (var_pt, n_neuron, n_var_, d_arr); +int +BaseNeuron::SetScalVarDistr( int i_neuron, int n_neuron, std::string var_name, Distribution* distribution ) +{ + if ( !IsScalVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized scalar variable " ) + var_name ); + } + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); + float* var_pt = GetVarPt( i_neuron, var_name ); + float* d_arr = distribution->getArray( *random_generator_, n_neuron ); + BaseNeuronCopyFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, n_neuron, n_var_, d_arr ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - //printf("okk3\n"); - CUDAFREECTRL("d_arr",d_arr); - //printf("okk4\n"); - + CUDAFREECTRL( "d_arr", d_arr ); + return 0; } // set scalar state-variable var_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] // using distribution or array -int BaseNeuron::SetScalVarDistr(int *i_neuron, int n_neuron, - std::string var_name, - Distribution *distribution) -{ - if (!IsScalVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized scalar variable ") - + var_name); - } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); - gpuErrchk(cudaMemcpy(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - float *var_pt = GetVarPt(0, var_name); - float *d_arr = distribution->getArray(*random_generator_, n_neuron); - BaseNeuronCopyFloatPtArray<<<(n_neuron+1023)/1024, 1024>>> - (var_pt, d_i_neuron, n_neuron, n_var_, d_arr); +int +BaseNeuron::SetScalVarDistr( int* i_neuron, int n_neuron, std::string var_name, Distribution* distribution ) +{ + if ( !IsScalVar( var_name ) ) + { + throw 
ngpu_exception( std::string( "Unrecognized scalar variable " ) + var_name ); + } + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); + gpuErrchk( cudaMemcpy( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + float* var_pt = GetVarPt( 0, var_name ); + float* d_arr = distribution->getArray( *random_generator_, n_neuron ); + BaseNeuronCopyFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, d_i_neuron, n_neuron, n_var_, d_arr ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_i_neuron",d_i_neuron); - CUDAFREECTRL("d_arr",d_arr); - + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); + CUDAFREECTRL( "d_arr", d_arr ); + return 0; } // set receptor-port parameter param_name of neurons // i_neuron, ..., i_neuron + n_neuron -1 // using distribution or array -int BaseNeuron::SetPortParamDistr(int i_neuron, int n_neuron, - std::string param_name, - Distribution *distribution) +int +BaseNeuron::SetPortParamDistr( int i_neuron, int n_neuron, std::string param_name, Distribution* distribution ) { - if (!IsPortParam(param_name)) { - throw ngpu_exception(std::string("Unrecognized port parameter ") - + param_name); + if ( !IsPortParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port parameter " ) + param_name ); } - CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + n_neuron - 1); + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); int vect_size = distribution->vectSize(); - if (vect_size != n_port_) { - throw ngpu_exception("Distribution vector dimension must be " - "equal to the number of ports."); - } - float *param_pt; - - for (int i_vect=0; i_vectgetArray(*random_generator_, n_neuron, i_vect); - BaseNeuronCopyFloatArray<<<(n_neuron+1023)/1024, 1024>>> - (param_pt, n_neuron, n_param_, d_arr); + if ( vect_size != n_port_ ) + { + throw ngpu_exception( + "Distribution vector dimension must be " + "equal to the number 
of ports." ); + } + float* param_pt; + + for ( int i_vect = 0; i_vect < vect_size; i_vect++ ) + { + param_pt = GetParamPt( i_neuron, param_name, i_vect ); + float* d_arr = distribution->getArray( *random_generator_, n_neuron, i_vect ); + BaseNeuronCopyFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( param_pt, n_neuron, n_param_, d_arr ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_arr",d_arr); + CUDAFREECTRL( "d_arr", d_arr ); } return 0; } @@ -697,34 +685,35 @@ int BaseNeuron::SetPortParamDistr(int i_neuron, int n_neuron, // set receptor-port parameter param_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] // using distribution or array -int BaseNeuron::SetPortParamDistr(int *i_neuron, int n_neuron, - std::string param_name, - Distribution *distribution) - +int +BaseNeuron::SetPortParamDistr( int* i_neuron, int n_neuron, std::string param_name, Distribution* distribution ) + { - if (!IsPortParam(param_name)) { - throw ngpu_exception(std::string("Unrecognized port parameter ") - + param_name); + if ( !IsPortParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port parameter " ) + param_name ); } int vect_size = distribution->vectSize(); - if (vect_size != n_port_) { - throw ngpu_exception("Distribution vector dimension must be " - "equal to the number of ports."); - } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); - gpuErrchk(cudaMemcpy(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - for (int i_vect=0; i_vectgetArray(*random_generator_, n_neuron, i_vect); - BaseNeuronCopyFloatPtArray<<<(n_neuron+1023)/1024, 1024>>> - (param_pt, d_i_neuron, n_neuron, n_param_, d_arr); + if ( vect_size != n_port_ ) + { + throw ngpu_exception( + "Distribution vector dimension must be " + "equal to the number of ports." 
); + } + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); + gpuErrchk( cudaMemcpy( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + for ( int i_vect = 0; i_vect < vect_size; i_vect++ ) + { + float* param_pt = GetParamPt( 0, param_name, i_vect ); + float* d_arr = distribution->getArray( *random_generator_, n_neuron, i_vect ); + BaseNeuronCopyFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( + param_pt, d_i_neuron, n_neuron, n_param_, d_arr ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_arr",d_arr); + CUDAFREECTRL( "d_arr", d_arr ); } - CUDAFREECTRL("d_i_neuron",d_i_neuron); + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); return 0; } @@ -732,956 +721,1050 @@ int BaseNeuron::SetPortParamDistr(int *i_neuron, int n_neuron, // set receptor-port variable var_name of neurons // i_neuron, ..., i_neuron + n_neuron -1 // using distribution or array -int BaseNeuron::SetPortVarDistr(int i_neuron, int n_neuron, - std::string var_name, - Distribution *distribution) +int +BaseNeuron::SetPortVarDistr( int i_neuron, int n_neuron, std::string var_name, Distribution* distribution ) { - if (!IsPortVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized port variable ") - + var_name); + if ( !IsPortVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port variable " ) + var_name ); } - CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + n_neuron - 1); + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); int vect_size = distribution->vectSize(); - if (vect_size != n_port_) { - throw ngpu_exception("Distribution vector dimension must be " - "equal to the number of ports."); - } - float *var_pt; - - for (int i_vect=0; i_vectgetArray(*random_generator_, n_neuron, i_vect); - BaseNeuronCopyFloatArray<<<(n_neuron+1023)/1024, 1024>>> - (var_pt, n_neuron, n_var_, d_arr); + if ( vect_size != n_port_ ) + { + throw 
ngpu_exception( + "Distribution vector dimension must be " + "equal to the number of ports." ); + } + float* var_pt; + + for ( int i_vect = 0; i_vect < vect_size; i_vect++ ) + { + var_pt = GetVarPt( i_neuron, var_name, i_vect ); + float* d_arr = distribution->getArray( *random_generator_, n_neuron, i_vect ); + BaseNeuronCopyFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, n_neuron, n_var_, d_arr ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_arr",d_arr); + CUDAFREECTRL( "d_arr", d_arr ); } return 0; } -int BaseNeuron::SetPortVarDistr(int *i_neuron, int n_neuron, - std::string var_name, - Distribution *distribution) +int +BaseNeuron::SetPortVarDistr( int* i_neuron, int n_neuron, std::string var_name, Distribution* distribution ) { - if (!IsPortVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized port variable ") - + var_name); + if ( !IsPortVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port variable " ) + var_name ); } int vect_size = distribution->vectSize(); - if (vect_size != n_port_) { - throw ngpu_exception("Distribution vector dimension must be " - "equal to the number of ports."); - } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); - gpuErrchk(cudaMemcpy(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - for (int i_vect=0; i_vectgetArray(*random_generator_, n_neuron, i_vect); - BaseNeuronCopyFloatPtArray<<<(n_neuron+1023)/1024, 1024>>> - (var_pt, d_i_neuron, n_neuron, n_var_, d_arr); + if ( vect_size != n_port_ ) + { + throw ngpu_exception( + "Distribution vector dimension must be " + "equal to the number of ports." 
); + } + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); + gpuErrchk( cudaMemcpy( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + for ( int i_vect = 0; i_vect < vect_size; i_vect++ ) + { + float* var_pt = GetVarPt( 0, var_name, i_vect ); + float* d_arr = distribution->getArray( *random_generator_, n_neuron, i_vect ); + BaseNeuronCopyFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, d_i_neuron, n_neuron, n_var_, d_arr ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_arr",d_arr); + CUDAFREECTRL( "d_arr", d_arr ); } - CUDAFREECTRL("d_i_neuron",d_i_neuron); + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); return 0; } - - ////////////////////////////////////////////////////////////////////// - - // get scalar parameters param_name of neurons // i_neuron, ..., i_neuron + n_neuron -1 -float *BaseNeuron::GetScalParam(int i_neuron, int n_neuron, - std::string param_name) +float* +BaseNeuron::GetScalParam( int i_neuron, int n_neuron, std::string param_name ) { - if (!IsScalParam(param_name)) { - throw ngpu_exception(std::string("Unrecognized scalar parameter ") - + param_name); + if ( !IsScalParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized scalar parameter " ) + param_name ); } - CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + n_neuron - 1); - float *param_pt = GetParamPt(i_neuron, param_name); + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); + float* param_pt = GetParamPt( i_neuron, param_name ); - float *d_param_arr; - CUDAMALLOCCTRL("&d_param_arr",&d_param_arr, n_neuron*sizeof(float)); - float *h_param_arr = (float*)malloc(n_neuron*sizeof(float)); + float* d_param_arr; + CUDAMALLOCCTRL( "&d_param_arr", &d_param_arr, n_neuron * sizeof( float ) ); + float* h_param_arr = ( float* ) malloc( n_neuron * sizeof( float ) ); - BaseNeuronGetFloatArray<<<(n_neuron+1023)/1024, 1024>>> - (param_pt, 
d_param_arr, n_neuron, n_param_, 1); + BaseNeuronGetFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( param_pt, d_param_arr, n_neuron, n_param_, 1 ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - - gpuErrchk(cudaMemcpy(h_param_arr, d_param_arr, n_neuron*sizeof(float), - cudaMemcpyDeviceToHost)); - CUDAFREECTRL("d_param_arr",d_param_arr); - + + gpuErrchk( cudaMemcpy( h_param_arr, d_param_arr, n_neuron * sizeof( float ), cudaMemcpyDeviceToHost ) ); + CUDAFREECTRL( "d_param_arr", d_param_arr ); + return h_param_arr; } // get scalar parameters param_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] -float *BaseNeuron::GetScalParam(int *i_neuron, int n_neuron, - std::string param_name) +float* +BaseNeuron::GetScalParam( int* i_neuron, int n_neuron, std::string param_name ) { - if (!IsScalParam(param_name)) { - throw ngpu_exception(std::string("Unrecognized scalar parameter ") - + param_name); + if ( !IsScalParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized scalar parameter " ) + param_name ); } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); // Memcopy will be synchronized with BaseNeuronGetFloatPtArray kernel - gpuErrchk(cudaMemcpyAsync(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - float *param_pt = GetParamPt(0, param_name); - - float *d_param_arr; - CUDAMALLOCCTRL("&d_param_arr",&d_param_arr, n_neuron*sizeof(float)); - float *h_param_arr = (float*)malloc(n_neuron*sizeof(float)); - - BaseNeuronGetFloatPtArray<<<(n_neuron+1023)/1024, 1024>>> - (param_pt, d_param_arr, d_i_neuron, n_neuron, n_param_, 1); + gpuErrchk( cudaMemcpyAsync( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + float* param_pt = GetParamPt( 0, param_name ); + + float* d_param_arr; + CUDAMALLOCCTRL( "&d_param_arr", &d_param_arr, n_neuron * sizeof( 
float ) ); + float* h_param_arr = ( float* ) malloc( n_neuron * sizeof( float ) ); + + BaseNeuronGetFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( + param_pt, d_param_arr, d_i_neuron, n_neuron, n_param_, 1 ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_i_neuron",d_i_neuron); + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); - gpuErrchk(cudaMemcpy(h_param_arr, d_param_arr, n_neuron*sizeof(float), - cudaMemcpyDeviceToHost)); - CUDAFREECTRL("d_param_arr",d_param_arr); + gpuErrchk( cudaMemcpy( h_param_arr, d_param_arr, n_neuron * sizeof( float ), cudaMemcpyDeviceToHost ) ); + CUDAFREECTRL( "d_param_arr", d_param_arr ); return h_param_arr; } // get receptor-port parameters param_name of neurons // i_neuron, ..., i_neuron + n_neuron -1 -float *BaseNeuron::GetPortParam(int i_neuron, int n_neuron, - std::string param_name) -{ - if (!IsPortParam(param_name)) { - throw ngpu_exception(std::string("Unrecognized port parameter ") - + param_name); - } - CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + n_neuron - 1); - float *param_pt; - - float *d_param_arr; - CUDAMALLOCCTRL("&d_param_arr",&d_param_arr, n_neuron*n_port_*sizeof(float)); - float *h_param_arr = (float*)malloc(n_neuron*n_port_*sizeof(float)); - - for (int port=0; port>> - (param_pt, d_param_arr + port, n_neuron, n_param_, n_port_); +float* +BaseNeuron::GetPortParam( int i_neuron, int n_neuron, std::string param_name ) +{ + if ( !IsPortParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port parameter " ) + param_name ); + } + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); + float* param_pt; + + float* d_param_arr; + CUDAMALLOCCTRL( "&d_param_arr", &d_param_arr, n_neuron * n_port_ * sizeof( float ) ); + float* h_param_arr = ( float* ) malloc( n_neuron * n_port_ * sizeof( float ) ); + + for ( int port = 0; port < n_port_; port++ ) + { + param_pt = GetParamPt( i_neuron, param_name, port ); + 
BaseNeuronGetFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( + param_pt, d_param_arr + port, n_neuron, n_param_, n_port_ ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } - gpuErrchk(cudaMemcpy(h_param_arr, d_param_arr, n_neuron*n_port_ - *sizeof(float), cudaMemcpyDeviceToHost)); - CUDAFREECTRL("d_param_arr",d_param_arr); - + gpuErrchk( cudaMemcpy( h_param_arr, d_param_arr, n_neuron * n_port_ * sizeof( float ), cudaMemcpyDeviceToHost ) ); + CUDAFREECTRL( "d_param_arr", d_param_arr ); + return h_param_arr; } // get receptor-port parameters param_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] -float *BaseNeuron::GetPortParam(int *i_neuron, int n_neuron, - std::string param_name) +float* +BaseNeuron::GetPortParam( int* i_neuron, int n_neuron, std::string param_name ) { - if (!IsPortParam(param_name)) { - throw ngpu_exception(std::string("Unrecognized port parameter ") - + param_name); + if ( !IsPortParam( param_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port parameter " ) + param_name ); } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); // Memcopy will be synchronized with BaseNeuronGetFloatPtArray kernel - gpuErrchk(cudaMemcpyAsync(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - - float *d_param_arr; - CUDAMALLOCCTRL("&d_param_arr",&d_param_arr, n_neuron*n_port_*sizeof(float)); - float *h_param_arr = (float*)malloc(n_neuron*n_port_*sizeof(float)); - - for (int port=0; port>> - (param_pt, d_param_arr+port, d_i_neuron, n_neuron, n_param_, - n_port_); + gpuErrchk( cudaMemcpyAsync( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + + float* d_param_arr; + CUDAMALLOCCTRL( "&d_param_arr", &d_param_arr, n_neuron * n_port_ * sizeof( float ) ); + float* h_param_arr = ( float* ) malloc( n_neuron * n_port_ * sizeof( float ) ); + + 
for ( int port = 0; port < n_port_; port++ ) + { + float* param_pt = GetParamPt( 0, param_name, port ); + BaseNeuronGetFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( + param_pt, d_param_arr + port, d_i_neuron, n_neuron, n_param_, n_port_ ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } - CUDAFREECTRL("d_i_neuron",d_i_neuron); - - gpuErrchk(cudaMemcpy(h_param_arr, d_param_arr, n_neuron*n_port_ - *sizeof(float), cudaMemcpyDeviceToHost)); - CUDAFREECTRL("d_param_arr",d_param_arr); - + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); + + gpuErrchk( cudaMemcpy( h_param_arr, d_param_arr, n_neuron * n_port_ * sizeof( float ), cudaMemcpyDeviceToHost ) ); + CUDAFREECTRL( "d_param_arr", d_param_arr ); + return h_param_arr; } // get array-parameter param_name of neuron i_neuron // must be defined in the derived classes -float *BaseNeuron::GetArrayParam(int i_neuron, std::string param_name) +float* +BaseNeuron::GetArrayParam( int i_neuron, std::string param_name ) { - throw ngpu_exception(std::string("Unrecognized parameter ") - + param_name); + throw ngpu_exception( std::string( "Unrecognized parameter " ) + param_name ); } // get neuron-group parameter param_name -float BaseNeuron::GetGroupParam(std::string param_name) +float +BaseNeuron::GetGroupParam( std::string param_name ) { int i_param; - for (i_param=0; i_param>> - (var_pt, d_var_arr, n_neuron, 1, 1); + BaseNeuronGetIntArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, d_var_arr, n_neuron, 1, 1 ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - - gpuErrchk(cudaMemcpy(h_var_arr, d_var_arr, n_neuron*sizeof(int), - cudaMemcpyDeviceToHost)); - CUDAFREECTRL("d_var_arr",d_var_arr); - + + gpuErrchk( cudaMemcpy( h_var_arr, d_var_arr, n_neuron * sizeof( int ), cudaMemcpyDeviceToHost ) ); + CUDAFREECTRL( "d_var_arr", d_var_arr ); + return h_var_arr; } // get integer variable var_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] -int 
*BaseNeuron::GetIntVar(int *i_neuron, int n_neuron, - std::string var_name) +int* +BaseNeuron::GetIntVar( int* i_neuron, int n_neuron, std::string var_name ) { - if (!IsIntVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized integer variable ") - + var_name); + if ( !IsIntVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized integer variable " ) + var_name ); } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); // Memcopy will be synchronized with BaseNeuronGetIntPtArray kernel - gpuErrchk(cudaMemcpyAsync(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - int *var_pt = GetIntVarPt(0, var_name); - - int *d_var_arr; - CUDAMALLOCCTRL("&d_var_arr",&d_var_arr, n_neuron*sizeof(int)); - int *h_var_arr = (int*)malloc(n_neuron*sizeof(int)); - - BaseNeuronGetIntPtArray<<<(n_neuron+1023)/1024, 1024>>> - (var_pt, d_var_arr, d_i_neuron, n_neuron, 1, 1); + gpuErrchk( cudaMemcpyAsync( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + int* var_pt = GetIntVarPt( 0, var_name ); + + int* d_var_arr; + CUDAMALLOCCTRL( "&d_var_arr", &d_var_arr, n_neuron * sizeof( int ) ); + int* h_var_arr = ( int* ) malloc( n_neuron * sizeof( int ) ); + + BaseNeuronGetIntPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, d_var_arr, d_i_neuron, n_neuron, 1, 1 ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_i_neuron",d_i_neuron); + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); + + gpuErrchk( cudaMemcpy( h_var_arr, d_var_arr, n_neuron * sizeof( int ), cudaMemcpyDeviceToHost ) ); + CUDAFREECTRL( "d_var_arr", d_var_arr ); - gpuErrchk(cudaMemcpy(h_var_arr, d_var_arr, n_neuron*sizeof(int), - cudaMemcpyDeviceToHost)); - CUDAFREECTRL("d_var_arr",d_var_arr); - return h_var_arr; } // get scalar state-variable var_name of neurons // i_neuron, ..., i_neuron + 
n_neuron -1 -float *BaseNeuron::GetScalVar(int i_neuron, int n_neuron, - std::string var_name) +float* +BaseNeuron::GetScalVar( int i_neuron, int n_neuron, std::string var_name ) { - if (!IsScalVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized scalar variable ") - + var_name); + if ( !IsScalVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized scalar variable " ) + var_name ); } - CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + n_neuron - 1); - float *var_pt = GetVarPt(i_neuron, var_name); + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); + float* var_pt = GetVarPt( i_neuron, var_name ); - float *d_var_arr; - CUDAMALLOCCTRL("&d_var_arr",&d_var_arr, n_neuron*sizeof(float)); - float *h_var_arr = (float*)malloc(n_neuron*sizeof(float)); + float* d_var_arr; + CUDAMALLOCCTRL( "&d_var_arr", &d_var_arr, n_neuron * sizeof( float ) ); + float* h_var_arr = ( float* ) malloc( n_neuron * sizeof( float ) ); - BaseNeuronGetFloatArray<<<(n_neuron+1023)/1024, 1024>>> - (var_pt, d_var_arr, n_neuron, n_var_, 1); + BaseNeuronGetFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( var_pt, d_var_arr, n_neuron, n_var_, 1 ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - - gpuErrchk(cudaMemcpy(h_var_arr, d_var_arr, n_neuron*sizeof(float), - cudaMemcpyDeviceToHost)); - CUDAFREECTRL("d_var_arr",d_var_arr); - + + gpuErrchk( cudaMemcpy( h_var_arr, d_var_arr, n_neuron * sizeof( float ), cudaMemcpyDeviceToHost ) ); + CUDAFREECTRL( "d_var_arr", d_var_arr ); + return h_var_arr; } // get scalar state-variable var_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] -float *BaseNeuron::GetScalVar(int *i_neuron, int n_neuron, - std::string var_name) +float* +BaseNeuron::GetScalVar( int* i_neuron, int n_neuron, std::string var_name ) { - if (!IsScalVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized scalar variable ") - + var_name); + if ( !IsScalVar( var_name ) ) + { + throw 
ngpu_exception( std::string( "Unrecognized scalar variable " ) + var_name ); } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); + int* d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); // Memcopy will be synchronized with BaseNeuronGetFloatPtArray kernel - gpuErrchk(cudaMemcpyAsync(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - float *var_pt = GetVarPt(0, var_name); - - float *d_var_arr; - CUDAMALLOCCTRL("&d_var_arr",&d_var_arr, n_neuron*sizeof(float)); - float *h_var_arr = (float*)malloc(n_neuron*sizeof(float)); - - BaseNeuronGetFloatPtArray<<<(n_neuron+1023)/1024, 1024>>> - (var_pt, d_var_arr, d_i_neuron, n_neuron, n_var_, 1); + gpuErrchk( cudaMemcpyAsync( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + float* var_pt = GetVarPt( 0, var_name ); + + float* d_var_arr; + CUDAMALLOCCTRL( "&d_var_arr", &d_var_arr, n_neuron * sizeof( float ) ); + float* h_var_arr = ( float* ) malloc( n_neuron * sizeof( float ) ); + + BaseNeuronGetFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( + var_pt, d_var_arr, d_i_neuron, n_neuron, n_var_, 1 ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - CUDAFREECTRL("d_i_neuron",d_i_neuron); + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); - gpuErrchk(cudaMemcpy(h_var_arr, d_var_arr, n_neuron*sizeof(float), - cudaMemcpyDeviceToHost)); - CUDAFREECTRL("d_var_arr",d_var_arr); + gpuErrchk( cudaMemcpy( h_var_arr, d_var_arr, n_neuron * sizeof( float ), cudaMemcpyDeviceToHost ) ); + CUDAFREECTRL( "d_var_arr", d_var_arr ); return h_var_arr; } // get receptor-port state-variable var_name of neurons // i_neuron, ..., i_neuron + n_neuron -1 -float *BaseNeuron::GetPortVar(int i_neuron, int n_neuron, - std::string var_name) -{ - if (!IsPortVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized port variable ") - + var_name); - } - CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + 
n_neuron - 1); - float *var_pt; - - float *d_var_arr; - CUDAMALLOCCTRL("&d_var_arr",&d_var_arr, n_neuron*n_port_*sizeof(float)); - float *h_var_arr = (float*)malloc(n_neuron*n_port_*sizeof(float)); - - for (int port=0; port>> - (var_pt, d_var_arr + port, n_neuron, n_var_, n_port_); +float* +BaseNeuron::GetPortVar( int i_neuron, int n_neuron, std::string var_name ) +{ + if ( !IsPortVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port variable " ) + var_name ); + } + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); + float* var_pt; + + float* d_var_arr; + CUDAMALLOCCTRL( "&d_var_arr", &d_var_arr, n_neuron * n_port_ * sizeof( float ) ); + float* h_var_arr = ( float* ) malloc( n_neuron * n_port_ * sizeof( float ) ); + + for ( int port = 0; port < n_port_; port++ ) + { + var_pt = GetVarPt( i_neuron, var_name, port ); + BaseNeuronGetFloatArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( + var_pt, d_var_arr + port, n_neuron, n_var_, n_port_ ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } - gpuErrchk(cudaMemcpy(h_var_arr, d_var_arr, n_neuron*n_port_ - *sizeof(float), cudaMemcpyDeviceToHost)); - CUDAFREECTRL("d_var_arr",d_var_arr); - + gpuErrchk( cudaMemcpy( h_var_arr, d_var_arr, n_neuron * n_port_ * sizeof( float ), cudaMemcpyDeviceToHost ) ); + CUDAFREECTRL( "d_var_arr", d_var_arr ); + return h_var_arr; } // get receptor-port state-variable var_name of neurons // i_neuron[0], ..., i_neuron[n_neuron -1] -float *BaseNeuron::GetPortVar(int *i_neuron, int n_neuron, - std::string var_name) +float* +BaseNeuron::GetPortVar( int* i_neuron, int n_neuron, std::string var_name ) { - if (!IsPortVar(var_name)) { - throw ngpu_exception(std::string("Unrecognized port variable ") - + var_name); + if ( !IsPortVar( var_name ) ) + { + throw ngpu_exception( std::string( "Unrecognized port variable " ) + var_name ); } - int *d_i_neuron; - CUDAMALLOCCTRL("&d_i_neuron",&d_i_neuron, n_neuron*sizeof(int)); + int* 
d_i_neuron; + CUDAMALLOCCTRL( "&d_i_neuron", &d_i_neuron, n_neuron * sizeof( int ) ); // Memcopy will be synchronized with BaseNeuronGetFloatPtArray kernel - gpuErrchk(cudaMemcpyAsync(d_i_neuron, i_neuron, n_neuron*sizeof(int), - cudaMemcpyHostToDevice)); - - float *d_var_arr; - CUDAMALLOCCTRL("&d_var_arr",&d_var_arr, n_neuron*n_port_*sizeof(float)); - float *h_var_arr = (float*)malloc(n_neuron*n_port_*sizeof(float)); - - for (int port=0; port>> - (var_pt, d_var_arr+port, d_i_neuron, n_neuron, n_var_, n_port_); + gpuErrchk( cudaMemcpyAsync( d_i_neuron, i_neuron, n_neuron * sizeof( int ), cudaMemcpyHostToDevice ) ); + + float* d_var_arr; + CUDAMALLOCCTRL( "&d_var_arr", &d_var_arr, n_neuron * n_port_ * sizeof( float ) ); + float* h_var_arr = ( float* ) malloc( n_neuron * n_port_ * sizeof( float ) ); + + for ( int port = 0; port < n_port_; port++ ) + { + float* var_pt = GetVarPt( 0, var_name, port ); + BaseNeuronGetFloatPtArray<<< ( n_neuron + 1023 ) / 1024, 1024 >>>( + var_pt, d_var_arr + port, d_i_neuron, n_neuron, n_var_, n_port_ ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } - CUDAFREECTRL("d_i_neuron",d_i_neuron); - - gpuErrchk(cudaMemcpy(h_var_arr, d_var_arr, n_neuron*n_port_ - *sizeof(float), cudaMemcpyDeviceToHost)); - CUDAFREECTRL("d_var_arr",d_var_arr); - + CUDAFREECTRL( "d_i_neuron", d_i_neuron ); + + gpuErrchk( cudaMemcpy( h_var_arr, d_var_arr, n_neuron * n_port_ * sizeof( float ), cudaMemcpyDeviceToHost ) ); + CUDAFREECTRL( "d_var_arr", d_var_arr ); + return h_var_arr; } // get array variable var_name of neuron i_neuron -float *BaseNeuron::GetArrayVar(int i_neuron, std::string var_name) +float* +BaseNeuron::GetArrayVar( int i_neuron, std::string var_name ) { - throw ngpu_exception(std::string("Unrecognized variable ") - + var_name); + throw ngpu_exception( std::string( "Unrecognized variable " ) + var_name ); } // get index of integer variable var_name -int BaseNeuron::GetIntVarIdx(std::string var_name) +int 
+BaseNeuron::GetIntVarIdx( std::string var_name ) { int i_var; - for (i_var=0; i_var=0 and =n_node_) { - throw ngpu_exception("Neuron index must be lower then n. of neurons"); + if ( i_neuron >= n_node_ ) + { + throw ngpu_exception( "Neuron index must be lower then n. of neurons" ); } - else if (i_neuron<0) { - throw ngpu_exception("Neuron index must be >= 0"); + else if ( i_neuron < 0 ) + { + throw ngpu_exception( "Neuron index must be >= 0" ); } return 0; } // check if index port is >=0 and =n_port_) { - throw ngpu_exception("Port index must be lower then n. of ports"); + if ( port >= n_port_ ) + { + throw ngpu_exception( "Port index must be lower then n. of ports" ); } - else if (port<0) { - throw ngpu_exception("Port index must be >= 0"); + else if ( port < 0 ) + { + throw ngpu_exception( "Port index must be >= 0" ); } return 0; } // return pointer to integer variable var_name for neuron i_neuron -int *BaseNeuron::GetIntVarPt(int i_neuron, std::string var_name) +int* +BaseNeuron::GetIntVarPt( int i_neuron, std::string var_name ) { - CheckNeuronIdx(i_neuron); - - if (IsIntVar(var_name)) { - int i_var = GetIntVarIdx(var_name); - return int_var_pt_[i_var] + i_neuron; + CheckNeuronIdx( i_neuron ); + + if ( IsIntVar( var_name ) ) + { + int i_var = GetIntVarIdx( var_name ); + return int_var_pt_[ i_var ] + i_neuron; } - else { - throw ngpu_exception(std::string("Unrecognized integer variable ") - + var_name); + else + { + throw ngpu_exception( std::string( "Unrecognized integer variable " ) + var_name ); } } // return pointer to variable var_name for neuron i_neuron // (and specified receptor port in case of a port variable) -float *BaseNeuron::GetVarPt(int i_neuron, std::string var_name, - int port /*=0*/) +float* +BaseNeuron::GetVarPt( int i_neuron, std::string var_name, int port /*=0*/ ) { - CheckNeuronIdx(i_neuron); - if (port!=0) { - CheckPortIdx(port); + CheckNeuronIdx( i_neuron ); + if ( port != 0 ) + { + CheckPortIdx( port ); } - - if (IsScalVar(var_name)) { - 
int i_var = GetScalVarIdx(var_name); - return GetVarArr() + i_neuron*n_var_ + i_var; + + if ( IsScalVar( var_name ) ) + { + int i_var = GetScalVarIdx( var_name ); + return GetVarArr() + i_neuron * n_var_ + i_var; } - else if (IsPortVar(var_name)) { - int i_vvar = GetPortVarIdx(var_name); - return GetVarArr() + i_neuron*n_var_ + n_scal_var_ - + port*n_port_var_ + i_vvar; + else if ( IsPortVar( var_name ) ) + { + int i_vvar = GetPortVarIdx( var_name ); + return GetVarArr() + i_neuron * n_var_ + n_scal_var_ + port * n_port_var_ + i_vvar; } - else { - throw ngpu_exception(std::string("Unrecognized variable ") - + var_name); + else + { + throw ngpu_exception( std::string( "Unrecognized variable " ) + var_name ); } } // return pointer to parameter param_name for neuron i_neuron // (and specified receptor port in case of a port parameter) -float *BaseNeuron::GetParamPt(int i_neuron, std::string param_name, - int port /*=0*/) +float* +BaseNeuron::GetParamPt( int i_neuron, std::string param_name, int port /*=0*/ ) { - CheckNeuronIdx(i_neuron); - if (port!=0) { - CheckPortIdx(port); + CheckNeuronIdx( i_neuron ); + if ( port != 0 ) + { + CheckPortIdx( port ); } - if (IsScalParam(param_name)) { - int i_param = GetScalParamIdx(param_name); - return GetParamArr() + i_neuron*n_param_ + i_param; + if ( IsScalParam( param_name ) ) + { + int i_param = GetScalParamIdx( param_name ); + return GetParamArr() + i_neuron * n_param_ + i_param; } - else if (IsPortParam(param_name)) { - int i_vparam = GetPortParamIdx(param_name); - return GetParamArr() + i_neuron*n_param_ + n_scal_param_ - + port*n_port_param_ + i_vparam; + else if ( IsPortParam( param_name ) ) + { + int i_vparam = GetPortParamIdx( param_name ); + return GetParamArr() + i_neuron * n_param_ + n_scal_param_ + port * n_port_param_ + i_vparam; } - else { - throw ngpu_exception(std::string("Unrecognized parameter ") - + param_name); + else + { + throw ngpu_exception( std::string( "Unrecognized parameter " ) + param_name ); } } 
-// return spike multiplicity (spike_height) of neuron i_neuron +// return spike multiplicity (spike_height) of neuron i_neuron // if neuron emitted a spike in the current time step // otherwise return 0 -float BaseNeuron::GetSpikeActivity(int i_neuron) +float +BaseNeuron::GetSpikeActivity( int i_neuron ) { - CheckNeuronIdx(i_neuron); + CheckNeuronIdx( i_neuron ); int i_spike_buffer = i_neuron + i_node_0_; int Ns; - gpuErrchk(cudaMemcpy(&Ns, d_SpikeBufferSize + i_spike_buffer, - sizeof(int), cudaMemcpyDeviceToHost)); - if (Ns==0) { + gpuErrchk( cudaMemcpy( &Ns, d_SpikeBufferSize + i_spike_buffer, sizeof( int ), cudaMemcpyDeviceToHost ) ); + if ( Ns == 0 ) + { return 0.0; } - + int is0; - gpuErrchk(cudaMemcpy(&is0, d_SpikeBufferIdx0 + i_spike_buffer, - sizeof(int), cudaMemcpyDeviceToHost)); - int i_arr = is0*h_NSpikeBuffer+i_spike_buffer; // spike index in array + gpuErrchk( cudaMemcpy( &is0, d_SpikeBufferIdx0 + i_spike_buffer, sizeof( int ), cudaMemcpyDeviceToHost ) ); + int i_arr = is0 * h_NSpikeBuffer + i_spike_buffer; // spike index in array int time_idx; // get first (most recent) spike from buffer - gpuErrchk(cudaMemcpy(&time_idx, d_SpikeBufferTimeIdx + i_arr, - sizeof(int), cudaMemcpyDeviceToHost)); - if (time_idx!=0) { // neuron is not spiking now + gpuErrchk( cudaMemcpy( &time_idx, d_SpikeBufferTimeIdx + i_arr, sizeof( int ), cudaMemcpyDeviceToHost ) ); + if ( time_idx != 0 ) + { // neuron is not spiking now return 0.0; } float spike_height; - gpuErrchk(cudaMemcpy(&spike_height, d_SpikeBufferHeight + i_arr, - sizeof(float), cudaMemcpyDeviceToHost)); + gpuErrchk( cudaMemcpy( &spike_height, d_SpikeBufferHeight + i_arr, sizeof( float ), cudaMemcpyDeviceToHost ) ); return spike_height; } // get all names of integer variables -std::vector BaseNeuron::GetIntVarNames() +std::vector< std::string > +BaseNeuron::GetIntVarNames() { return int_var_name_; } // get all names of scalar state variables -std::vector BaseNeuron::GetScalVarNames() +std::vector< std::string > 
+BaseNeuron::GetScalVarNames() { - std::vector var_name_vect; - for (int i=0; i var_name_vect; + for ( int i = 0; i < n_scal_var_; i++ ) + { + var_name_vect.push_back( scal_var_name_[ i ] ); } - + return var_name_vect; } // get number of scalar state variables -int BaseNeuron::GetNScalVar() +int +BaseNeuron::GetNScalVar() { return n_scal_var_; } // get number of integer variables -int BaseNeuron::GetNIntVar() +int +BaseNeuron::GetNIntVar() { - return (int)int_var_name_.size(); + return ( int ) int_var_name_.size(); } // get all names of receptor-port state variables -std::vector BaseNeuron::GetPortVarNames() +std::vector< std::string > +BaseNeuron::GetPortVarNames() { - std::vector var_name_vect; - for (int i=0; i var_name_vect; + for ( int i = 0; i < n_port_var_; i++ ) + { + var_name_vect.push_back( port_var_name_[ i ] ); } - + return var_name_vect; } -// get number of receptor-port variables -int BaseNeuron::GetNPortVar() +// get number of receptor-port variables +int +BaseNeuron::GetNPortVar() { return n_port_var_; } // get all names of scalar parameters -std::vector BaseNeuron::GetScalParamNames() +std::vector< std::string > +BaseNeuron::GetScalParamNames() { - std::vector param_name_vect; - for (int i=0; i param_name_vect; + for ( int i = 0; i < n_scal_param_; i++ ) + { + param_name_vect.push_back( scal_param_name_[ i ] ); } - + return param_name_vect; } // get number of scalar parameters -int BaseNeuron::GetNScalParam() +int +BaseNeuron::GetNScalParam() { return n_scal_param_; } // get all names of receptor-port parameters -std::vector BaseNeuron::GetPortParamNames() +std::vector< std::string > +BaseNeuron::GetPortParamNames() { - std::vector param_name_vect; - for (int i=0; i param_name_vect; + for ( int i = 0; i < n_port_param_; i++ ) + { + param_name_vect.push_back( port_param_name_[ i ] ); } - + return param_name_vect; } // get number of receptor-port parameters -int BaseNeuron::GetNPortParam() +int +BaseNeuron::GetNPortParam() { return n_port_param_; } 
// get all names of neuron-group parameters -std::vector BaseNeuron::GetGroupParamNames() +std::vector< std::string > +BaseNeuron::GetGroupParamNames() { - std::vector param_name_vect; - for (int i=0; i param_name_vect; + for ( int i = 0; i < n_group_param_; i++ ) + { + param_name_vect.push_back( group_param_name_[ i ] ); } - + return param_name_vect; } // get number of neuron-group parameters -int BaseNeuron::GetNGroupParam() +int +BaseNeuron::GetNGroupParam() { return n_group_param_; } // get all names of array variables -std::vector BaseNeuron::GetArrayVarNames() +std::vector< std::string > +BaseNeuron::GetArrayVarNames() { - std::vector var_name_vect; - for (int i=0; i var_name_vect; + for ( int i = 0; i < GetNArrayVar(); i++ ) + { + var_name_vect.push_back( array_var_name_[ i ] ); } - + return var_name_vect; } // get number of array variables -int BaseNeuron::GetNArrayVar() +int +BaseNeuron::GetNArrayVar() { - return (int)array_var_name_.size(); + return ( int ) array_var_name_.size(); } // get all names of array parameters -std::vector BaseNeuron::GetArrayParamNames() +std::vector< std::string > +BaseNeuron::GetArrayParamNames() { - std::vector param_name_vect; - for (int i=0; i param_name_vect; + for ( int i = 0; i < GetNArrayParam(); i++ ) + { + param_name_vect.push_back( array_param_name_[ i ] ); } - + return param_name_vect; } // get number of array parameters -int BaseNeuron::GetNArrayParam() +int +BaseNeuron::GetNArrayParam() { - return (int)array_param_name_.size(); + return ( int ) array_param_name_.size(); } // activate spike count for all neurons of the group -int BaseNeuron::ActivateSpikeCount() +int +BaseNeuron::ActivateSpikeCount() { const std::string s = "spike_count"; - if (std::find(int_var_name_.begin(), int_var_name_.end(), s) - == int_var_name_.end()) { // add it if not already present - int_var_name_.push_back(s); + if ( std::find( int_var_name_.begin(), int_var_name_.end(), s ) == int_var_name_.end() ) + { // add it if not already present 
+ int_var_name_.push_back( s ); - CUDAMALLOCCTRL("&spike_count_",&spike_count_, n_node_*sizeof(int)); - gpuErrchk(cudaMemset(spike_count_, 0, n_node_*sizeof(int))); - int_var_pt_.push_back(spike_count_); + CUDAMALLOCCTRL( "&spike_count_", &spike_count_, n_node_ * sizeof( int ) ); + gpuErrchk( cudaMemset( spike_count_, 0, n_node_ * sizeof( int ) ) ); + int_var_pt_.push_back( spike_count_ ); } - else { - throw ngpu_exception("Spike count already activated"); + else + { + throw ngpu_exception( "Spike count already activated" ); } - return 0; } // activate spike-time recording for all neurons of the group -int BaseNeuron::ActivateRecSpikeTimes(int max_n_rec_spike_times) +int +BaseNeuron::ActivateRecSpikeTimes( int max_n_rec_spike_times ) { - if(max_n_rec_spike_times<=0) { - throw ngpu_exception("Maximum number of recorded spike times " - "must be greater than 0"); + if ( max_n_rec_spike_times <= 0 ) + { + throw ngpu_exception( + "Maximum number of recorded spike times " + "must be greater than 0" ); } const std::string s = "n_rec_spike_times"; - if (std::find(int_var_name_.begin(), int_var_name_.end(), s) - == int_var_name_.end()) { // add it if not already present - int_var_name_.push_back(s); - - CUDAMALLOCCTRL("&n_rec_spike_times_",&n_rec_spike_times_, n_node_*sizeof(int)); - CUDAMALLOCCTRL("&n_rec_spike_times_cumul_",&n_rec_spike_times_cumul_, - (n_node_+1)*sizeof(int)); - gpuErrchk(cudaMemset(n_rec_spike_times_, 0, n_node_*sizeof(int))); - int_var_pt_.push_back(n_rec_spike_times_); - + if ( std::find( int_var_name_.begin(), int_var_name_.end(), s ) == int_var_name_.end() ) + { // add it if not already present + int_var_name_.push_back( s ); + + CUDAMALLOCCTRL( "&n_rec_spike_times_", &n_rec_spike_times_, n_node_ * sizeof( int ) ); + CUDAMALLOCCTRL( "&n_rec_spike_times_cumul_", &n_rec_spike_times_cumul_, ( n_node_ + 1 ) * sizeof( int ) ); + gpuErrchk( cudaMemset( n_rec_spike_times_, 0, n_node_ * sizeof( int ) ) ); + int_var_pt_.push_back( n_rec_spike_times_ ); + 
max_n_rec_spike_times_ = max_n_rec_spike_times; - CUDAMALLOCCTRL("&rec_spike_times_",&rec_spike_times_, n_node_*max_n_rec_spike_times - *sizeof(int)); - CUDAMALLOCCTRL("&rec_spike_times_pack_",&rec_spike_times_pack_, n_node_*max_n_rec_spike_times - *sizeof(int)); - spike_times_pt_vect_.resize(n_node_, NULL); - n_spike_times_vect_.resize(n_node_, 0); - spike_times_vect_.resize(n_node_); + CUDAMALLOCCTRL( "&rec_spike_times_", &rec_spike_times_, n_node_ * max_n_rec_spike_times * sizeof( int ) ); + CUDAMALLOCCTRL( "&rec_spike_times_pack_", &rec_spike_times_pack_, n_node_ * max_n_rec_spike_times * sizeof( int ) ); + spike_times_pt_vect_.resize( n_node_, nullptr ); + n_spike_times_vect_.resize( n_node_, 0 ); + spike_times_vect_.resize( n_node_ ); } - else { - throw ngpu_exception("Spike times recording already activated"); + else + { + throw ngpu_exception( "Spike times recording already activated" ); } return 0; } // set number of time steps for buffering recorded spike times -int BaseNeuron::SetRecSpikeTimesStep(int rec_spike_times_step) +int +BaseNeuron::SetRecSpikeTimesStep( int rec_spike_times_step ) { rec_spike_times_step_ = rec_spike_times_step; @@ -1689,155 +1772,164 @@ int BaseNeuron::SetRecSpikeTimesStep(int rec_spike_times_step) } // get number of spikes recorded for neuron i_neuron -int BaseNeuron::GetNRecSpikeTimes(int i_neuron) +int +BaseNeuron::GetNRecSpikeTimes( int i_neuron ) { - CheckNeuronIdx(i_neuron); - if(max_n_rec_spike_times_<=0) { - throw ngpu_exception("Spike times recording was not activated"); + CheckNeuronIdx( i_neuron ); + if ( max_n_rec_spike_times_ <= 0 ) + { + throw ngpu_exception( "Spike times recording was not activated" ); } int n_spikes; - - gpuErrchk(cudaMemcpy(&n_spikes, &n_rec_spike_times_[i_neuron], sizeof(int), - cudaMemcpyDeviceToHost)); + + gpuErrchk( cudaMemcpy( &n_spikes, &n_rec_spike_times_[ i_neuron ], sizeof( int ), cudaMemcpyDeviceToHost ) ); return n_spikes; } // get input spikes from external interface // Must be 
defined in derived classes -float *BaseNeuron::GetExtNeuronInputSpikes(int *n_node, int *n_port) +float* +BaseNeuron::GetExtNeuronInputSpikes( int* n_node, int* n_port ) { - throw ngpu_exception("Cannot get extern neuron input spikes from this model"); + throw ngpu_exception( "Cannot get extern neuron input spikes from this model" ); } // set neuron-group parameter param_name to value val // Must be defined in derived classes -int BaseNeuron::SetNeuronGroupParam(std::string param_name, float val) +int +BaseNeuron::SetNeuronGroupParam( std::string param_name, float val ) { - throw ngpu_exception(std::string("Unrecognized neuron group parameter ") - + param_name); + throw ngpu_exception( std::string( "Unrecognized neuron group parameter " ) + param_name ); } - - // kernel for packing spike times of neurons // i_neuron, ..., i_neuron + n_neuron -1 -// in contiguous locations in GPU memory -__global__ void PackSpikeTimesKernel(int n_neuron, int *n_rec_spike_times_cumul, - float *rec_spike_times, float *rec_spike_times_pack, - int n_spike_tot, int max_n_rec_spike_times) -{ - // array_idx: index on one-dimensional packed spike array +// in contiguous locations in GPU memory +__global__ void +PackSpikeTimesKernel( int n_neuron, + int* n_rec_spike_times_cumul, + float* rec_spike_times, + float* rec_spike_times_pack, + int n_spike_tot, + int max_n_rec_spike_times ) +{ + // array_idx: index on one-dimensional packed spike array int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - if (array_idx0) { + if ( n_spike_tot > 0 ) + { // pack spike times in GPU memory - PackSpikeTimesKernel<<<(n_spike_tot+1023)/1024, 1024>>>(n_node_, - n_rec_spike_times_cumul_, - rec_spike_times_, - rec_spike_times_pack_, - n_spike_tot, max_n_rec_spike_times_); - - float *h_rec_spike_times_pack = new float[n_spike_tot]; - gpuErrchk(cudaMemcpy(h_rec_spike_times_pack, - rec_spike_times_pack_, - sizeof(float)*n_spike_tot, cudaMemcpyDeviceToHost)); + PackSpikeTimesKernel<<< ( n_spike_tot + 1023 ) / 
1024, 1024 >>>( + n_node_, n_rec_spike_times_cumul_, rec_spike_times_, rec_spike_times_pack_, n_spike_tot, max_n_rec_spike_times_ ); + + float* h_rec_spike_times_pack = new float[ n_spike_tot ]; + gpuErrchk( cudaMemcpy( + h_rec_spike_times_pack, rec_spike_times_pack_, sizeof( float ) * n_spike_tot, cudaMemcpyDeviceToHost ) ); // push the packed spike array and the cumulative sum in the buffers - spike_times_buffer_.push_back(h_rec_spike_times_pack); - n_spike_times_cumul_buffer_.push_back(h_n_rec_spike_times_cumul); - gpuErrchk(cudaMemset(n_rec_spike_times_, 0, n_node_*sizeof(int))); + spike_times_buffer_.push_back( h_rec_spike_times_pack ); + n_spike_times_cumul_buffer_.push_back( h_n_rec_spike_times_cumul ); + gpuErrchk( cudaMemset( n_rec_spike_times_, 0, n_node_ * sizeof( int ) ) ); } - else { + else + { delete[] h_n_rec_spike_times_cumul; } - + return 0; } // get recorded spike times -int BaseNeuron::GetRecSpikeTimes(int **n_spike_times_pt, - float ***spike_times_pt) +int +BaseNeuron::GetRecSpikeTimes( int** n_spike_times_pt, float*** spike_times_pt ) { - if(max_n_rec_spike_times_<=0) { - throw ngpu_exception("Spike times recording was not activated"); + if ( max_n_rec_spike_times_ <= 0 ) + { + throw ngpu_exception( "Spike times recording was not activated" ); } // push all spikes and cumulative sums left in the buffers BufferRecSpikeTimes(); // first evaluate the total number of spikes for each node - for (int i_node=0; i_node #include #include -#include -//#include -//#include "distribution.h" +// #include +// #include "distribution.h" class Distribution; @@ -42,7 +38,7 @@ typedef struct curandGenerator_st* curandGenerator_t; class BaseNeuron { - protected: +protected: friend class NESTGPU; int node_type_; bool ext_neuron_flag_; @@ -50,7 +46,7 @@ class BaseNeuron int n_node_; int n_port_; int i_group_; - curandGenerator_t *random_generator_; + curandGenerator_t* random_generator_; int n_int_var_; int n_scal_var_; @@ -60,294 +56,279 @@ class BaseNeuron int 
n_group_param_; int n_var_; int n_param_; - - double *get_spike_array_; - float *port_weight_arr_; + + double* get_spike_array_; + float* port_weight_arr_; int port_weight_arr_step_; int port_weight_port_step_; - float *port_input_arr_; + float* port_input_arr_; int port_input_arr_step_; int port_input_port_step_; - std::vector int_var_pt_; - float *var_arr_; - float *param_arr_; - float *group_param_; - std::vector int_var_name_; - const std::string *scal_var_name_; - const std::string *port_var_name_; - const std::string *scal_param_name_; - const std::string *port_param_name_; - const std::string *group_param_name_; - std::vector array_var_name_; - std::vector array_param_name_; - + std::vector< int* > int_var_pt_; + float* var_arr_; + float* param_arr_; + float* group_param_; + std::vector< std::string > int_var_name_; + const std::string* scal_var_name_; + const std::string* port_var_name_; + const std::string* scal_param_name_; + const std::string* port_param_name_; + const std::string* group_param_name_; + std::vector< std::string > array_var_name_; + std::vector< std::string > array_param_name_; + bool has_dir_conn_; // = false; - int *spike_count_; - float *rec_spike_times_; - float *rec_spike_times_pack_; - int *n_rec_spike_times_; - int *n_rec_spike_times_cumul_; + int* spike_count_; + float* rec_spike_times_; + float* rec_spike_times_pack_; + int* n_rec_spike_times_; + int* n_rec_spike_times_cumul_; int max_n_rec_spike_times_; int rec_spike_times_step_; - float *den_delay_arr_; - std::vector n_spike_times_cumul_buffer_; - std::vector spike_times_buffer_; - std::vector spike_times_pt_vect_; - std::vector n_spike_times_vect_; - std::vector< std::vector > spike_times_vect_; + float* den_delay_arr_; + std::vector< int* > n_spike_times_cumul_buffer_; + std::vector< float* > spike_times_buffer_; + std::vector< float* > spike_times_pt_vect_; + std::vector< int > n_spike_times_vect_; + std::vector< std::vector< float > > spike_times_vect_; + + std::vector< 
float > port_weight_vect_; + std::vector< float > port_input_vect_; - std::vector port_weight_vect_; - std::vector port_input_vect_; + std::vector< float > ext_neuron_input_spikes_; - std::vector ext_neuron_input_spikes_; - - public: - virtual ~BaseNeuron() {} - - virtual int Init(int i_node_0, int n_neuron, int n_port, - int i_neuron_group); +public: + virtual ~BaseNeuron() + { + } + virtual int Init( int i_node_0, int n_neuron, int n_port, int i_neuron_group ); virtual int AllocVarArr(); - + virtual int AllocParamArr(); virtual int FreeVarArr(); - + virtual int FreeParamArr(); - - int GetNodeType() { + + int + GetNodeType() + { return node_type_; } - bool IsExtNeuron() + bool + IsExtNeuron() { return ext_neuron_flag_; } - - virtual int Calibrate(double time_min, float time_resolution) {return 0;} - - virtual int Update(long long it, double t1) {return 0;} - - virtual int buildDirectConnections() {return 0;} - - virtual int GetX(int i_neuron, int n_neuron, double *x) {return 0;} - - virtual int GetY(int i_var, int i_neuron, int n_neuron, float *y) {return 0;} - - virtual int SetScalParam(int i_neuron, int n_neuron, std::string param_name, - float val); - virtual int SetScalParam(int *i_neuron, int n_neuron, std::string param_name, - float val); - - virtual int SetPortParam(int i_neuron, int n_neuron, std::string param_name, - float *param, int vect_size); - - virtual int SetPortParam(int *i_neuron, int n_neuron, - std::string param_name, float *param, - int vect_size); + virtual int + Calibrate( double time_min, float time_resolution ) + { + return 0; + } - virtual int SetArrayParam(int i_neuron, int n_neuron, std::string param_name, - float *array, int array_size); - - virtual int SetArrayParam(int *i_neuron, int n_neuron, - std::string param_name, float *array, - int array_size); + virtual int + Update( long long it, double t1 ) + { + return 0; + } - virtual int SetGroupParam(std::string param_name, float val); + virtual int + buildDirectConnections() + { + 
return 0; + } + + virtual int + GetX( int i_neuron, int n_neuron, double* x ) + { + return 0; + } - virtual int SetIntVar(int i_neuron, int n_neuron, std::string var_name, - int val); + virtual int + GetY( int i_var, int i_neuron, int n_neuron, float* y ) + { + return 0; + } - virtual int SetIntVar(int *i_neuron, int n_neuron, std::string var_name, - int val); + virtual int SetScalParam( int i_neuron, int n_neuron, std::string param_name, float val ); - virtual int SetScalVar(int i_neuron, int n_neuron, std::string var_name, - float val); + virtual int SetScalParam( int* i_neuron, int n_neuron, std::string param_name, float val ); - virtual int SetScalVar(int *i_neuron, int n_neuron, std::string var_name, - float val); - - virtual int SetPortVar(int i_neuron, int n_neuron, std::string var_name, - float *var, int vect_size); - - virtual int SetPortVar(int *i_neuron, int n_neuron, - std::string var_name, float *var, - int vect_size); + virtual int SetPortParam( int i_neuron, int n_neuron, std::string param_name, float* param, int vect_size ); - virtual int SetArrayVar(int i_neuron, int n_neuron, std::string var_name, - float *array, int array_size); - - virtual int SetArrayVar(int *i_neuron, int n_neuron, - std::string var_name, float *array, - int array_size); + virtual int SetPortParam( int* i_neuron, int n_neuron, std::string param_name, float* param, int vect_size ); + virtual int SetArrayParam( int i_neuron, int n_neuron, std::string param_name, float* array, int array_size ); - virtual int SetScalParamDistr(int i_neuron, int n_node, - std::string param_name, - Distribution *distribution); + virtual int SetArrayParam( int* i_neuron, int n_neuron, std::string param_name, float* array, int array_size ); - virtual int SetScalParamDistr(int *i_neuron, int n_node, - std::string param_name, - Distribution *distribution); + virtual int SetGroupParam( std::string param_name, float val ); - virtual int SetScalVarDistr(int i_neuron, int n_node, std::string var_name, - 
Distribution *distribution); + virtual int SetIntVar( int i_neuron, int n_neuron, std::string var_name, int val ); - virtual int SetScalVarDistr(int *i_neuron, int n_node, std::string var_name, - Distribution *distribution); + virtual int SetIntVar( int* i_neuron, int n_neuron, std::string var_name, int val ); - virtual int SetPortParamDistr(int i_neuron, int n_node, - std::string param_name, - Distribution *distribution); + virtual int SetScalVar( int i_neuron, int n_neuron, std::string var_name, float val ); - virtual int SetPortParamDistr(int *i_neuron, int n_node, - std::string param_name, - Distribution *distribution); + virtual int SetScalVar( int* i_neuron, int n_neuron, std::string var_name, float val ); - virtual int SetPortVarDistr(int i_neuron, int n_node, std::string var_name, - Distribution *distribution); + virtual int SetPortVar( int i_neuron, int n_neuron, std::string var_name, float* var, int vect_size ); - virtual int SetPortVarDistr(int *i_neuron, int n_node, std::string var_name, - Distribution *distribution); + virtual int SetPortVar( int* i_neuron, int n_neuron, std::string var_name, float* var, int vect_size ); - virtual float *GetScalParam(int i_neuron, int n_neuron, - std::string param_name); + virtual int SetArrayVar( int i_neuron, int n_neuron, std::string var_name, float* array, int array_size ); - virtual float *GetScalParam(int *i_neuron, int n_neuron, - std::string param_name); + virtual int SetArrayVar( int* i_neuron, int n_neuron, std::string var_name, float* array, int array_size ); - virtual float *GetPortParam(int i_neuron, int n_neuron, - std::string param_name); + virtual int SetScalParamDistr( int i_neuron, int n_node, std::string param_name, Distribution* distribution ); - virtual float *GetPortParam(int *i_neuron, int n_neuron, - std::string param_name); + virtual int SetScalParamDistr( int* i_neuron, int n_node, std::string param_name, Distribution* distribution ); - virtual float *GetArrayParam(int i_neuron, std::string 
param_name); + virtual int SetScalVarDistr( int i_neuron, int n_node, std::string var_name, Distribution* distribution ); - virtual float GetGroupParam(std::string param_name); + virtual int SetScalVarDistr( int* i_neuron, int n_node, std::string var_name, Distribution* distribution ); - virtual int *GetIntVar(int i_neuron, int n_neuron, - std::string var_name); + virtual int SetPortParamDistr( int i_neuron, int n_node, std::string param_name, Distribution* distribution ); - virtual int *GetIntVar(int *i_neuron, int n_neuron, - std::string var_name); + virtual int SetPortParamDistr( int* i_neuron, int n_node, std::string param_name, Distribution* distribution ); - virtual float *GetScalVar(int i_neuron, int n_neuron, - std::string var_name); + virtual int SetPortVarDistr( int i_neuron, int n_node, std::string var_name, Distribution* distribution ); - virtual float *GetScalVar(int *i_neuron, int n_neuron, - std::string var_name); + virtual int SetPortVarDistr( int* i_neuron, int n_node, std::string var_name, Distribution* distribution ); - virtual float *GetPortVar(int i_neuron, int n_neuron, - std::string var_name); + virtual float* GetScalParam( int i_neuron, int n_neuron, std::string param_name ); - virtual float *GetPortVar(int *i_neuron, int n_neuron, - std::string var_name); + virtual float* GetScalParam( int* i_neuron, int n_neuron, std::string param_name ); - virtual float *GetArrayVar(int i_neuron, std::string var_name); + virtual float* GetPortParam( int i_neuron, int n_neuron, std::string param_name ); - virtual int GetIntVarIdx(std::string var_name); - - virtual int GetScalVarIdx(std::string var_name); + virtual float* GetPortParam( int* i_neuron, int n_neuron, std::string param_name ); - virtual int GetPortVarIdx(std::string var_name); + virtual float* GetArrayParam( int i_neuron, std::string param_name ); - virtual int GetScalParamIdx(std::string param_name); + virtual float GetGroupParam( std::string param_name ); - virtual int 
GetPortParamIdx(std::string param_name); + virtual int* GetIntVar( int i_neuron, int n_neuron, std::string var_name ); - virtual float *GetVarArr(); + virtual int* GetIntVar( int* i_neuron, int n_neuron, std::string var_name ); - virtual float *GetParamArr(); + virtual float* GetScalVar( int i_neuron, int n_neuron, std::string var_name ); - virtual int GetArrayVarSize(int i_neuron, std::string var_name); - - virtual int GetArrayParamSize(int i_neuron, std::string param_name); + virtual float* GetScalVar( int* i_neuron, int n_neuron, std::string var_name ); - virtual int GetVarSize(std::string var_name); + virtual float* GetPortVar( int i_neuron, int n_neuron, std::string var_name ); - virtual int GetParamSize(std::string param_name); + virtual float* GetPortVar( int* i_neuron, int n_neuron, std::string var_name ); - virtual bool IsIntVar(std::string var_name); + virtual float* GetArrayVar( int i_neuron, std::string var_name ); - virtual bool IsScalVar(std::string var_name); + virtual int GetIntVarIdx( std::string var_name ); - virtual bool IsPortVar(std::string var_name); + virtual int GetScalVarIdx( std::string var_name ); - virtual bool IsArrayVar(std::string var_name); - - virtual bool IsScalParam(std::string param_name); + virtual int GetPortVarIdx( std::string var_name ); - virtual bool IsPortParam(std::string param_name); + virtual int GetScalParamIdx( std::string param_name ); - virtual bool IsArrayParam(std::string param_name); + virtual int GetPortParamIdx( std::string param_name ); - virtual bool IsGroupParam(std::string param_name); + virtual float* GetVarArr(); - int CheckNeuronIdx(int i_neuron); + virtual float* GetParamArr(); - int CheckPortIdx(int port); + virtual int GetArrayVarSize( int i_neuron, std::string var_name ); - virtual int *GetIntVarPt(int i_neuron, std::string var_name); - - virtual float *GetVarPt(int i_neuron, std::string var_name, int port=0); + virtual int GetArrayParamSize( int i_neuron, std::string param_name ); - virtual float 
*GetParamPt(int i_neuron, std::string param_name, - int port=0); - virtual float GetSpikeActivity(int i_neuron); + virtual int GetVarSize( std::string var_name ); - virtual int SendDirectSpikes(long long time_idx) {return 0;} + virtual int GetParamSize( std::string param_name ); - virtual std::vector GetIntVarNames(); + virtual bool IsIntVar( std::string var_name ); + + virtual bool IsScalVar( std::string var_name ); + + virtual bool IsPortVar( std::string var_name ); + + virtual bool IsArrayVar( std::string var_name ); + + virtual bool IsScalParam( std::string param_name ); + + virtual bool IsPortParam( std::string param_name ); + + virtual bool IsArrayParam( std::string param_name ); + + virtual bool IsGroupParam( std::string param_name ); + + int CheckNeuronIdx( int i_neuron ); + + int CheckPortIdx( int port ); + + virtual int* GetIntVarPt( int i_neuron, std::string var_name ); + + virtual float* GetVarPt( int i_neuron, std::string var_name, int port = 0 ); + + virtual float* GetParamPt( int i_neuron, std::string param_name, int port = 0 ); + virtual float GetSpikeActivity( int i_neuron ); + + virtual int + SendDirectSpikes( long long time_idx ) + { + return 0; + } + + virtual std::vector< std::string > GetIntVarNames(); virtual int GetNIntVar(); - - virtual std::vector GetScalVarNames(); - + + virtual std::vector< std::string > GetScalVarNames(); + virtual int GetNScalVar(); - virtual std::vector GetPortVarNames(); - + virtual std::vector< std::string > GetPortVarNames(); + virtual int GetNPortVar(); - virtual std::vector GetScalParamNames(); - + virtual std::vector< std::string > GetScalParamNames(); + virtual int GetNScalParam(); - virtual std::vector GetPortParamNames(); - + virtual std::vector< std::string > GetPortParamNames(); + virtual int GetNPortParam(); - virtual std::vector GetArrayVarNames(); - + virtual std::vector< std::string > GetArrayVarNames(); + virtual int GetNArrayVar(); - virtual std::vector GetArrayParamNames(); - + virtual std::vector< 
std::string > GetArrayParamNames(); + virtual int GetNArrayParam(); - virtual std::vector GetGroupParamNames(); - + virtual std::vector< std::string > GetGroupParamNames(); + virtual int GetNGroupParam(); virtual int ActivateSpikeCount(); - virtual int ActivateRecSpikeTimes(int max_n_rec_spike_times); + virtual int ActivateRecSpikeTimes( int max_n_rec_spike_times ); - virtual int GetNRecSpikeTimes(int i_neuron); + virtual int GetNRecSpikeTimes( int i_neuron ); virtual int BufferRecSpikeTimes(); - - virtual int GetRecSpikeTimes(int **n_spike_times_pt, float ***spike_times_pt); - virtual int SetRecSpikeTimesStep(int rec_spike_times_step); + virtual int GetRecSpikeTimes( int** n_spike_times_pt, float*** spike_times_pt ); + + virtual int SetRecSpikeTimesStep( int rec_spike_times_step ); - virtual float *GetExtNeuronInputSpikes(int *n_node, int *n_port); + virtual float* GetExtNeuronInputSpikes( int* n_node, int* n_port ); - virtual int SetNeuronGroupParam(std::string param_name, float val); + virtual int SetNeuronGroupParam( std::string param_name, float val ); }; #endif diff --git a/src/conn12b.cu b/src/conn12b.cu new file mode 100644 index 000000000..dd7f42c59 --- /dev/null +++ b/src/conn12b.cu @@ -0,0 +1,144 @@ +#include "conn12b.h" +#include "connect.h" + +#include + +__global__ void +setMaxNodeNBitsKernel( int max_node_nbits, + int max_port_syn_nbits, + int max_delay_nbits, + int max_port_nbits, + uint port_syn_mask, + uint delay_mask, + uint source_mask, + uint target_mask, + uint port_mask ) +{ + MaxNodeNBits = max_node_nbits; + MaxPortSynNBits = max_port_syn_nbits; + MaxDelayNBits = max_delay_nbits; + MaxPortNBits = max_port_nbits; + PortSynMask = port_syn_mask; + DelayMask = delay_mask; + SourceMask = source_mask; + TargetMask = target_mask; + PortMask = port_mask; +} + +__global__ void +setMaxSynNBitsKernel( int max_syn_nbits, int max_port_nbits, uint syn_mask, uint port_mask ) +{ + MaxSynNBits = max_syn_nbits; + MaxPortNBits = max_port_nbits; + SynMask = 
syn_mask; + PortMask = port_mask; +} + +// Set maximum number of bits used to represent node index +// and other dependent variables +template <> +int +ConnectionTemplate< conn12b_key, conn12b_struct >::_setMaxNodeNBits( int max_node_nbits ) +{ + // maximum number of bits used to represent node index + max_node_nbits_ = max_node_nbits; + + // maximum number of bits used to represent receptor port index + // and synapse group index + max_port_syn_nbits_ = 32 - max_node_nbits_; + + // maximum number of bits used to represent delays + max_delay_nbits_ = max_port_syn_nbits_; + + // maximum number of bits used to represent receptor port index + max_port_nbits_ = max_port_syn_nbits_ - max_syn_nbits_ - 1; + + // bit mask used to extract port and synapse group index + port_syn_mask_ = ( 1 << max_port_syn_nbits_ ) - 1; + + // bit mask used to extract delay + delay_mask_ = port_syn_mask_; + + // bit mask used to extract source node index + source_mask_ = ~delay_mask_; + + // bit mask used to extract target node index + target_mask_ = source_mask_; + + // bit mask used to extract port index + port_mask_ = ( ( 1 << max_port_nbits_ ) - 1 ) << ( max_syn_nbits_ + 1 ); + + // call CUDA kernel to initialize variables in device memory + setMaxNodeNBitsKernel<<< 1, 1 >>>( max_node_nbits_, + max_port_syn_nbits_, + max_delay_nbits_, + max_port_nbits_, + port_syn_mask_, + delay_mask_, + source_mask_, + target_mask_, + port_mask_ ); + + DBGCUDASYNC; + + return 0; +} + +// Set maximum number of bits used to represent delay +// and other dependent variables +template <> +int +ConnectionTemplate< conn12b_key, conn12b_struct >::_setMaxDelayNBits( int max_delay_nbits ) +{ + return _setMaxNodeNBits( 32 - max_delay_nbits ); +} + +// Set maximum number of bits used to represent synapse group index +// and other dependent variables +template <> +int +ConnectionTemplate< conn12b_key, conn12b_struct >::_setMaxSynNBits( int max_syn_nbits ) +{ + // maximum number of bits used to represent synapse 
group index + max_syn_nbits_ = max_syn_nbits; + + // maximum number of bits used to represent receptor port index + max_port_nbits_ = max_port_syn_nbits_ - max_syn_nbits_ - 1; + + // bit mask used to extract synapse group index + syn_mask_ = ( 1 << max_syn_nbits_ ) - 1; + + // bit mask used to extract port index + port_mask_ = ( ( 1 << max_port_nbits_ ) - 1 ) << ( max_syn_nbits_ + 1 ); + + // call CUDA kernel to initialize variables in device memory + setMaxSynNBitsKernel<<< 1, 1 >>>( max_syn_nbits_, max_port_nbits_, syn_mask_, port_mask_ ); + DBGCUDASYNC; + + return 0; +} + +template <> +void +ConnectionTemplate< conn12b_key, conn12b_struct >::setConnSource( conn12b_key& conn_key, inode_t source ) +{ + conn_key = ( conn_key & ( ~source_mask_ ) ) | ( source << max_delay_nbits_ ); +} + +template <> +int +ConnectionTemplate< conn12b_key, conn12b_struct >::getConnDelay( const conn12b_key& conn_key ) +{ + return conn_key & delay_mask_; +} + +template <> +ConnectionTemplate< conn12b_key, conn12b_struct >::ConnectionTemplate() +{ + // std::cout << "In ConnectionTemplate " + //"specialized constructor\n"; + init(); + _setMaxNodeNBits( 20 ); // maximum number of nodes is 2^20 + // std::cout << "max_node_nbits_: " << max_node_nbits_ << "\n"; + _setMaxSynNBits( 6 ); // maximum number of synapse groups is 2^6 +} diff --git a/src/conn12b.h b/src/conn12b.h new file mode 100644 index 000000000..11273119a --- /dev/null +++ b/src/conn12b.h @@ -0,0 +1,130 @@ +/* + * conn12b.h + * + * This file is part of NEST GPU. + * + * Copyright (C) 2021 The NEST Initiative + * + * NEST GPU is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version.
+ * + * NEST GPU is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with NEST GPU. If not, see . + * + */ + +#ifndef CONN12B_H +#define CONN12B_H +#include "connect.h" +#include "remote_connect.h" + +struct conn12b_struct +{ + uint target_port_syn; + float weight; +}; + +typedef uint conn12b_key; + +template <> +__device__ __forceinline__ void +setConnDelay< conn12b_key >( conn12b_key& conn_key, int delay ) +{ + conn_key = ( conn_key & ( ~DelayMask ) ) | delay; +} + +template <> +__device__ __forceinline__ void +setConnSource< conn12b_key >( conn12b_key& conn_key, inode_t source ) +{ + conn_key = ( conn_key & ( ~SourceMask ) ) | ( source << MaxDelayNBits ); +} + +template <> +__device__ __forceinline__ void +setConnTarget< conn12b_struct >( conn12b_struct& conn, inode_t target ) +{ + conn.target_port_syn = ( conn.target_port_syn & ( ~TargetMask ) ) | ( target << MaxPortSynNBits ); +} + +template <> +__device__ __forceinline__ void +setConnPort< conn12b_key, conn12b_struct >( conn12b_key& conn_key, conn12b_struct& conn, int port ) +{ + conn.target_port_syn = ( conn.target_port_syn & ( ~PortMask ) ) | ( port << ( MaxSynNBits + 1 ) ); +} + +template <> +__device__ __forceinline__ void +setConnSyn< conn12b_key, conn12b_struct >( conn12b_key& conn_key, conn12b_struct& conn, int syn ) +{ + conn.target_port_syn = ( conn.target_port_syn & ( ~SynMask ) ) | syn; +} + +template <> +__device__ __forceinline__ int +getConnDelay< conn12b_key >( const conn12b_key& conn_key ) +{ + return conn_key & DelayMask; +} + +template <> +__device__ __forceinline__ inode_t +getConnSource< conn12b_key >( conn12b_key& conn_key ) +{ + return ( conn_key & SourceMask ) >> MaxDelayNBits; +} + +template <> +__device__ __forceinline__ 
inode_t +getConnTarget< conn12b_struct >( conn12b_struct& conn ) +{ + return ( conn.target_port_syn & TargetMask ) >> MaxPortSynNBits; +} + +template <> +__device__ __forceinline__ int +getConnPort< conn12b_key, conn12b_struct >( conn12b_key& conn_key, conn12b_struct& conn ) +{ + return ( conn.target_port_syn & PortMask ) >> ( MaxSynNBits + 1 ); +} + +template <> +__device__ __forceinline__ int +getConnSyn< conn12b_key, conn12b_struct >( conn12b_key& conn_key, conn12b_struct& conn ) +{ + return conn.target_port_syn & SynMask; +} + +// TEMPORARY TO BE IMPROVED!!!! +template <> +__device__ __forceinline__ bool +getConnRemoteFlag< conn12b_key, conn12b_struct >( conn12b_key& conn_key, conn12b_struct& conn ) +{ + return ( conn.target_port_syn >> MaxSynNBits ) & ( uint ) 1; +} + +template <> +__device__ __forceinline__ void +clearConnRemoteFlag< conn12b_key, conn12b_struct >( conn12b_key& conn_key, conn12b_struct& conn ) +{ + conn.target_port_syn = conn.target_port_syn & ~( ( uint ) 1 << MaxSynNBits ); +} + +template <> +int ConnectionTemplate< conn12b_key, conn12b_struct >::_setMaxNodeNBits( int max_node_nbits ); + +template <> +int ConnectionTemplate< conn12b_key, conn12b_struct >::_setMaxDelayNBits( int max_delay_nbits ); + +template <> +int ConnectionTemplate< conn12b_key, conn12b_struct >::_setMaxSynNBits( int max_syn_nbits ); + +#endif diff --git a/src/conn16b.cu b/src/conn16b.cu new file mode 100644 index 000000000..ad442e930 --- /dev/null +++ b/src/conn16b.cu @@ -0,0 +1,131 @@ +#include "conn16b.h" +#include "connect.h" + +#include + +__global__ void +setMaxDelayNBits16bKernel( int max_delay_nbits, + int max_port_syn_nbits, + int max_port_nbits, + uint port_syn_mask, + uint delay_mask, + uint source_mask, + uint port_mask ) +{ + MaxDelayNBits = max_delay_nbits; + MaxPortSynNBits = max_port_syn_nbits; + MaxPortNBits = max_port_nbits; + PortSynMask = port_syn_mask; + DelayMask = delay_mask; + SourceMask = source_mask; + PortMask = port_mask; +} + +__global__ void 
+setMaxSynNBits16bKernel( int max_syn_nbits, int max_port_nbits, uint syn_mask, uint port_mask ) +{ + MaxSynNBits = max_syn_nbits; + MaxPortNBits = max_port_nbits; + SynMask = syn_mask; + PortMask = port_mask; +} + +// Set maximum number of bits used to represent node index +// and other dependent variables +template <> +int +ConnectionTemplate< conn16b_key, conn16b_struct >::_setMaxNodeNBits( int max_node_nbits ) +{ + std::cout << "Warning: number of bits representing node index is fixed " + "to 32 and cannot be modified with conn16b connection type"; + + return 0; +} + +// Set maximum number of bits used to represent delay +// and other dependent variables +template <> +int +ConnectionTemplate< conn16b_key, conn16b_struct >::_setMaxDelayNBits( int max_delay_nbits ) +{ + // maximum number of bits used to represent node index + max_delay_nbits_ = max_delay_nbits; + + // maximum number of bits used to represent receptor port index + // and synapse group index + max_port_syn_nbits_ = 32 - max_delay_nbits_; + + // maximum number of bits used to represent receptor port index + max_port_nbits_ = max_port_syn_nbits_ - max_syn_nbits_ - 1; + + // bit mask used to extract port and synapse group index + port_syn_mask_ = ( 1 << max_port_syn_nbits_ ) - 1; + + // bit mask used to extract delay + delay_mask_ = ( ( 1 << max_delay_nbits_ ) - 1 ) << max_port_syn_nbits_; + + // bit mask used to extract source node index + source_mask_ = 0xFFFFFFFF; + + // bit mask used to extract port index + port_mask_ = ( ( 1 << max_port_nbits_ ) - 1 ) << ( max_syn_nbits_ + 1 ); + + // call CUDA kernel to initialize variables in device memory + setMaxDelayNBits16bKernel<<< 1, 1 >>>( + max_delay_nbits_, max_port_syn_nbits_, max_port_nbits_, port_syn_mask_, delay_mask_, source_mask_, port_mask_ ); + + DBGCUDASYNC; + + return 0; +} + +// Set maximum number of bits used to represent synapse group index +// and other dependent variables +template <> +int +ConnectionTemplate< conn16b_key, conn16b_struct 
>::_setMaxSynNBits( int max_syn_nbits ) +{ + // maximum number of bits used to represent synapse group index + max_syn_nbits_ = max_syn_nbits; + + // maximum number of bits used to represent receptor port index + max_port_nbits_ = max_port_syn_nbits_ - max_syn_nbits_ - 1; + + // bit mask used to extract synapse group index + syn_mask_ = ( 1 << max_syn_nbits_ ) - 1; + + // bit mask used to extract port index + port_mask_ = ( ( 1 << max_port_nbits_ ) - 1 ) << ( max_syn_nbits_ + 1 ); + + // call CUDA kernel to initialize variables in device memory + setMaxSynNBits16bKernel<<< 1, 1 >>>( max_syn_nbits_, max_port_nbits_, syn_mask_, port_mask_ ); + DBGCUDASYNC; + + return 0; +} + +template <> +void +ConnectionTemplate< conn16b_key, conn16b_struct >::setConnSource( conn16b_key& conn_key, inode_t source ) +{ + conn_key = ( conn_key & 0xFFFFFFFF ) | ( ( uint64_t ) source << 32 ); +} + +template <> +int +ConnectionTemplate< conn16b_key, conn16b_struct >::getConnDelay( const conn16b_key& conn_key ) +{ + return ( int ) ( ( conn_key & delay_mask_ ) >> max_port_syn_nbits_ ); +} + +template <> +ConnectionTemplate< conn16b_key, conn16b_struct >::ConnectionTemplate() +{ + // std::cout << "In ConnectionTemplate " + //"specialized constructor\n"; + init(); + max_node_nbits_ = 31; + _setMaxDelayNBits( 14 ); // maximum number of bits for delays + // std::cout << "max_node_nbits_: " << max_node_nbits_ << "\n"; + _setMaxSynNBits( 10 ); // maximum number of synapse groups is 2^10 +} diff --git a/src/conn16b.h b/src/conn16b.h new file mode 100644 index 000000000..b6c8a9c55 --- /dev/null +++ b/src/conn16b.h @@ -0,0 +1,132 @@ +/* + * conn16b.h + * + * This file is part of NEST GPU. + * + * Copyright (C) 2021 The NEST Initiative + * + * NEST GPU is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version.
+ * + * NEST GPU is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with NEST GPU. If not, see . + * + */ + +#ifndef CONN16B_H +#define CONN16B_H +#include "connect.h" +#include "remote_connect.h" + +struct conn16b_struct +{ + uint target; + float weight; +}; + +// typedef uint conn16b_key; +typedef uint64_t conn16b_key; + +template <> +__device__ __forceinline__ void +setConnDelay< conn16b_key >( conn16b_key& conn_key, int delay ) +{ + conn_key = ( conn_key & ( ~( ( uint64_t ) DelayMask ) ) ) | ( delay << MaxPortSynNBits ); +} + +template <> +__device__ __forceinline__ void +setConnSource< conn16b_key >( conn16b_key& conn_key, inode_t source ) +{ + conn_key = ( conn_key & 0xFFFFFFFF ) | ( ( uint64_t ) source << 32 ); +} + +template <> +__device__ __forceinline__ void +setConnTarget< conn16b_struct >( conn16b_struct& conn, inode_t target ) +{ + conn.target = target; +} + +template <> +__device__ __forceinline__ void +setConnPort< conn16b_key, conn16b_struct >( conn16b_key& conn_key, conn16b_struct& conn, int port ) +{ + conn_key = ( conn_key & ( ( ~( ( uint64_t ) PortMask ) ) ) ) | ( port << ( MaxSynNBits + 1 ) ); +} + +template <> +__device__ __forceinline__ void +setConnSyn< conn16b_key, conn16b_struct >( conn16b_key& conn_key, conn16b_struct& conn, int syn ) +{ + conn_key = ( conn_key & ( ( ~( ( uint64_t ) SynMask ) ) ) ) | syn; +} + +template <> +__device__ __forceinline__ int +getConnDelay< conn16b_key >( const conn16b_key& conn_key ) +{ + return ( int ) ( ( conn_key & DelayMask ) >> MaxPortSynNBits ); +} + +template <> +__device__ __forceinline__ inode_t +getConnSource< conn16b_key >( conn16b_key& conn_key ) +{ + return ( inode_t ) ( conn_key >> 32 ); +} + +template <> +__device__ __forceinline__ 
inode_t +getConnTarget< conn16b_struct >( conn16b_struct& conn ) +{ + return conn.target; +} + +template <> +__device__ __forceinline__ int +getConnPort< conn16b_key, conn16b_struct >( conn16b_key& conn_key, conn16b_struct& conn ) +{ + return ( int ) ( ( conn_key & PortMask ) >> ( MaxSynNBits + 1 ) ); +} + +template <> +__device__ __forceinline__ int +getConnSyn< conn16b_key, conn16b_struct >( conn16b_key& conn_key, conn16b_struct& conn ) +{ + return ( int ) ( conn_key & SynMask ); +} + +// TEMPORARY TO BE IMPROVED!!!! +template <> +__device__ __forceinline__ bool +getConnRemoteFlag< conn16b_key, conn16b_struct >( conn16b_key& conn_key, conn16b_struct& conn ) +{ + return ( bool ) ( ( conn_key >> MaxSynNBits ) & 1 ); +} + +template <> +__device__ __forceinline__ void +clearConnRemoteFlag< conn16b_key, conn16b_struct >( conn16b_key& conn_key, conn16b_struct& conn ) +{ + conn_key = conn_key & ~( ( uint64_t ) 1 << MaxSynNBits ); +} + + +template <> +int ConnectionTemplate< conn16b_key, conn16b_struct >::_setMaxNodeNBits( int max_node_nbits ); + +template <> +int ConnectionTemplate< conn16b_key, conn16b_struct >::_setMaxDelayNBits( int max_delay_nbits ); + +template <> +int ConnectionTemplate< conn16b_key, conn16b_struct >::_setMaxSynNBits( int max_syn_nbits ); + +#endif diff --git a/src/connect.cu b/src/connect.cu index aaa6519c3..447141bda 100644 --- a/src/connect.cu +++ b/src/connect.cu @@ -20,1623 +20,247 @@ * */ -#include -#include -#include +// #include +// #include +#include "cuda_error.h" #include -#include #include -#include -#include +#include +#include #include #include -#include -#include "cuda_error.h" -#include "copass_kernels.h" -#include "copass_sort.h" -#include "distribution.h" +#include +#include +#include +// #include "copass_kernels.h" +// #include "copass_sort.h" +// #include "distribution.h" #include "connect.h" #include "nestgpu.h" -#include "utilities.h" +// #include "utilities.h" -//#define OPTIMIZE_FOR_MEMORY +// #define OPTIMIZE_FOR_MEMORY 
-extern __constant__ float NESTGPUTimeResolution; +// extern __constant__ float NESTGPUTimeResolution; bool print_sort_err = true; bool print_sort_cfr = false; bool compare_with_serial = false; uint last_i_sub = 0; -uint h_MaxNodeNBits; -__device__ uint MaxNodeNBits; -// maximum number of bits used to represent node index +// maximum number of bits used to represent node index +__device__ int MaxNodeNBits; -uint h_MaxPortSynNBits; -__device__ uint MaxPortSynNBits; -// maximum number of bits used to represent receptor port index and delays +// maximum number of bits used to represent delays +__device__ int MaxDelayNBits; -uint h_MaxSynNBits; -__device__ uint MaxSynNBits; +// maximum number of bits used to represent synapse group index +__device__ int MaxSynNBits; -uint h_PortSynMask; -__device__ uint PortSynMask; -// bit mask used to extract port index +// maximum number of bits used to represent receptor port index +__device__ int MaxPortNBits; + +// maximum number of bits used to represent receptor port index +// and synapse group index +__device__ int MaxPortSynNBits; -uint h_SynMask; +// bit mask used to extract source node index +__device__ uint SourceMask; + +// bit mask used to extract delay +__device__ uint DelayMask; + +// bit mask used to extract target node index +__device__ uint TargetMask; + +// bit mask used to extract synapse group index __device__ uint SynMask; +// bit mask used to extract port index +__device__ uint PortMask; + +// bit mask used to extract port and synapse group index +__device__ uint PortSynMask; -uint *d_ConnGroupIdx0; -__device__ uint *ConnGroupIdx0; // ig0 = ConnGroupIdx0[i_spike_buffer] is the index in the whole // connection-group array of the first connection group outgoing // from the node i_spike_buffer +__device__ iconngroup_t* ConnGroupIdx0; -int64_t *d_ConnGroupIConn0; -__device__ int64_t *ConnGroupIConn0; // i_conn0 = ConnGroupIConn0[ig] with ig = 0, ..., Ng // is the index in the whole connection array of the first 
connection // belonging to the connection group ig +__device__ int64_t* ConnGroupIConn0; -uint *d_ConnGroupDelay; -__device__ uint *ConnGroupDelay; // ConnGroupDelay[ig] // delay associated to all connections of the connection group ig // with ig = 0, ..., Ng +__device__ int* ConnGroupDelay; -uint tot_conn_group_num; - -int64_t NConn; // total number of connections in the whole network - -int64_t h_ConnBlockSize = 10000000; // 160000000; //50000000; +// size (i.e. number of connections) of connection blocks +// int64_t h_ConnBlockSize = 10000000; // 160000000; //50000000; __device__ int64_t ConnBlockSize; -// size (i.e. number of connections) of connection blocks - -uint h_MaxDelayNum; - -// it seems that there is no relevant advantage in using a constant array -// however better to keep this option ready and commented -std::vector KeySubarray; -uint** d_SourceDelayArray; -__device__ uint** SourceDelayArray; -//__constant__ uint* SourceDelayArray[1024]; // Array of source node indexes and delays of all connections // Source node indexes and delays are merged in a single integer variable -// The most significant MaxNodeNBits are used for the node index +// The most significant MaxNodeNBits are used for the node index // the others (less significant) bits are used to represent the delay // This array is used as a key array for sorting the connections // in ascending order according to the source node index // Connections from the same source node are sorted according to // the delay - // it seems that there is no relevant advantage in using a constant array // however better to keep this option ready and commented -std::vector ConnectionSubarray; -connection_struct** d_ConnectionArray; -__device__ connection_struct** ConnectionArray; -//__constant__ connection_struct* ConnectionArray[1024]; +//__constant__ uint* ConnKeyArray[1024]; +__device__ void* ConnKeyArray; + // array of target node indexes, receptor port index, synapse type, // weight of all connections // 
used as a value for key-value sorting of the connections (see above) +// it seems that there is no relevant advantage in using a constant array +// however better to keep this option ready and commented +//__constant__ connection_struct* ConnStructArray[1024]; +__device__ void* ConnStructArray; +__device__ unsigned short* ConnectionSpikeTime; -enum ConnectionFloatParamIndexes { - i_weight_param = 0, - i_delay_param, - N_CONN_FLOAT_PARAM -}; - -enum ConnectionIntParamIndexes { - i_source_param = 0, - i_target_param, - i_port_param, - i_syn_group_param, - N_CONN_INT_PARAM -}; - -const std::string ConnectionFloatParamName[N_CONN_FLOAT_PARAM] = { - "weight", - "delay" -}; - -const std::string ConnectionIntParamName[N_CONN_INT_PARAM] = { - "source", - "target", - "port", - "syn_group" -}; - - -__global__ void setConnGroupNum(int64_t n_compact, - uint *conn_group_num, - int64_t *conn_group_idx0_compact, - int *conn_group_source_compact) -{ - int64_t i_compact = threadIdx.x + blockIdx.x * blockDim.x; - if (i_compact>=n_compact) return; - int source = conn_group_source_compact[i_compact]; - uint num = (uint)(conn_group_idx0_compact[i_compact+1] - - conn_group_idx0_compact[i_compact]); - conn_group_num[source] = num; -} +const std::string ConnectionFloatParamName[ N_CONN_FLOAT_PARAM ] = { "weight", "delay" }; +const std::string ConnectionIntParamName[ N_CONN_INT_PARAM ] = { "source", "target", "port", "syn_group" }; -__global__ void setConnGroupIdx0Compact -(uint *key_subarray, int64_t n_block_conn, int *conn_group_idx0_mask, - int64_t *conn_group_iconn0_mask_cumul, int64_t *conn_group_idx0_mask_cumul, - int64_t *conn_group_idx0_compact, int *conn_group_source_compact, - int64_t *iconn0_offset, int64_t *idx0_offset) +__global__ void +setConnGroupNum( inode_t n_compact, + iconngroup_t* conn_group_num, + iconngroup_t* conn_group_idx0_compact, + inode_t* conn_group_source_compact ) { - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>n_block_conn) return; - 
if (i_conn> MaxPortSynNBits; - conn_group_source_compact[i_source_compact] = source; + inode_t i_compact = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_compact >= n_compact ) + { + return; } + inode_t source = conn_group_source_compact[ i_compact ]; + iconngroup_t num = conn_group_idx0_compact[ i_compact + 1 ] - conn_group_idx0_compact[ i_compact ]; + conn_group_num[ source ] = num; } - -__global__ void buildConnGroupMask(uint *key_subarray, - uint *key_subarray_prev, - int64_t n_block_conn, - int *conn_group_iconn0_mask, - int *conn_group_idx0_mask) +__global__ void +setConnGroupIConn0( int64_t n_block_conn, + int* conn_group_iconn0_mask, + iconngroup_t* conn_group_iconn0_mask_cumul, + int64_t* conn_group_iconn0, + int64_t i_conn0, + iconngroup_t* offset ) { int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_block_conn) return; - uint val = key_subarray[i_conn]; - int64_t prev_val; - int prev_source; - if (i_conn==0) { - if (key_subarray_prev != NULL) { - prev_val = *key_subarray_prev; - prev_source = prev_val >> MaxPortSynNBits; - } - else { - prev_val = -1; // just to ensure it is different from val - prev_source = -1; - } - } - else { - prev_val = key_subarray[i_conn-1]; - prev_source = prev_val >> MaxPortSynNBits; - } - if (val != prev_val) { - conn_group_iconn0_mask[i_conn] = 1; - int source = val >> MaxPortSynNBits; - if (source != prev_source) { - conn_group_idx0_mask[i_conn] = 1; - } - } -} - -__global__ void buildConnGroupIConn0Mask(uint *key_subarray, - uint *key_subarray_prev, - int64_t n_block_conn, - int *conn_group_iconn0_mask) -{ - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_block_conn) return; - uint val = key_subarray[i_conn]; - int64_t prev_val; - if (i_conn==0) { - if (key_subarray_prev != NULL) { - prev_val = *key_subarray_prev; - } - else { - prev_val = -1; // just to ensure it is different from val - } - } - else { - prev_val = key_subarray[i_conn-1]; + if ( i_conn >= n_block_conn ) + { + 
return; } - if (val != prev_val) { - conn_group_iconn0_mask[i_conn] = 1; - } -} - -__global__ void setConnGroupIConn0(int64_t n_block_conn, - int *conn_group_iconn0_mask, - int64_t *conn_group_iconn0_mask_cumul, - int64_t *conn_group_iconn0, int64_t i_conn0, - int64_t *offset) -{ - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_block_conn) return; - if (conn_group_iconn0_mask[i_conn] != 0) { - int64_t pos = conn_group_iconn0_mask_cumul[i_conn] + *offset; - conn_group_iconn0[pos] = i_conn0 + i_conn; + if ( conn_group_iconn0_mask[ i_conn ] != 0 ) + { + iconngroup_t pos = conn_group_iconn0_mask_cumul[ i_conn ] + *offset; + conn_group_iconn0[ pos ] = i_conn0 + i_conn; } } -__global__ void setConnGroupNewOffset(int64_t *offset, int64_t *add_offset) +__global__ void +connectCalibrateKernel( iconngroup_t* conn_group_idx0, + int64_t* conn_group_iconn0, + int* conn_group_delay, + int64_t block_size, + void* conn_key_array, + void* conn_struct_array, + unsigned short* conn_spike_time ) { - *offset = *offset + *add_offset; -} - - -__global__ void setWeights(connection_struct *conn_subarray, float weight, - int64_t n_conn) -{ - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) return; - conn_subarray[i_conn].weight = weight; -} - - -__global__ void setWeights(connection_struct *conn_subarray, float *arr_val, - int64_t n_conn) -{ - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) return; - conn_subarray[i_conn].weight = arr_val[i_conn]; -} - - -__global__ void setDelays(uint *key_subarray, float *arr_val, - int64_t n_conn, float time_resolution) -{ - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) return; - int delay = (int)round(arr_val[i_conn]/time_resolution); - delay = max(delay,1); - key_subarray[i_conn] = (key_subarray[i_conn] << MaxPortSynNBits) | delay; -} - - -__global__ void setDelays(uint *key_subarray, float fdelay, - int64_t n_conn, float time_resolution) 
-{ - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) return; - int delay = (int)round(fdelay/time_resolution); - delay = max(delay,1); - key_subarray[i_conn] = (key_subarray[i_conn] << MaxPortSynNBits) | delay; -} - - - -__global__ void setPort(connection_struct *conn_subarray, uint port, - int64_t n_conn) -{ - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) return; - conn_subarray[i_conn].target_port_syn = - (conn_subarray[i_conn].target_port_syn << MaxPortSynNBits) - | (port << MaxSynNBits); -} - - -__global__ void setSynGroup(connection_struct *conn_subarray, - unsigned char syn_group, - int64_t n_conn) -{ - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) return; - conn_subarray[i_conn].target_port_syn = - conn_subarray[i_conn].target_port_syn | syn_group; - //conn_subarray[i_conn].syn_group = syn_group; -} - - -__global__ void setPortSynGroup(connection_struct *conn_subarray, uint port, - unsigned char syn_group, - int64_t n_conn) -{ - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) return; - conn_subarray[i_conn].target_port_syn = - (conn_subarray[i_conn].target_port_syn << MaxPortSynNBits) - | (port << MaxSynNBits) | syn_group; -} - -__global__ void getConnGroupDelay(int64_t block_size, - uint **source_delay_array, - int64_t *conn_group_iconn0, - uint *conn_group_delay, - uint conn_group_num) -{ - uint conn_group_idx = threadIdx.x + blockIdx.x * blockDim.x; - if (conn_group_idx >= conn_group_num) return; - int64_t i_conn = conn_group_iconn0[conn_group_idx]; - uint i_block = (uint)(i_conn / block_size); - int64_t i_block_conn = i_conn % block_size; - uint source_delay = source_delay_array[i_block][i_block_conn]; - conn_group_delay[conn_group_idx] = source_delay & PortSynMask; -} - - -int freeConnectionKey(std::vector &key_subarray) -{ - for (uint ib=0; ib &key_subarray, - std::vector &conn_subarray, - int64_t block_size, uint new_n_block) -{ 
- // Allocating GPU memory for new connection blocks - // allocate new blocks if needed - for (uint ib=key_subarray.size(); ib= DISTR_TYPE_ARRAY // probability distribution - && syn_spec.weight_distr_ < N_DISTR_TYPE) { // or array - if (syn_spec.weight_distr_ == DISTR_TYPE_ARRAY) { - gpuErrchk(cudaMemcpy(d_storage, syn_spec.weight_h_array_pt_, - n_conn*sizeof(float), cudaMemcpyHostToDevice)); - } - else if (syn_spec.weight_distr_ == DISTR_TYPE_NORMAL_CLIPPED) { - CURAND_CALL(curandGenerateUniform(gen, (float*)d_storage, n_conn)); - randomNormalClipped((float*)d_storage, n_conn, syn_spec.weight_mu_, - syn_spec.weight_sigma_, syn_spec.weight_low_, - syn_spec.weight_high_); - } - else if (syn_spec.weight_distr_==DISTR_TYPE_NORMAL) { - float low = syn_spec.weight_mu_ - 5.0*syn_spec.weight_sigma_; - float high = syn_spec.weight_mu_ + 5.0*syn_spec.weight_sigma_; - CURAND_CALL(curandGenerateUniform(gen, (float*)d_storage, n_conn)); - randomNormalClipped((float*)d_storage, n_conn, syn_spec.weight_mu_, - syn_spec.weight_sigma_, low, high); - } - else { - throw ngpu_exception("Invalid connection weight distribution type"); - } - setWeights<<<(n_conn+1023)/1024, 1024>>> - (conn_subarray, (float*)d_storage, n_conn); - DBGCUDASYNC - } - else { - setWeights<<<(n_conn+1023)/1024, 1024>>> - (conn_subarray, syn_spec.weight_, n_conn); - DBGCUDASYNC - } - - return 0; -} - - -int setConnectionDelays(curandGenerator_t &gen, void *d_storage, - uint *key_subarray, int64_t n_conn, - SynSpec &syn_spec, float time_resolution) -{ - if (syn_spec.delay_distr_ >= DISTR_TYPE_ARRAY // probability distribution - && syn_spec.delay_distr_ < N_DISTR_TYPE) { // or array - if (syn_spec.delay_distr_ == DISTR_TYPE_ARRAY) { - gpuErrchk(cudaMemcpy(d_storage, syn_spec.delay_h_array_pt_, - n_conn*sizeof(float), cudaMemcpyHostToDevice)); - } - else if (syn_spec.delay_distr_ == DISTR_TYPE_NORMAL_CLIPPED) { - CURAND_CALL(curandGenerateUniform(gen, (float*)d_storage, n_conn)); - 
randomNormalClipped((float*)d_storage, n_conn, syn_spec.delay_mu_, - syn_spec.delay_sigma_, syn_spec.delay_low_, - syn_spec.delay_high_); - } - else if (syn_spec.delay_distr_ == DISTR_TYPE_NORMAL) { - float low = syn_spec.delay_mu_ - 5.0*syn_spec.delay_sigma_; - float high = syn_spec.delay_mu_ + 5.0*syn_spec.delay_sigma_; - CURAND_CALL(curandGenerateUniform(gen, (float*)d_storage, n_conn)); - randomNormalClipped((float*)d_storage, n_conn, syn_spec.delay_mu_, - syn_spec.delay_sigma_, syn_spec.delay_low_, - syn_spec.delay_high_); - } - else { - throw ngpu_exception("Invalid connection delay distribution type"); - } - - setDelays<<<(n_conn+1023)/1024, 1024>>> - (key_subarray, (float*)d_storage, n_conn, time_resolution); - DBGCUDASYNC - - } - else { - setDelays<<<(n_conn+1023)/1024, 1024>>> - (key_subarray, syn_spec.delay_, n_conn, time_resolution); - DBGCUDASYNC - } - return 0; -} - - -int organizeConnections(float time_resolution, uint n_node, int64_t n_conn, - int64_t block_size, - std::vector &key_subarray, - std::vector &conn_subarray) -{ - typedef uint key_t; - timeval startTV; - timeval endTV; - CUDASYNC - gettimeofday(&startTV, NULL); - - if (n_conn > 0) { - printf("Allocating auxiliary GPU memory...\n"); - int64_t sort_storage_bytes = 0; - void *d_sort_storage = NULL; - copass_sort::sort(key_subarray.data(), - conn_subarray.data(), n_conn, - block_size, d_sort_storage, - sort_storage_bytes); - printf("storage bytes: %ld\n", sort_storage_bytes); - CUDAMALLOCCTRL("&d_sort_storage",&d_sort_storage, sort_storage_bytes); - - printf("Sorting...\n"); - copass_sort::sort(key_subarray.data(), - conn_subarray.data(), n_conn, - block_size, d_sort_storage, - sort_storage_bytes); - CUDAFREECTRL("d_sort_storage",d_sort_storage); - - size_t storage_bytes = 0; - size_t storage_bytes1 = 0; - void *d_storage = NULL; - printf("Indexing connection groups...\n"); - // It is important to separate number of allocated blocks - // (determined by key_subarray.size()) from number of 
blocks - // on which there are connections, which is determined by n_conn - // number of used connection blocks - uint k = (n_conn - 1) / block_size + 1; - - // it seems that there is no relevant advantage in using a constant array - // however better to keep this option ready and commented - //gpuErrchk(cudaMemcpyToSymbol(SourceDelayArray, KeySubarray.data(), - // k*sizeof(uint*)));//, cudaMemcpyHostToDevice)); - //gpuErrchk(cudaMemcpyToSymbol(ConnectionArray, ConnectionSubarray.data(), - // k*sizeof(connection_struct*))); - //, cudaMemcpyHostToDevice)); - - CUDAMALLOCCTRL("&d_SourceDelayArray",&d_SourceDelayArray, - k*sizeof(uint*)); - gpuErrchk(cudaMemcpy(d_SourceDelayArray, KeySubarray.data(), - k*sizeof(uint*), cudaMemcpyHostToDevice)); - - CUDAMALLOCCTRL("&d_ConnectionArray",&d_ConnectionArray, - k*sizeof(connection_struct*)); - gpuErrchk(cudaMemcpy(d_ConnectionArray, ConnectionSubarray.data(), - k*sizeof(connection_struct*), cudaMemcpyHostToDevice)); - - ////////////////////////////////////////////////////////////////////// - - int *d_conn_group_iconn0_mask; - CUDAMALLOCCTRL("&d_conn_group_iconn0_mask", - &d_conn_group_iconn0_mask, - block_size*sizeof(int)); - - int64_t *d_conn_group_iconn0_mask_cumul; - CUDAMALLOCCTRL("&d_conn_group_iconn0_mask_cumul", - &d_conn_group_iconn0_mask_cumul, - (block_size+1)*sizeof(int64_t)); - - int *d_conn_group_idx0_mask; - CUDAMALLOCCTRL("&d_conn_group_idx0_mask", - &d_conn_group_idx0_mask, - block_size*sizeof(int)); - - int64_t *d_conn_group_idx0_mask_cumul; - CUDAMALLOCCTRL("&d_conn_group_idx0_mask_cumul", - &d_conn_group_idx0_mask_cumul, - (block_size+1)*sizeof(int64_t)); - - int64_t *d_conn_group_idx0_compact; - int64_t reserve_size = n_node>> - (key_subarray[ib], key_subarray_prev, n_block_conn, - d_conn_group_iconn0_mask); - CUDASYNC; - - key_subarray_prev = key_subarray[ib] + block_size - 1; - - if (ib==0) { - // Determine temporary device storage requirements for prefix sum - cub::DeviceScan::ExclusiveSum(NULL, 
storage_bytes, - d_conn_group_iconn0_mask, - d_conn_group_iconn0_mask_cumul, - n_block_conn+1); - // Allocate temporary storage for prefix sum - CUDAMALLOCCTRL("&d_storage",&d_storage, storage_bytes); - } - // Run exclusive prefix sum - cub::DeviceScan::ExclusiveSum(d_storage, storage_bytes, - d_conn_group_iconn0_mask, - d_conn_group_iconn0_mask_cumul, - n_block_conn+1); - - setConnGroupNewOffset<<<1, 1>>>(d_iconn0_offset, - d_conn_group_iconn0_mask_cumul - + n_block_conn); - - CUDASYNC; - - } - gpuErrchk(cudaMemcpy(&tot_conn_group_num, d_iconn0_offset, - sizeof(int64_t), cudaMemcpyDeviceToHost)); - printf("Total number of connection groups: %d\n", tot_conn_group_num); - - if (tot_conn_group_num > 0) { - uint *d_conn_group_num; - CUDAMALLOCCTRL("&d_conn_group_num", &d_conn_group_num, - n_node*sizeof(uint)); - gpuErrchk(cudaMemset(d_conn_group_num, 0, sizeof(uint))); - - uint *key_subarray_prev = NULL; - gpuErrchk(cudaMemset(d_iconn0_offset, 0, sizeof(int64_t))); - - CUDAMALLOCCTRL("&d_ConnGroupIConn0",&d_ConnGroupIConn0, - (tot_conn_group_num+1)*sizeof(int64_t)); - - int64_t n_compact = 0; - for (uint ib=0; ib>> - (key_subarray[ib], key_subarray_prev, n_block_conn, - d_conn_group_iconn0_mask, d_conn_group_idx0_mask); - CUDASYNC; - - key_subarray_prev = key_subarray[ib] + block_size - 1; - - // Run exclusive prefix sum - cub::DeviceScan::ExclusiveSum(d_storage, storage_bytes, - d_conn_group_iconn0_mask, - d_conn_group_iconn0_mask_cumul, - n_block_conn+1); - DBGCUDASYNC; - cub::DeviceScan::ExclusiveSum(d_storage, storage_bytes, - d_conn_group_idx0_mask, - d_conn_group_idx0_mask_cumul, - n_block_conn+1); - - DBGCUDASYNC; - int64_t i_conn0 = block_size*ib; - setConnGroupIConn0<<<(n_block_conn+1023)/1024, 1024>>> - (n_block_conn, d_conn_group_iconn0_mask, - d_conn_group_iconn0_mask_cumul, d_ConnGroupIConn0, - i_conn0, d_iconn0_offset); - CUDASYNC; - - setConnGroupIdx0Compact<<<(n_block_conn+1023)/1024, 1024>>> - (key_subarray[ib], n_block_conn, d_conn_group_idx0_mask, - 
d_conn_group_iconn0_mask_cumul, d_conn_group_idx0_mask_cumul, - d_conn_group_idx0_compact, d_conn_group_source_compact, - d_iconn0_offset, d_idx0_offset); - CUDASYNC; - - int64_t n_block_compact; - gpuErrchk(cudaMemcpy(&n_block_compact, d_conn_group_idx0_mask_cumul - + n_block_conn, - sizeof(int64_t), cudaMemcpyDeviceToHost)); - //std::cout << "number of nodes with outgoing connections " - //"in block " << ib << ": " << n_block_compact << "\n"; - n_compact += n_block_compact; - - setConnGroupNewOffset<<<1, 1>>>(d_iconn0_offset, - d_conn_group_iconn0_mask_cumul - + n_block_conn); - setConnGroupNewOffset<<<1, 1>>>(d_idx0_offset, - d_conn_group_idx0_mask_cumul - + n_block_conn); - CUDASYNC; - } - gpuErrchk(cudaMemcpy(d_ConnGroupIConn0+tot_conn_group_num, &NConn, - sizeof(int64_t), cudaMemcpyHostToDevice)); - - setConnGroupNum<<<(n_compact+1023)/1024, 1024>>> - (n_compact, d_conn_group_num, d_conn_group_idx0_compact, - d_conn_group_source_compact); - CUDASYNC; - - CUDAMALLOCCTRL("&d_ConnGroupIdx0", &d_ConnGroupIdx0, - (n_node+1)*sizeof(uint)); - storage_bytes1 = 0; - - // Determine temporary device storage requirements for prefix sum - cub::DeviceScan::ExclusiveSum(NULL, storage_bytes1, - d_conn_group_num, - d_ConnGroupIdx0, - n_node+1); - if (storage_bytes1 > storage_bytes) { - storage_bytes = storage_bytes1; - CUDAFREECTRL("d_storage",d_storage); - // Allocate temporary storage for prefix sum - CUDAMALLOCCTRL("&d_storage",&d_storage, storage_bytes); - } - // Run exclusive prefix sum - cub::DeviceScan::ExclusiveSum(d_storage, storage_bytes, - d_conn_group_num, - d_ConnGroupIdx0, - n_node+1); - - // find maxumum number of connection groups (delays) over all neurons - uint *d_max_delay_num; - CUDAMALLOCCTRL("&d_max_delay_num",&d_max_delay_num, sizeof(uint)); - - storage_bytes1 = 0; - // Determine temporary device storage requirements - cub::DeviceReduce::Max(NULL, storage_bytes1, - d_conn_group_num, d_max_delay_num, n_node); - if (storage_bytes1 > storage_bytes) { - 
storage_bytes = storage_bytes1; - CUDAFREECTRL("d_storage",d_storage); - // Allocate temporary storage for prefix sum - CUDAMALLOCCTRL("&d_storage",&d_storage, storage_bytes); - } - - // Run maximum search - cub::DeviceReduce::Max(d_storage, storage_bytes, - d_conn_group_num, d_max_delay_num, n_node); - - CUDASYNC; - gpuErrchk(cudaMemcpy(&h_MaxDelayNum, d_max_delay_num, - sizeof(uint), cudaMemcpyDeviceToHost)); - CUDAFREECTRL("d_max_delay_num",d_max_delay_num); - - printf("Maximum number of connection groups (delays) over all nodes: %d\n", - h_MaxDelayNum); - - - /////////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////////// - CUDAFREECTRL("d_storage",d_storage); // free temporary allocated storage - CUDAFREECTRL("d_conn_group_iconn0_mask",d_conn_group_iconn0_mask); - CUDAFREECTRL("d_conn_group_iconn0_mask_cumul",d_conn_group_iconn0_mask_cumul); - CUDAFREECTRL("d_iconn0_offset",d_iconn0_offset); - CUDAFREECTRL("d_conn_group_idx0_mask",d_conn_group_idx0_mask); - CUDAFREECTRL("d_conn_group_idx0_mask_cumul",d_conn_group_idx0_mask_cumul); - CUDAFREECTRL("d_idx0_offset",d_idx0_offset); - CUDAFREECTRL("d_conn_group_idx0_compact",d_conn_group_idx0_compact); - CUDAFREECTRL("d_conn_group_num",d_conn_group_num); - -#ifndef OPTIMIZE_FOR_MEMORY - CUDAMALLOCCTRL("&d_ConnGroupDelay",&d_ConnGroupDelay, - tot_conn_group_num*sizeof(uint)); - - getConnGroupDelay<<<(tot_conn_group_num+1023)/1024, 1024>>> - (block_size, d_SourceDelayArray, d_ConnGroupIConn0, d_ConnGroupDelay, - tot_conn_group_num); - DBGCUDASYNC -#endif - - } - else { - throw ngpu_exception("Number of connections groups must be positive " - "for number of connections > 0"); - } - } - else { - gpuErrchk(cudaMemset(d_ConnGroupIdx0, 0, (n_node+1)*sizeof(uint))); - h_MaxDelayNum = 0; - } - - gettimeofday(&endTV, NULL); - long time = (long)((endTV.tv_sec * 1000000.0 + endTV.tv_usec) - - (startTV.tv_sec * 1000000.0 + startTV.tv_usec)); - 
printf("%-40s%.2f ms\n", "Time: ", (double)time / 1000.); - printf("Done\n"); - - - return 0; -} - - -__global__ void ConnectInitKernel(uint *conn_group_idx0, - int64_t *conn_group_iconn0, - uint *conn_group_delay, - int64_t block_size, - uint **source_delay_array, - connection_struct **connection_array) -{ - ConnGroupIdx0 = conn_group_idx0; ConnGroupIConn0 = conn_group_iconn0; ConnGroupDelay = conn_group_delay; ConnBlockSize = block_size; - SourceDelayArray = source_delay_array; - ConnectionArray = connection_array; -} - -int ConnectInit() -{ - /* - uint k = ConnectionSubarray.size(); - uint **d_source_delay_array; - CUDAMALLOCCTRL("&d_source_delay_array",&d_source_delay_array, k*sizeof(uint*)); - gpuErrchk(cudaMemcpy(d_source_delay_array, KeySubarray.data(), - k*sizeof(uint*), cudaMemcpyHostToDevice)); - - connection_struct **d_connection_array; - CUDAMALLOCCTRL("&d_connection_array",&d_connection_array, k*sizeof(connection_struct*)); - gpuErrchk(cudaMemcpy(d_connection_array, ConnectionSubarray.data(), - k*sizeof(connection_struct*), cudaMemcpyHostToDevice)); - - */ - ConnectInitKernel<<<1,1>>>(d_ConnGroupIdx0, d_ConnGroupIConn0, - d_ConnGroupDelay, h_ConnBlockSize, - d_SourceDelayArray, - d_ConnectionArray); - DBGCUDASYNC - - return 0; -} - - -__global__ void setMaxNodeNBitsKernel(int max_node_nbits, - int max_port_syn_nbits, - int port_syn_mask) -{ - MaxNodeNBits = max_node_nbits; - MaxPortSynNBits = max_port_syn_nbits; - PortSynMask = port_syn_mask; -} - -__global__ void setMaxSynNBitsKernel(int max_syn_nbits, int syn_mask) -{ - MaxSynNBits = max_syn_nbits; - SynMask = syn_mask; -} - -int setMaxNodeNBits(int max_node_nbits) -{ - h_MaxNodeNBits = max_node_nbits; - h_MaxPortSynNBits = 32 - h_MaxNodeNBits; - h_PortSynMask = (1 << h_MaxPortSynNBits) - 1; - setMaxNodeNBitsKernel<<<1,1>>>(h_MaxNodeNBits, h_MaxPortSynNBits, - h_PortSynMask); - DBGCUDASYNC - - return 0; -} - -int setMaxSynNBits(int max_syn_nbits) -{ - h_MaxSynNBits = max_syn_nbits; - h_SynMask = (1 
<< h_MaxSynNBits) - 1; - setMaxSynNBitsKernel<<<1,1>>>(h_MaxSynNBits, h_SynMask); - DBGCUDASYNC - - return 0; -} - -int *sortArray(int *h_arr, int n_elem) -{ - // allocate unsorted and sorted array in device memory - int *d_arr_unsorted; - int *d_arr_sorted; - CUDAMALLOCCTRL("&d_arr_unsorted",&d_arr_unsorted, n_elem*sizeof(int)); - CUDAMALLOCCTRL("&d_arr_sorted",&d_arr_sorted, n_elem*sizeof(int)); - gpuErrchk(cudaMemcpy(d_arr_unsorted, h_arr, n_elem*sizeof(int), - cudaMemcpyHostToDevice)); - void *d_storage = NULL; - size_t storage_bytes = 0; - // Determine temporary storage requirements for sorting source indexes - cub::DeviceRadixSort::SortKeys(d_storage, storage_bytes, d_arr_unsorted, - d_arr_sorted, n_elem); - // Allocate temporary storage for sorting - CUDAMALLOCCTRL("&d_storage",&d_storage, storage_bytes); - // Run radix sort - cub::DeviceRadixSort::SortKeys(d_storage, storage_bytes, d_arr_unsorted, - d_arr_sorted, n_elem); - CUDAFREECTRL("d_storage",d_storage); - CUDAFREECTRL("d_arr_unsorted",d_arr_unsorted); - - return d_arr_sorted; -} - -__global__ void setSourceTargetIndexKernel(int64_t n_src_tgt, int n_source, - int n_target, int64_t *d_src_tgt_arr, - int *d_src_arr, int *d_tgt_arr) -{ - int64_t i_src_tgt = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_src_tgt >= n_src_tgt) return; - int i_src =(int)(i_src_tgt / n_target); - int i_tgt =(int)(i_src_tgt % n_target); - int src_id = d_src_arr[i_src]; - int tgt_id = d_tgt_arr[i_tgt]; - int64_t src_tgt_id = ((int64_t)src_id << 32) | tgt_id; - d_src_tgt_arr[i_src_tgt] = src_tgt_id; - //printf("i_src_tgt %lld\tsrc_id %d\ttgt_id %d\tsrc_tgt_id %lld\n", - // i_src_tgt, src_id, tgt_id, src_tgt_id); -} - -// Count number of connections per source-target couple -__global__ void CountConnectionsKernel(int64_t n_conn, int n_source, - int n_target, int64_t *src_tgt_arr, - int64_t *src_tgt_conn_num, - int syn_group) -{ - int64_t i_conn = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_conn >= n_conn) 
return; - - uint i_block = (uint)(i_conn / ConnBlockSize); - int64_t i_block_conn = i_conn % ConnBlockSize; - connection_struct conn = ConnectionArray[i_block][i_block_conn]; - // if (syn_group==-1 || conn.syn_group == syn_group) { - if (syn_group==-1 || (conn.target_port_syn & SynMask) == syn_group) { - // First get target node index - uint target_port_syn = conn.target_port_syn; - int i_target = target_port_syn >> MaxPortSynNBits; - uint source_delay = SourceDelayArray[i_block][i_block_conn]; - int i_source = source_delay >> MaxPortSynNBits; - int64_t i_src_tgt = ((int64_t)i_source << 32) | i_target; - int64_t i_arr = locate(i_src_tgt, src_tgt_arr, n_source*n_target); - if (src_tgt_arr[i_arr] == i_src_tgt) { - //printf("i_conn %lld i_source %d i_target %d i_src_tgt %lld " - // "i_arr %lld\n", i_conn, i_source, i_target, i_src_tgt, i_arr); - // (atomic)increase the number of connections for source-target couple - atomicAdd((unsigned long long *)&src_tgt_conn_num[i_arr], 1); - } - } -} - - -// Fill array of connection indexes -__global__ void SetConnectionsIndexKernel(int64_t n_conn, int n_source, - int n_target, int64_t *src_tgt_arr, - int64_t *src_tgt_conn_num, - int64_t *src_tgt_conn_cumul, - int syn_group, int64_t *conn_ids) -{ - int64_t i_conn = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_conn >= n_conn) return; - - uint i_block = (uint)(i_conn / ConnBlockSize); - int64_t i_block_conn = i_conn % ConnBlockSize; - connection_struct conn = ConnectionArray[i_block][i_block_conn]; - // if (syn_group==-1 || conn.syn_group == syn_group) { - if (syn_group==-1 || (conn.target_port_syn & SynMask) == syn_group) { - // First get target node index - uint target_port_syn = conn.target_port_syn; - int i_target = target_port_syn >> MaxPortSynNBits; - uint source_delay = SourceDelayArray[i_block][i_block_conn]; - int i_source = source_delay >> MaxPortSynNBits; - int64_t i_src_tgt = ((int64_t)i_source << 32) | i_target; - int64_t i_arr = locate(i_src_tgt, 
src_tgt_arr, n_source*n_target); - if (src_tgt_arr[i_arr] == i_src_tgt) { - //printf("i_conn %lld i_source %d i_target %d i_src_tgt %lld " - // "i_arr %lld\n", i_conn, i_source, i_target, i_src_tgt, i_arr); - // (atomic)increase the number of connections for source-target couple - int64_t pos = - atomicAdd((unsigned long long *)&src_tgt_conn_num[i_arr], 1); - //printf("pos %lld src_tgt_conn_cumul[i_arr] %lld\n", - // pos, src_tgt_conn_cumul[i_arr]); - conn_ids[src_tgt_conn_cumul[i_arr] + pos] = i_conn; - } - } -} - - -int64_t *NESTGPU::GetConnections(int *i_source_pt, int n_source, - int *i_target_pt, int n_target, - int syn_group, int64_t *n_conn) -{ - int64_t *h_conn_ids = NULL; - int64_t *d_conn_ids = NULL; - int64_t n_src_tgt = (int64_t)n_source * n_target; - int64_t n_conn_ids = 0; - - if (n_src_tgt > 0) { - // sort source node index array in GPU memory - int *d_src_arr = sortArray(i_source_pt, n_source); - // sort target node index array in GPU memory - int *d_tgt_arr = sortArray(i_target_pt, n_target); - // Allocate array of combined source-target indexes (src_arr x tgt_arr) - int64_t *d_src_tgt_arr; - CUDAMALLOCCTRL("&d_src_tgt_arr",&d_src_tgt_arr, n_src_tgt*sizeof(int64_t)); - // Fill it with combined source-target indexes - setSourceTargetIndexKernel<<<(n_src_tgt+1023)/1024, 1024>>> - (n_src_tgt, n_source, n_target, d_src_tgt_arr, d_src_arr, d_tgt_arr); - // Allocate array of number of connections per source-target couple - // and initialize it to 0 - int64_t *d_src_tgt_conn_num; - CUDAMALLOCCTRL("&d_src_tgt_conn_num",&d_src_tgt_conn_num, (n_src_tgt + 1)*sizeof(int64_t)); - gpuErrchk(cudaMemset(d_src_tgt_conn_num, 0, - (n_src_tgt + 1)*sizeof(int64_t))); - - // Count number of connections per source-target couple - CountConnectionsKernel<<<(NConn+1023)/1024, 1024>>> - (NConn, n_source, n_target, d_src_tgt_arr, d_src_tgt_conn_num, syn_group); - // Evaluate exclusive sum of connections per source-target couple - // Allocate array for cumulative sum - int64_t 
*d_src_tgt_conn_cumul; - CUDAMALLOCCTRL("&d_src_tgt_conn_cumul",&d_src_tgt_conn_cumul, - (n_src_tgt + 1)*sizeof(int64_t)); - // Determine temporary device storage requirements - void *d_storage = NULL; - size_t storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(d_storage, storage_bytes, - d_src_tgt_conn_num, - d_src_tgt_conn_cumul, - n_src_tgt + 1); - // Allocate temporary storage - CUDAMALLOCCTRL("&d_storage",&d_storage, storage_bytes); - // Run exclusive prefix sum - cub::DeviceScan::ExclusiveSum(d_storage, storage_bytes, - d_src_tgt_conn_num, - d_src_tgt_conn_cumul, - n_src_tgt + 1); - CUDAFREECTRL("d_storage",d_storage); - - // The last element is the total number of required connection Ids - cudaMemcpy(&n_conn_ids, &d_src_tgt_conn_cumul[n_src_tgt], - sizeof(int64_t), cudaMemcpyDeviceToHost); - - if (n_conn_ids > 0) { - // Allocate array of connection indexes - CUDAMALLOCCTRL("&d_conn_ids",&d_conn_ids, n_conn_ids*sizeof(int64_t)); - // Set number of connections per source-target couple to 0 again - gpuErrchk(cudaMemset(d_src_tgt_conn_num, 0, - (n_src_tgt + 1)*sizeof(int64_t))); - // Fill array of connection indexes - SetConnectionsIndexKernel<<<(NConn+1023)/1024, 1024>>> - (NConn, n_source, n_target, d_src_tgt_arr, d_src_tgt_conn_num, - d_src_tgt_conn_cumul, syn_group, d_conn_ids); - - /// check if allocating with new is more appropriate - h_conn_ids = (int64_t*)malloc(n_conn_ids*sizeof(int64_t)); - gpuErrchk(cudaMemcpy(h_conn_ids, d_conn_ids, - n_conn_ids*sizeof(int64_t), - cudaMemcpyDeviceToHost)); - - CUDAFREECTRL("d_src_tgt_arr",d_src_tgt_arr); - CUDAFREECTRL("d_src_tgt_conn_num",d_src_tgt_conn_num); - CUDAFREECTRL("d_src_tgt_conn_cumul",d_src_tgt_conn_cumul); - CUDAFREECTRL("d_conn_ids",d_conn_ids); - } - } - *n_conn = n_conn_ids; - - return h_conn_ids; -} - -////////////////////////////////////////////////////////////////////// -// CUDA Kernel that gets all parameters of an array of n_conn connections, -// identified by the indexes conn_ids[i], and puts 
them in the arrays -// i_source, i_target, port, syn_group, delay, weight -////////////////////////////////////////////////////////////////////// -__global__ void GetConnectionStatusKernel -(int64_t *conn_ids, int64_t n_conn, int *i_source, int *i_target, - int *port, unsigned char *syn_group, float *delay, float *weight) -{ - int64_t i_arr = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_arr >= n_conn) return; - - // get connection index, connection block index and index within block - int64_t i_conn = conn_ids[i_arr]; - uint i_block = (uint)(i_conn / ConnBlockSize); - int64_t i_block_conn = i_conn % ConnBlockSize; - // get connection structure - connection_struct conn = ConnectionArray[i_block][i_block_conn]; - // Get joined target-port parameter, then target index and port index - uint target_port_syn = conn.target_port_syn; - i_target[i_arr] = target_port_syn >> MaxPortSynNBits; - port[i_arr] = (target_port_syn & PortSynMask) >> MaxSynNBits; - // Get weight and synapse group - weight[i_arr] = conn.weight; - syn_group[i_arr] = target_port_syn & SynMask; - // Get joined source-delay parameter, then source index and delay - uint source_delay = SourceDelayArray[i_block][i_block_conn]; - i_source[i_arr] = source_delay >> MaxPortSynNBits; - int i_delay = source_delay & PortSynMask; - delay[i_arr] = NESTGPUTimeResolution * i_delay; -} - -////////////////////////////////////////////////////////////////////// -// CUDA Kernel that gets a float parameter of an array of n_conn connections, -// identified by the indexes conn_ids[i], and puts it in the array -// param_arr -////////////////////////////////////////////////////////////////////// -__global__ void GetConnectionFloatParamKernel -(int64_t *conn_ids, int64_t n_conn, float *param_arr, int i_param) -{ - int64_t i_arr = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_arr >= n_conn) return; - - // get connection index, connection block index and index within block - int64_t i_conn = conn_ids[i_arr]; - 
uint i_block = (uint)(i_conn / ConnBlockSize); - int64_t i_block_conn = i_conn % ConnBlockSize; - // get connection structure - connection_struct conn = ConnectionArray[i_block][i_block_conn]; - switch (i_param) { - case i_weight_param: { - param_arr[i_arr] = conn.weight; - break; - } - case i_delay_param: { - // Get joined source-delay parameter, then delay - uint source_delay = SourceDelayArray[i_block][i_block_conn]; - int i_delay = source_delay & PortSynMask; - param_arr[i_arr] = NESTGPUTimeResolution * i_delay; - break; - } - } -} - -////////////////////////////////////////////////////////////////////// -// CUDA Kernel that gets an integer parameter of an array of n_conn connections, -// identified by the indexes conn_ids[i], and puts it in the array -// param_arr -////////////////////////////////////////////////////////////////////// -__global__ void GetConnectionIntParamKernel -(int64_t *conn_ids, int64_t n_conn, int *param_arr, int i_param) -{ - int64_t i_arr = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_arr >= n_conn) return; - - // get connection index, connection block index and index within block - int64_t i_conn = conn_ids[i_arr]; - uint i_block = (uint)(i_conn / ConnBlockSize); - int64_t i_block_conn = i_conn % ConnBlockSize; - // get connection structure - connection_struct conn = ConnectionArray[i_block][i_block_conn]; - switch (i_param) { - case i_source_param: { - // Get joined source-delay parameter, then source index and delay - uint source_delay = SourceDelayArray[i_block][i_block_conn]; - param_arr[i_arr] = source_delay >> MaxPortSynNBits; - break; - } - case i_target_param: { - // Get joined target-port parameter, then target index - param_arr[i_arr] = conn.target_port_syn >> MaxPortSynNBits; - break; - } - case i_port_param: { - // Get joined target-port parameter, then port index - param_arr[i_arr] = (conn.target_port_syn & PortSynMask) >> MaxSynNBits; - break; - } - case i_syn_group_param: { - // Get synapse group - 
param_arr[i_arr] = conn.target_port_syn & SynMask; - break; - } - } -} - -////////////////////////////////////////////////////////////////////// -// CUDA Kernel that sets a float parameter of an array of n_conn connections, -// identified by the indexes conn_ids[i], using values from the array -// param_arr -////////////////////////////////////////////////////////////////////// -__global__ void SetConnectionFloatParamKernel -(int64_t *conn_ids, int64_t n_conn, float *param_arr, int i_param) -{ - int64_t i_arr = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_arr >= n_conn) return; - - // get connection index, connection block index and index within block - int64_t i_conn = conn_ids[i_arr]; - uint i_block = (uint)(i_conn / ConnBlockSize); - int64_t i_block_conn = i_conn % ConnBlockSize; - // get connection structure - connection_struct *conn = &ConnectionArray[i_block][i_block_conn]; - switch (i_param) { - case i_weight_param: { - conn->weight = param_arr[i_arr]; - break; - } - } -} - -////////////////////////////////////////////////////////////////////// -// CUDA Kernel that sets a float parameter of an array of n_conn connections, -// identified by the indexes conn_ids[i], to the value val -////////////////////////////////////////////////////////////////////// -__global__ void SetConnectionFloatParamKernel -(int64_t *conn_ids, int64_t n_conn, float val, int i_param) -{ - int64_t i_arr = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_arr >= n_conn) return; - - // get connection index, connection block index and index within block - int64_t i_conn = conn_ids[i_arr]; - uint i_block = (uint)(i_conn / ConnBlockSize); - int64_t i_block_conn = i_conn % ConnBlockSize; - // get connection structure - connection_struct *conn = &ConnectionArray[i_block][i_block_conn]; - switch (i_param) { - case i_weight_param: { - conn->weight = val; - break; - } - } -} - -////////////////////////////////////////////////////////////////////// -// CUDA Kernel that sets an 
integer parameter of an array of n_conn connections, -// identified by the indexes conn_ids[i], using values from the array -// param_arr -////////////////////////////////////////////////////////////////////// -__global__ void SetConnectionIntParamKernel -(int64_t *conn_ids, int64_t n_conn, int *param_arr, int i_param) -{ - int64_t i_arr = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_arr >= n_conn) return; - - // get connection index, connection block index and index within block - int64_t i_conn = conn_ids[i_arr]; - uint i_block = (uint)(i_conn / ConnBlockSize); - int64_t i_block_conn = i_conn % ConnBlockSize; - // get connection structure - connection_struct *conn = &ConnectionArray[i_block][i_block_conn]; - switch (i_param) { - case i_target_param: { - // Get port index from joined target-port parameter - int i_port_syn = conn->target_port_syn & PortSynMask; - // Set joined target-port parameter - conn->target_port_syn = (param_arr[i_arr] << MaxPortSynNBits) | i_port_syn; - break; - } - case i_port_param: { - // Get target index from joined target-port parameter - int i_target_syn = conn->target_port_syn & (~PortSynMask | SynMask); - // Set joined target-port parameter - conn->target_port_syn = (param_arr[i_arr] << MaxSynNBits) | i_target_syn; - break; - } - case i_syn_group_param: { - int i_target_port = conn->target_port_syn & (~SynMask); - // Set synapse group - conn->target_port_syn = param_arr[i_arr] | i_target_port; - break; - } - } + ConnKeyArray = conn_key_array; + ConnStructArray = conn_struct_array; + ConnectionSpikeTime = conn_spike_time; +} + +__global__ void +setSourceTargetIndexKernel( uint64_t n_src_tgt, + uint n_source, + uint n_target, + uint64_t* d_src_tgt_arr, + uint* d_src_arr, + uint* d_tgt_arr ) +{ + uint64_t i_src_tgt = ( uint64_t ) blockIdx.x * blockDim.x + threadIdx.x; + if ( i_src_tgt >= n_src_tgt ) + { + return; + } + uint i_src = ( uint ) ( i_src_tgt / n_target ); + uint i_tgt = ( uint ) ( i_src_tgt % n_target ); + uint 
src_id = d_src_arr[ i_src ]; + uint tgt_id = d_tgt_arr[ i_tgt ]; + uint64_t src_tgt_id = ( ( uint64_t ) src_id << 32 ) | tgt_id; + d_src_tgt_arr[ i_src_tgt ] = src_tgt_id; + // printf("i_src_tgt %lld\tsrc_id %d\ttgt_id %d\tsrc_tgt_id %lld\n", + // i_src_tgt, src_id, tgt_id, src_tgt_id); } -////////////////////////////////////////////////////////////////////// -// CUDA Kernel that sets an integer parameter of an array of n_conn connections, -// identified by the indexes conn_ids[i], to the value val -////////////////////////////////////////////////////////////////////// -__global__ void SetConnectionIntParamKernel -(int64_t *conn_ids, int64_t n_conn, int val, int i_param) -{ - int64_t i_arr = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_arr >= n_conn) return; - - // get connection index, connection block index and index within block - int64_t i_conn = conn_ids[i_arr]; - uint i_block = (uint)(i_conn / ConnBlockSize); - int64_t i_block_conn = i_conn % ConnBlockSize; - // get connection structure - connection_struct *conn = &ConnectionArray[i_block][i_block_conn]; - switch (i_param) { - case i_target_param: { - // Get port index from joined target-port parameter - int i_port_syn = conn->target_port_syn & PortSynMask; - // Set joined target-port parameter - conn->target_port_syn = (val << MaxPortSynNBits) | i_port_syn; - break; - } - case i_port_param: { - // Get target index from joined target-port parameter - int i_target_syn = conn->target_port_syn & (~PortSynMask | SynMask); - // Set joined target-port parameter - conn->target_port_syn = (val << MaxSynNBits) | i_target_syn; - break; - } - case i_syn_group_param: { - // Set synapse group - int i_target_port = conn->target_port_syn & (~SynMask); - // Set synapse group - conn->target_port_syn = val | i_target_port; - break; - } - } -} - - -////////////////////////////////////////////////////////////////////// -// Get all parameters of an array of n_conn connections, identified by -// the indexes conn_ids[i], 
and put them in the arrays -// i_source, i_target, port, syn_group, delay, weight -// NOTE: host arrays should be pre-allocated to store n_conn elements -////////////////////////////////////////////////////////////////////// -int NESTGPU::GetConnectionStatus(int64_t *conn_ids, int64_t n_conn, - int *i_source, int *i_target, int *port, - unsigned char *syn_group, float *delay, - float *weight) -{ - if (n_conn > 0) { - // declare pointers to arrays in device memory - int64_t *d_conn_ids; - int *d_source; - int *d_target; - int *d_port; - unsigned char *d_syn_group; - float *d_delay; - float *d_weight; - - // allocate array of connection ids in device memory - // and copy the ids from host to device array - CUDAMALLOCCTRL("&d_conn_ids",&d_conn_ids, n_conn*sizeof(int64_t)); - gpuErrchk(cudaMemcpy(d_conn_ids, conn_ids, n_conn*sizeof(int64_t), - cudaMemcpyHostToDevice)); - - // allocate arrays of connection parameters in device memory - CUDAMALLOCCTRL("&d_source",&d_source, n_conn*sizeof(int)); - CUDAMALLOCCTRL("&d_target",&d_target, n_conn*sizeof(int)); - CUDAMALLOCCTRL("&d_port",&d_port, n_conn*sizeof(int)); - CUDAMALLOCCTRL("&d_syn_group",&d_syn_group, n_conn*sizeof(unsigned char)); - CUDAMALLOCCTRL("&d_delay",&d_delay, n_conn*sizeof(float)); - CUDAMALLOCCTRL("&d_weight",&d_weight, n_conn*sizeof(float)); - // host arrays - - // launch kernel to get connection parameters - GetConnectionStatusKernel<<<(n_conn+1023)/1024, 1024 >>> - (d_conn_ids, n_conn, d_source, d_target, d_port, d_syn_group, - d_delay, d_weight); - - // copy connection parameters from device to host memory - gpuErrchk(cudaMemcpy(i_source, d_source, n_conn*sizeof(int), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(i_target, d_target, n_conn*sizeof(int), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(port, d_port, n_conn*sizeof(int), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(syn_group, d_syn_group, - n_conn*sizeof(unsigned char), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(delay, 
d_delay, n_conn*sizeof(float), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(weight, d_weight, n_conn*sizeof(float), - cudaMemcpyDeviceToHost)); - } - - return 0; -} - - // Get the index of the connection float parameter param_name // if param_name is not a float parameter, return -1 -int NESTGPU::GetConnectionFloatParamIndex(std::string param_name) +int +Connection::getConnectionFloatParamIndex( std::string param_name ) { - for (int i=0; i=0 ) { + if ( getConnectionFloatParamIndex( param_name ) >= 0 ) + { return 1; } - else { + else + { return 0; } } // Check if param_name is a connection integer parameter -int NESTGPU::IsConnectionIntParam(std::string param_name) +int +Connection::isConnectionIntParam( std::string param_name ) { - if (GetConnectionIntParamIndex(param_name) >=0 ) { + if ( getConnectionIntParamIndex( param_name ) >= 0 ) + { return 1; } - else { + else + { return 0; } } - -////////////////////////////////////////////////////////////////////// -// Get the float parameter param_name of an array of n_conn connections, -// identified by the indexes conn_ids[i], and put it in the array -// h_param_arr -// NOTE: host array should be pre-allocated to store n_conn elements -////////////////////////////////////////////////////////////////////// -int NESTGPU::GetConnectionFloatParam(int64_t *conn_ids, int64_t n_conn, - float *h_param_arr, - std::string param_name) -{ - // Check if param_name is a connection float parameter - int i_param = GetConnectionFloatParamIndex(param_name); - if (i_param < 0) { - throw ngpu_exception(std::string("Unrecognized connection float parameter ") - + param_name); - } - if (n_conn > 0) { - // declare pointers to arrays in device memory - int64_t *d_conn_ids; - float *d_arr; - // allocate array of connection ids in device memory - // and copy the ids from host to device array - CUDAMALLOCCTRL("&d_conn_ids",&d_conn_ids, n_conn*sizeof(int64_t)); - gpuErrchk(cudaMemcpy(d_conn_ids, conn_ids, n_conn*sizeof(int64_t), - 
cudaMemcpyHostToDevice)); - - // allocate connection parameter array in device memory - CUDAMALLOCCTRL("&d_arr",&d_arr, n_conn*sizeof(float)); - - // launch kernel to get connection parameters - GetConnectionFloatParamKernel<<<(n_conn+1023)/1024, 1024 >>> - (d_conn_ids, n_conn, d_arr, i_param); - - // copy connection parameter array from device to host memory - gpuErrchk(cudaMemcpy(h_param_arr, d_arr, n_conn*sizeof(float), - cudaMemcpyDeviceToHost)); - // free allocated device memory - CUDAFREECTRL("d_conn_ids",d_conn_ids); - CUDAFREECTRL("d_arr",d_arr); - } - - return 0; -} - -////////////////////////////////////////////////////////////////////// -// Get the integer parameter param_name of an array of n_conn connections, -// identified by the indexes conn_ids[i], and put it in the array -// h_param_arr -// NOTE: host array should be pre-allocated to store n_conn elements -////////////////////////////////////////////////////////////////////// -int NESTGPU::GetConnectionIntParam(int64_t *conn_ids, int64_t n_conn, - int *h_param_arr, - std::string param_name) -{ - // Check if param_name is a connection integer parameter - int i_param = GetConnectionIntParamIndex(param_name); - if (i_param < 0) { - throw ngpu_exception(std::string("Unrecognized connection " - "integer parameter ") + param_name); - } - if (n_conn > 0) { - // declare pointers to arrays in device memory - int64_t *d_conn_ids; - int *d_arr; - // allocate array of connection ids in device memory - // and copy the ids from host to device array - CUDAMALLOCCTRL("&d_conn_ids",&d_conn_ids, n_conn*sizeof(int64_t)); - gpuErrchk(cudaMemcpy(d_conn_ids, conn_ids, n_conn*sizeof(int64_t), - cudaMemcpyHostToDevice)); - - // allocate connection parameter array in device memory - CUDAMALLOCCTRL("&d_arr",&d_arr, n_conn*sizeof(int)); - - // launch kernel to get connection parameters - GetConnectionIntParamKernel<<<(n_conn+1023)/1024, 1024 >>> - (d_conn_ids, n_conn, d_arr, i_param); - - // copy connection parameter array 
from device to host memory - gpuErrchk(cudaMemcpy(h_param_arr, d_arr, n_conn*sizeof(int), - cudaMemcpyDeviceToHost)); - // free allocated device memory - CUDAFREECTRL("d_conn_ids",d_conn_ids); - CUDAFREECTRL("d_arr",d_arr); - } - - return 0; -} - - -////////////////////////////////////////////////////////////////////// -// Set the float parameter param_name of an array of n_conn connections, -// identified by the indexes conn_ids[i], using values from a distribution -// or from an array -////////////////////////////////////////////////////////////////////// -int NESTGPU::SetConnectionFloatParamDistr(int64_t *conn_ids, int64_t n_conn, - std::string param_name) -{ - // Check if param_name is a connection float parameter - int i_param = GetConnectionFloatParamIndex(param_name); - if (i_param < 0) { - throw ngpu_exception(std::string("Unrecognized connection float parameter ") - + param_name); - } - if (i_param == i_delay_param) { - throw ngpu_exception("Connection delay cannot be modified"); - } - - if (n_conn > 0) { - // declare pointers to arrays in device memory - int64_t *d_conn_ids; - // allocate array of connection ids in device memory - // and copy the ids from host to device array - CUDAMALLOCCTRL("&d_conn_ids",&d_conn_ids, n_conn*sizeof(int64_t)); - gpuErrchk(cudaMemcpy(d_conn_ids, conn_ids, n_conn*sizeof(int64_t), - cudaMemcpyHostToDevice)); - - // get values from array or distribution - float *d_arr = distribution_->getArray - (conn_random_generator_[this_host_][this_host_], n_conn); - // launch kernel to set connection parameters - SetConnectionFloatParamKernel<<<(n_conn+1023)/1024, 1024 >>> - (d_conn_ids, n_conn, d_arr, i_param); - // free allocated device memory - CUDAFREECTRL("d_conn_ids",d_conn_ids); - CUDAFREECTRL("d_arr",d_arr); - } - - return 0; -} - -////////////////////////////////////////////////////////////////////// -// Set the float parameter param_name of an array of n_conn connections, -// identified by the indexes conn_ids[i], to the value 
val -////////////////////////////////////////////////////////////////////// -int NESTGPU::SetConnectionFloatParam(int64_t *conn_ids, int64_t n_conn, - float val, - std::string param_name) -{ - // Check if param_name is a connection float parameter - int i_param = GetConnectionFloatParamIndex(param_name); - if (i_param < 0) { - throw ngpu_exception(std::string("Unrecognized connection float parameter ") - + param_name); - } - if (i_param == i_delay_param) { - throw ngpu_exception("Connection delay cannot be modified"); - } - - if (n_conn > 0) { - // declare pointers to arrays in device memory - int64_t *d_conn_ids; - // allocate array of connection ids in device memory - // and copy the ids from host to device array - CUDAMALLOCCTRL("&d_conn_ids",&d_conn_ids, n_conn*sizeof(int64_t)); - gpuErrchk(cudaMemcpy(d_conn_ids, conn_ids, n_conn*sizeof(int64_t), - cudaMemcpyHostToDevice)); - - // launch kernel to set connection parameters - SetConnectionFloatParamKernel<<<(n_conn+1023)/1024, 1024 >>> - (d_conn_ids, n_conn, val, i_param); - // free allocated device memory - CUDAFREECTRL("d_conn_ids",d_conn_ids); - } - - return 0; -} - - -////////////////////////////////////////////////////////////////////// -// Set the integer parameter param_name of an array of n_conn connections, -// identified by the indexes conn_ids[i], using the values from the array -// h_param_arr -////////////////////////////////////////////////////////////////////// -int NESTGPU::SetConnectionIntParamArr(int64_t *conn_ids, int64_t n_conn, - int *h_param_arr, - std::string param_name) -{ - // Check if param_name is a connection int parameter - int i_param = GetConnectionIntParamIndex(param_name); - if (i_param < 0) { - throw ngpu_exception(std::string("Unrecognized connection int parameter ") - + param_name); - } - if (i_param == i_source_param) { - throw ngpu_exception("Connection source node cannot be modified"); - } - - if (n_conn > 0) { - // declare pointers to arrays in device memory - int64_t 
*d_conn_ids; - int *d_arr; - // allocate array of connection ids in device memory - // and copy the ids from host to device array - CUDAMALLOCCTRL("&d_conn_ids",&d_conn_ids, n_conn*sizeof(int64_t)); - gpuErrchk(cudaMemcpy(d_conn_ids, conn_ids, n_conn*sizeof(int64_t), - cudaMemcpyHostToDevice)); - - // allocate connection parameter array in device memory - CUDAMALLOCCTRL("&d_arr",&d_arr, n_conn*sizeof(int)); - - // copy connection parameter array from host to device memory - gpuErrchk(cudaMemcpy(d_arr, h_param_arr, n_conn*sizeof(int), - cudaMemcpyHostToDevice)); - - // launch kernel to set connection parameters - SetConnectionIntParamKernel<<<(n_conn+1023)/1024, 1024 >>> - (d_conn_ids, n_conn, d_arr, i_param); - // free allocated device memory - CUDAFREECTRL("d_conn_ids",d_conn_ids); - CUDAFREECTRL("d_arr",d_arr); - - } - - return 0; -} - -////////////////////////////////////////////////////////////////////// -// Set the int parameter param_name of an array of n_conn connections, -// identified by the indexes conn_ids[i], to the value val -////////////////////////////////////////////////////////////////////// -int NESTGPU::SetConnectionIntParam(int64_t *conn_ids, int64_t n_conn, - int val, std::string param_name) -{ - // Check if param_name is a connection float parameter - int i_param = GetConnectionIntParamIndex(param_name); - if (i_param < 0) { - throw ngpu_exception(std::string("Unrecognized connection int parameter ") - + param_name); - } - if (i_param == i_source_param) { - throw ngpu_exception("Connection source node cannot be modified"); - } - - if (n_conn > 0) { - // declare pointers to arrays in device memory - int64_t *d_conn_ids; - // allocate array of connection ids in device memory - // and copy the ids from host to device array - CUDAMALLOCCTRL("&d_conn_ids",&d_conn_ids, n_conn*sizeof(int64_t)); - gpuErrchk(cudaMemcpy(d_conn_ids, conn_ids, n_conn*sizeof(int64_t), - cudaMemcpyHostToDevice)); - - // launch kernel to set connection parameters - 
SetConnectionIntParamKernel<<<(n_conn+1023)/1024, 1024 >>> - (d_conn_ids, n_conn, val, i_param); - // free allocated device memory - CUDAFREECTRL("d_conn_ids",d_conn_ids); - } - - return 0; -} - diff --git a/src/connect.h b/src/connect.h index 90397814a..768e9fd18 100644 --- a/src/connect.h +++ b/src/connect.h @@ -23,629 +23,4002 @@ #ifndef CONNECT_H #define CONNECT_H +// The following line must be skipped by clang-tidy to avoid errors +// which are not related to our code but to the CUB CUDA library +//// +#include +//// + #include +#include +#include +#include #include -#include "cuda_error.h" -#include "connect_spec.h" -#include "nestgpu.h" +#include "connect_spec.h" +#include "copass_kernels.h" +#include "copass_sort.h" +#include "cuda_error.h" +// #include "nestgpu.h" +#include "distribution.h" +#include "node_group.h" +#include "utilities.h" + +typedef uint inode_t; +typedef uint iconngroup_t; + +// Connection is the class used to represent connection data and methods. +// It is defined as an abstract class, with pure virtual methods +// that offer an interface for using this class in the same way +// no matter what specific structure is used to represent individual connections +// This abstract class will then be used as a base for derived classes +// using templates, with the connection structure specified by template parameters +class Connection +{ +public: + virtual ~Connection() {}; // destructor + + virtual int calibrate() = 0; // method called by nestgpu calibration + + // methods used to specify the number of bits reserved to represent + // different connection parameters + + // bits reserved for representing node indexes (same value for source and target nodes) + virtual int setMaxNodeNBits( int max_node_nbits ) = 0; + + // bits reserved to represent delays as integer (integer delays) in time-resolution units + virtual int setMaxDelayNBits( int max_node_nbits ) = 0; + + // bits reserved to represent synapse group + virtual int setMaxSynNBits( int 
max_syn_nbits ) = 0; + + // get number of bits reserved to represent node indexes + virtual int getMaxNodeNBits() = 0; + + // get number of bits reserved to represent integer delays + virtual int getMaxDelayNBits() = 0; + + // get number of bits reserved to represent receptor ports + virtual int getMaxPortNBits() = 0; + + // get number of bits reserved to represent synapse groups + virtual int getMaxSynNBits() = 0; + + // get maximum number of integer-delay values used by a node + virtual int getMaxDelayNum() = 0; + + // get number of images of remote spiking nodes having connections to local target nodes + virtual int getNImageNodes() = 0; + + // get flag that indicates if reverse connections are used (e.g. for STDP) + virtual bool getRevConnFlag() = 0; + + // get number of reverse connections + virtual int getNRevConn() = 0; + + + virtual uint* getDevRevSpikeNumPt() = 0; + + // get pt to array of number of reverse connections incoming to each node + virtual int* getDevRevSpikeNConnPt() = 0; + + // get array of number of remote target hosts per local source node + virtual uint* getDevNTargetHosts() = 0; + + // get array with remote target hosts of all nodes + virtual uint** getDevNodeTargetHosts() = 0; + + // get array with remote target hosts map index + virtual uint** getDevNodeTargetHostIMap() = 0; + + // method to organize connections after creation and before using them in simulation + virtual int organizeConnections( inode_t n_node ) = 0; + + // connection methods. 
4 combinations where source and target can be either + // of inode_t type (in case of a sequence) or pointers to inode_t + // (in case of arbitrary arrays if node indexes) + virtual int connect( inode_t source, + inode_t n_source, + inode_t target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) = 0; + + virtual int connect( inode_t source, + inode_t n_source, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) = 0; + + virtual int connect( inode_t* source, + inode_t n_source, + inode_t target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) = 0; + + virtual int connect( inode_t* source, + inode_t n_source, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) = 0; + + // methods to check if a connection parameter, specified by the param_name string + // is an integer or float parameter + int isConnectionIntParam( std::string param_name ); + + int isConnectionFloatParam( std::string param_name ); + + // methods to get the index of the (integer or float) connection parameter specified by + // the param_name string + int getConnectionIntParamIndex( std::string param_name ); + + int getConnectionFloatParamIndex( std::string param_name ); + + // methods to get the values of the (integer or float) connection parameter param_name + // for the connections specified in the array conn_ids in device memory + virtual int + getConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float* h_param_arr, std::string param_name ) = 0; + + virtual int getConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int* h_param_arr, std::string param_name ) = 0; + + // methods to set the values of the (integer or float) connection parameter param_name + // for the connections specified in the array conn_ids in device memory + // The entries can be specified by a single value (val), by a distribution + // (which must be configured before this command) or by an array of values + virtual int 
setConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float val, std::string param_name ) = 0; + + virtual int setConnectionFloatParamDistr( int64_t* conn_ids, int64_t n_conn, std::string param_name ) = 0; + + virtual int + setConnectionIntParamArr( int64_t* conn_ids, int64_t n_conn, int* h_param_arr, std::string param_name ) = 0; + + virtual int setConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int val, std::string param_name ) = 0; + + // method to get the indexes of all the connection specified by an array of source-node indexes + // and/or an array of target-node indexes and eventually the synaptic group + virtual int64_t* getConnections( inode_t* i_source_pt, + inode_t n_source, + inode_t* i_target_pt, + inode_t n_target, + int syn_group, + int64_t* n_conn ) = 0; + + // method to get all parameters of the connections specified by the array conn_ids in device memory + virtual int getConnectionStatus( int64_t* conn_ids, + int64_t n_conn, + inode_t* source, + inode_t* target, + int* port, + int* syn_group, + float* delay, + float* weight ) = 0; + + // method to build direct connections, used by Poisson generators + virtual int buildDirectConnections( inode_t i_node_0, + inode_t n_node, + int64_t& i_conn0, + int64_t& n_dir_conn, + int& max_delay, + float*& d_mu_arr, + void*& d_poiss_key_array ) = 0; + + // method to send spikes through direct connections, used by Poisson generators + virtual int sendDirectSpikes( long long time_idx, + int64_t i_conn0, + int64_t n_dir_conn, + inode_t n_node, + int max_delay, + float* d_mu_arr, + void* d_poiss_key_array, + curandState* d_curand_state ) = 0; + + // method to organize direct connections, after they are created and before using them in the simulation + virtual int organizeDirectConnections( void*& d_poiss_key_array_data_pt, + void*& d_poiss_subarray, + int64_t*& d_poiss_num, + int64_t*& d_poiss_sum, + void*& d_poiss_thresh ) = 0; + + // add a proper offset to externa nodes ids + virtual int 
addOffsetToExternalNodeIds( uint n_local_nodes ) = 0; + + // deallocate memory used to represent the key part of the connection structure for all connections + virtual int freeConnectionKey() = 0; + + // initialize reverse spikes, used e.g. by STDP + virtual int revSpikeInit( uint n_spike_buffers ) = 0; + + // spike time stored in STDP connections is limited to a time window, to reduce memory usage + // the left and right limits of this time interval must be periodically updated + virtual int resetConnectionSpikeTimeUp() = 0; + + virtual int resetConnectionSpikeTimeDown() = 0; + + // set the seeds for random number generation + virtual int setRandomSeed( unsigned long long seed ) = 0; + + // set the time resolution. Must be consistent with the value stored in the nestgpu class + virtual int setTimeResolution( float time_resolution ) = 0; + + // set number of hosts + virtual int setNHosts( int n_hosts ) = 0; + + // set index of this host + virtual int setThisHost( int this_host ) = 0; + + // initialize the maps used to send spikes among remote hosts + virtual int remoteConnectionMapInit() = 0; + + // calibrate the maps used to send spikes among remote hosts + virtual int remoteConnectionMapCalibrate( inode_t n_nodes ) = 0; + + // remote connection methods. 
4 combinations where source and target can be either + // of inode_t type (in case of a sequence) or pointers to inode_t + // (in case of arbitrary arrays if node indexes) + + virtual int remoteConnect( int source_host, + inode_t source, + inode_t n_source, + int target_host, + inode_t target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) = 0; + + virtual int remoteConnect( int source_host, + inode_t* source, + inode_t n_source, + int target_host, + inode_t target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) = 0; + + virtual int remoteConnect( int source_host, + inode_t source, + inode_t n_source, + int target_host, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) = 0; + + virtual int remoteConnect( int source_host, + inode_t* source, + inode_t n_source, + int target_host, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) = 0; + + // add an offset to the remote source node indexes in the spike buffer maps + // after the creation of the spike buffers used to represent such nodes + virtual int addOffsetToSpikeBufferMap( inode_t n_nodes ) = 0; +}; + + +////////////////////////////////////////////////////////////////////// +// Template class used to represent connection of different types +// as derived classes of the base (abstract) class connection +// sharing with that class its method, which offer a common interface that can be used +// in the same way for all derived classes, and adding further internal methods. +// The connection must be represented by a pair key-value +// * The key is a type or a class +// that MUST contain the source-node index and the integer delay, which are used as keys +// for sorting the connections (source-node index as primary key, integer delay as second key). +// It can (but not necessarily should) contain other connection parameters. 
+// * The value is a structure that must contain all the remaining connection parameters +// Typically, for efficient sorting, the source node index will be stored in the most significant bits +// of the key, followed by bits used to represent the delay, end eventually by the bits used to represent +// other parameters not relevant for the sort. + +template < class ConnKeyT, class ConnStructT > +class ConnectionTemplate : public Connection +{ + ////////////////////////////////////////////////// + // Member variables + ////////////////////////////////////////////////// + static const int conn_seed_offset_ = 12345; + + int64_t conn_block_size_; + + int64_t n_conn_; + + std::vector< ConnKeyT* > conn_key_vect_; + + std::vector< ConnStructT* > conn_struct_vect_; + + float time_resolution_; + + std::vector< std::vector< curandGenerator_t > > conn_random_generator_; + + curandGenerator_t local_rnd_gen_; + + Distribution* distribution_; + + // pointer to temporary storage in device memory + void* d_conn_storage_; + + // maximum number of bits used to represent node index + int max_node_nbits_; + + // maximum number of bits used to represent delays + int max_delay_nbits_; + + // maximum number of bits used to represent synapse group index + int max_syn_nbits_; + + // maximum number of bits used to represent receptor port index + int max_port_nbits_; + + // maximum number of bits used to represent receptor port index + // and synapse group index + int max_port_syn_nbits_; + + // bit mask used to extract source node index + uint source_mask_; + + // bit mask used to extract delay + uint delay_mask_; + + // bit mask used to extract target node index + uint target_mask_; + + // bit mask used to extract synapse group index + uint syn_mask_; + + // bit mask used to extract port index + uint port_mask_; + + // bit mask used to extract port and synapse group index + uint port_syn_mask_; + + iconngroup_t* d_conn_group_idx0_; + + int64_t* d_conn_group_iconn0_; + + int* 
d_conn_group_delay_; + + iconngroup_t tot_conn_group_num_; + + int max_delay_num_; + + ConnKeyT** d_conn_key_array_; + + ConnStructT** d_conn_struct_array_; + + inode_t* d_conn_source_ids_; + + int64_t conn_source_ids_size_; + + ////////////////////////////////////////////////// + // Remote-connection-related member variables + ////////////////////////////////////////////////// + int this_host_; + + int n_hosts_; + + int n_image_nodes_; + + // The arrays that map remote source nodes to local spike buffers + // are organized in blocks having block size: + uint node_map_block_size_; // = 100000; + + // number of elements in the map for each source host + // n_remote_source_node_map[i_source_host] + // with i_source_host = 0, ..., n_hosts-1 excluding this host itself + std::vector< uint > h_n_remote_source_node_map_; + uint* d_n_remote_source_node_map_; + + // remote_source_node_map_[i_source_host][i_block][i] + std::vector< std::vector< uint* > > h_remote_source_node_map_; + + // local_spike_buffer_map[i_source_host][i_block][i] + std::vector< std::vector< uint* > > h_local_spike_buffer_map_; + uint*** d_local_spike_buffer_map_; + + // hd_local_spike_buffer_map_[i_source_host] vector of pointers to gpu memory + std::vector< uint** > hd_local_spike_buffer_map_; + + // Arrays that map local source nodes to remote spike buffers + // number of elements in the map for each target host + // n_local_source_node_map[i_target_host] + // with i_target_host = 0, ..., n_hosts-1 excluding this host itself + uint* d_n_local_source_node_map_; + std::vector< uint > h_n_local_source_node_map_; + + // local_source_node_map[i_target_host][i_block][i] + std::vector< std::vector< uint* > > h_local_source_node_map_; + uint*** d_local_source_node_map_; + + // hd_local_source_node_map_[i_target_host] vector of pointers to gpu memory + std::vector< uint** > hd_local_source_node_map_; + + // number of remote target hosts on which each local node + // has outgoing connections + uint* 
d_n_target_hosts_; // [n_nodes] + // cumulative sum of d_n_target_hosts + uint* d_n_target_hosts_cumul_; // [n_nodes+1] + + // Global array with remote target hosts indexes of all nodes + // target_host_array[total_num] where total_num is the sum + // of n_target_hosts[i_node] on all nodes + uint* d_target_host_array_; + // pointer to the starting position in target_host_array + // of the target hosts for the node i_node + uint** d_node_target_hosts_; // [i_node] + + // Global array with remote target hosts map indexes of all nodes + // target_host_i_map[total_num] where total_num is the sum + // of n_target_hosts[i_node] on all nodes + uint* d_target_host_i_map_; + // pointer to the starting position in target_host_i_map array + // of the target host map indexes for the node i_node + uint** d_node_target_host_i_map_; // [i_node] + + // node map index + uint** d_node_map_index_; // [i_node] + + // Boolean array with one boolean value for each connection rule + // - true if the rule always creates at least one outgoing connection + // from each source node (one_to_one, all_to_all, fixed_outdegree) + // - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli) + bool* use_all_source_nodes_; // [n_connection_rules]: + + ////////////////////////////////////////////////// + // reverse-connection-related member variables + ////////////////////////////////////////////////// + bool rev_conn_flag_; + + bool spike_time_flag_; + + unsigned short* d_conn_spike_time_; // [n_conn_]; + + int64_t n_rev_conn_; + + uint* d_rev_spike_num_; + + uint* d_rev_spike_target_; + + int* d_rev_spike_n_conn_; + + int64_t* d_rev_conn_; //[i] i=0,..., n_rev_conn_ - 1; + + int* d_target_rev_conn_size_; //[i] i=0,..., n_neuron-1; + + int64_t** d_target_rev_conn_; //[i][j] j=0,...,rev_conn_size_[i]-1 + + ////////////////////////////////////////////////// + // class ConnectionTemplate methods + ////////////////////////////////////////////////// +public: + ConnectionTemplate(); + + 
  // Initialize the connection data structures
  int init();

  // Finalize internal structures before simulation
  int calibrate();

  // Create/destroy the cuRAND generator used for probabilistic connection rules
  int initConnRandomGenerator();

  int freeConnRandomGenerator();

  int setRandomSeed( unsigned long long seed );

  int setTimeResolution( float time_resolution );

  // Internal setters of the bit-field widths used to pack connection keys
  int _setMaxNodeNBits( int max_node_nbits );

  int _setMaxDelayNBits( int max_delay_nbits );

  int _setMaxSynNBits( int max_syn_nbits );

  // Public wrappers that forward to the internal bit-width setters
  int
  setMaxNodeNBits( int max_node_nbits )
  {
    return _setMaxNodeNBits( max_node_nbits );
  }

  int
  setMaxDelayNBits( int max_delay_nbits )
  {
    return _setMaxDelayNBits( max_delay_nbits );
  }

  int
  setMaxSynNBits( int max_syn_nbits )
  {
    return _setMaxSynNBits( max_syn_nbits );
  }

  // Trivial accessors for the packed-field widths and derived quantities
  int
  getMaxNodeNBits()
  {
    return max_node_nbits_;
  }

  int
  getMaxDelayNBits()
  {
    return max_delay_nbits_;
  }

  int
  getMaxPortNBits()
  {
    return max_port_nbits_;
  }

  int
  getMaxSynNBits()
  {
    return max_syn_nbits_;
  }

  int
  getMaxDelayNum()
  {
    return max_delay_num_;
  }

  int
  getNImageNodes()
  {
    return n_image_nodes_;
  }

  bool
  getRevConnFlag()
  {
    return rev_conn_flag_;
  }

  // NOTE(review): n_rev_conn_ is declared int64_t but is returned here as int,
  // which silently truncates when there are more than INT_MAX reverse
  // connections — confirm whether the return type should be int64_t.
  int
  getNRevConn()
  {
    return n_rev_conn_;
  }

  uint*
  getDevRevSpikeNumPt()
  {
    return d_rev_spike_num_;
  }

  int*
  getDevRevSpikeNConnPt()
  {
    return d_rev_spike_n_conn_;
  }

  uint*
  getDevNTargetHosts()
  {
    return d_n_target_hosts_;
  }

  uint**
  getDevNodeTargetHosts()
  {
    return d_node_target_hosts_;
  }

  uint**
  getDevNodeTargetHostIMap()
  {
    return d_node_target_host_i_map_;
  }

  // Grow the block-structured connection storage by new_n_block blocks
  int allocateNewBlocks( int new_n_block );

  int freeConnectionKey();

  // Fill the weight field of a subarray of new connections according to syn_spec
  int setConnectionWeights( curandGenerator_t& gen,
    void* d_storage,
    ConnStructT* conn_struct_subarray,
    int64_t n_conn,
    SynSpec& syn_spec );

  // Fill the delay field of a subarray of new connection keys according to syn_spec
  int setConnectionDelays( curandGenerator_t& gen,
    void* d_storage,
    ConnKeyT* conn_key_subarray,
    int64_t n_conn,
    SynSpec& syn_spec );

  void setConnSource( ConnKeyT& conn_key, inode_t source );

  int getConnDelay( const ConnKeyT& conn_key );

  // connect() overloads for the four combinations of scalar-range /
  // array-of-indexes source and target; all forward to the _Connect template
  int
  connect( inode_t source, inode_t n_source, inode_t target, inode_t n_target, ConnSpec& conn_spec, SynSpec& syn_spec )
  {
    return _Connect( source, n_source, target, n_target, conn_spec, syn_spec );
  }

  int
  connect( inode_t* source, inode_t n_source, inode_t target, inode_t n_target, ConnSpec& conn_spec, SynSpec& syn_spec )
  {
    return _Connect( source, n_source, target, n_target, conn_spec, syn_spec );
  }

  int
  connect( inode_t source, inode_t n_source, inode_t* target, inode_t n_target, ConnSpec& conn_spec, SynSpec& syn_spec )
  {
    return _Connect( source, n_source, target, n_target, conn_spec, syn_spec );
  }

  int
  connect( inode_t* source,
    inode_t n_source,
    inode_t* target,
    inode_t n_target,
    ConnSpec& conn_spec,
    SynSpec& syn_spec )
  {
    return _Connect( source, n_source, target, n_target, conn_spec, syn_spec );
  }

  // Generic connection entry point; T1/T2 are either inode_t (first index of a
  // contiguous range) or inode_t* (array of node indexes)
  template < class T1, class T2 >
  int _Connect( T1 source, inode_t n_source, T2 target, inode_t n_target, ConnSpec& conn_spec, SynSpec& syn_spec );

  template < class T1, class T2 >
  int _Connect( curandGenerator_t& gen,
    T1 source,
    inode_t n_source,
    T2 target,
    inode_t n_target,
    ConnSpec& conn_spec,
    SynSpec& syn_spec,
    bool remote_source_flag );

  // One method per connection rule; dispatched from _Connect
  template < class T1, class T2 >
  int connectOneToOne( curandGenerator_t& gen,
    T1 source,
    T2 target,
    inode_t n_node,
    SynSpec& syn_spec,
    bool remote_source_flag );

  template < class T1, class T2 >
  int connectAllToAll( curandGenerator_t& gen,
    T1 source,
    inode_t n_source,
    T2 target,
    inode_t n_target,
    SynSpec& syn_spec,
    bool remote_source_flag );

  template < class T1, class T2 >
  int connectFixedTotalNumber( curandGenerator_t& gen,
    T1 source,
    inode_t n_source,
    T2 target,
    inode_t n_target,
    int64_t total_num,
    SynSpec& syn_spec,
    bool remote_source_flag );

  template < class T1, class T2 >
  int connectFixedIndegree( curandGenerator_t& gen,
    T1 source,
    inode_t n_source,
    T2 target,
    inode_t n_target,
    int indegree,
    SynSpec& syn_spec,
    bool remote_source_flag );

  template < class T1, class T2 >
  int connectFixedOutdegree( curandGenerator_t& gen,
    T1 source,
    inode_t n_source,
    T2 target,
    inode_t n_target,
    int outdegree,
    SynSpec& syn_spec,
    bool remote_source_flag );

public:
  // Sort the connections by (source, delay) and build the connection groups
  int organizeConnections( inode_t n_node );

  // Getters/setters for per-connection parameters addressed by connection ids.
  // NOTE(review): param_name is taken by value; const std::string& would avoid
  // a copy per call — confirm before changing the signatures.
  int getConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float* h_param_arr, std::string param_name );

  int getConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int* h_param_arr, std::string param_name );

  int setConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float val, std::string param_name );

  int setConnectionFloatParamDistr( int64_t* conn_ids, int64_t n_conn, std::string param_name );

  int setConnectionIntParamArr( int64_t* conn_ids, int64_t n_conn, int* h_param_arr, std::string param_name );

  int setConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int val, std::string param_name );

  // Return a newly allocated array of connection ids matching the given
  // source/target sets and synapse group; *n_conn receives its length
  int64_t* getConnections( inode_t* i_source_pt,
    inode_t n_source,
    inode_t* i_target_pt,
    inode_t n_target,
    int syn_group,
    int64_t* n_conn );

  // Retrieve all parameters of the listed connections into the output arrays
  int getConnectionStatus( int64_t* conn_ids,
    int64_t n_conn,
    inode_t* source,
    inode_t* target,
    int* port,
    int* syn_group,
    float* delay,
    float* weight );

  //////////////////////////////////////////////////
  // class ConnectionTemplate remote-connection-related methods
  //////////////////////////////////////////////////

  // set number of hosts
  int setNHosts( int n_hosts );

  // set index of this host
  int setThisHost( int this_host );

  // Initialize the maps
  int remoteConnectionMapInit();

  // Calibrate the maps
  int remoteConnectionMapCalibrate( inode_t n_nodes );

  // Allocate GPU memory for new remote-source-node-map blocks
  int allocRemoteSourceNodeMapBlocks( std::vector< uint* >& i_remote_src_node_map,
    std::vector< uint* >& i_local_spike_buf_map,
    uint new_n_block );

  // Allocate GPU memory for new local-source-node-map blocks
  int allocLocalSourceNodeMapBlocks( std::vector< uint* >& i_local_src_node_map, uint new_n_block );

  // allocate/reallocate device memory to store source node indexes of a
  // remote connection command
  int reallocConnSourceIds( int64_t n_conn );

  // Loop on all new connections and set source_node_flag[i_source]=true
  int setUsedSourceNodes( int64_t old_n_conn, uint* d_source_node_flag );

  int setUsedSourceNodesOnSourceHost( int64_t old_n_conn, uint* d_source_node_flag );

  // Loops on all new connections and replaces the source node index
  // source_node[i_conn] with the value of the element pointed by the
  // index itself in the array local_node_index
  int fixConnectionSourceNodeIndexes( int64_t old_n_conn, uint* d_local_node_index );

  // remote connect functions
  // (four overloads mirroring the local connect() overloads; all forward to
  // the _RemoteConnect template)
  int
  remoteConnect( int source_host,
    inode_t source,
    inode_t n_source,
    int target_host,
    inode_t target,
    inode_t n_target,
    ConnSpec& conn_spec,
    SynSpec& syn_spec )
  {
    return _RemoteConnect< inode_t, inode_t >(
      source_host, source, n_source, target_host, target, n_target, conn_spec, syn_spec );
  }

  int
  remoteConnect( int source_host,
    inode_t* source,
    inode_t n_source,
    int target_host,
    inode_t target,
    inode_t n_target,
    ConnSpec& conn_spec,
    SynSpec& syn_spec )
  {
    return _RemoteConnect< inode_t*, inode_t >(
      source_host, source, n_source, target_host, target, n_target, conn_spec, syn_spec );
  }

  int
  remoteConnect( int source_host,
    inode_t source,
    inode_t n_source,
    int target_host,
    inode_t* target,
    inode_t n_target,
    ConnSpec& conn_spec,
    SynSpec& syn_spec )
  {
    return _RemoteConnect< inode_t, inode_t* >(
      source_host, source, n_source, target_host, target, n_target, conn_spec, syn_spec );
  }

  int
  remoteConnect( int source_host,
    inode_t* source,
    inode_t n_source,
    int target_host,
    inode_t* target,
    inode_t n_target,
    ConnSpec& conn_spec,
    SynSpec& syn_spec )
  {
    return _RemoteConnect< inode_t*, inode_t* >(
      source_host, source, n_source, target_host, target, n_target, conn_spec, syn_spec );
  }

  template < class T1, class T2 >
  int _RemoteConnect( int source_host,
    T1 source,
    inode_t n_source,
    int target_host,
    T2 target,
    inode_t n_target,
    ConnSpec& conn_spec,
    SynSpec& syn_spec );

  int addOffsetToExternalNodeIds( uint n_local_nodes );

  // REMOTE CONNECT FUNCTION for target_host matching this_host
  template < class T1, class T2 >
  int remoteConnectSource( int source_host,
    T1 source,
    inode_t n_source,
    T2 target,
    inode_t n_target,
    ConnSpec& conn_spec,
    SynSpec& syn_spec );

  // REMOTE CONNECT FUNCTION for source_host matching this_host
  template < class T1, class T2 >
  int remoteConnectTarget( int target_host,
    T1 source,
    inode_t n_source,
    T2 target,
    inode_t n_target,
    ConnSpec& conn_spec,
    SynSpec& syn_spec );

  int addOffsetToSpikeBufferMap( inode_t n_nodes );

  //////////////////////////////////////////////////
  // class ConnectionTemplate reverse-connection-related methods
  //////////////////////////////////////////////////
  int revSpikeFree();

  int revSpikeInit( uint n_spike_buffers );

  int resetConnectionSpikeTimeUp();

  int resetConnectionSpikeTimeDown();

  //////////////////////////////////////////////////
  // class ConnectionTemplate direct-connection-related methods
  //////////////////////////////////////////////////
  int buildDirectConnections( inode_t i_node_0,
    inode_t n_node,
    int64_t& i_conn0,
    int64_t& n_dir_conn,
    int& max_delay,
    float*& d_mu_arr,
    void*& d_poiss_key_array );

  int organizeDirectConnections( void*& d_poiss_key_array_data_pt,
    void*& d_poiss_subarray,
    int64_t*& d_poiss_num,
    int64_t*& d_poiss_sum,
    void*& d_poiss_thresh );

  int sendDirectSpikes( long long time_idx,
    int64_t i_conn0,
    int64_t n_dir_conn,
    inode_t n_node,
    int max_delay,
    float* d_mu_arr,
    void* d_poiss_key_array,
    curandState* d_curand_state );
};

// Device-side state shared by the direct (Poisson) connection kernels
namespace poiss_conn
{
extern void* d_poiss_key_array_data_pt;
extern void* d_poiss_subarray;
extern int64_t* d_poiss_num;
extern int64_t* d_poiss_sum;
extern void* d_poiss_thresh;
int organizeDirectConnections( Connection* conn );
}; // namespace poiss_conn

// Indexes of the float per-connection parameters, used by the
// get/setConnectionFloatParam kernels below
enum ConnectionFloatParamIndexes
{
  i_weight_param = 0,
  i_delay_param,
  N_CONN_FLOAT_PARAM
};

// Indexes of the integer per-connection parameters, used by the
// get/setConnectionIntParam kernels below
enum ConnectionIntParamIndexes
{
  i_source_param = 0,
  i_target_param,
  i_port_param,
  i_syn_group_param,
  N_CONN_INT_PARAM
};

extern __constant__ float NESTGPUTimeResolution;

extern __device__ int16_t* NodeGroupMap;

extern __constant__ NodeGroupStruct NodeGroupArray[];

// maximum number of bits used to represent node index
extern __device__ int MaxNodeNBits;

// maximum number of bits used to represent delays
extern __device__ int MaxDelayNBits;

// maximum number of bits used to represent synapse group index
extern __device__ int MaxSynNBits;

// maximum number of bits used to represent receptor port index
extern __device__ int MaxPortNBits;

// maximum number of bits used to represent receptor port index
// and synapse group index
extern __device__ int MaxPortSynNBits;

// bit mask used to extract source node index
extern __device__ uint SourceMask;

// bit mask used to extract delay
extern __device__ uint DelayMask;

// bit mask used to extract target node index
extern __device__ uint TargetMask;

// bit mask used to extract synapse group index
extern __device__ uint SynMask;

// bit mask used to extract port index
extern __device__ uint PortMask;

// bit mask used to extract port and synapse group index
extern __device__ uint PortSynMask;

extern __device__ iconngroup_t* ConnGroupIdx0;

extern __device__ int64_t* ConnGroupIConn0;

extern __device__ int* ConnGroupDelay;

extern __device__ int64_t ConnBlockSize;

// it seems that there is no relevant advantage in using a constant array
// however better to keep this option ready and commented
// extern __constant__ uint* ConnKeyArray[];
extern __device__ void* ConnKeyArray;

// extern __constant__ connection_struct* ConnStructArray[];
extern __device__ void* ConnStructArray;

extern __device__ unsigned short* ConnectionSpikeTime;

// Device accessors that pack/unpack the connection key and structure fields;
// specialized elsewhere for each concrete ConnKeyT/ConnStructT
template < class ConnKeyT >
__device__ __forceinline__ void setConnDelay( ConnKeyT& conn_key, int delay );

template < class ConnKeyT >
__device__ __forceinline__ void setConnSource( ConnKeyT& conn_key, inode_t source );

template < class ConnStructT >
__device__ __forceinline__ void setConnTarget( ConnStructT& conn_struct, inode_t target );

template < class ConnKeyT, class ConnStructT >
__device__ __forceinline__ void setConnPort( ConnKeyT& conn_key, ConnStructT& conn_struct, int port );

template < class ConnKeyT, class ConnStructT >
__device__ __forceinline__ void setConnSyn( ConnKeyT& conn_key, ConnStructT& conn_struct, int syn );

template < class ConnKeyT >
__device__ __forceinline__ int getConnDelay( const ConnKeyT& conn_key );

template < class ConnKeyT >
__device__ __forceinline__ inode_t getConnSource( ConnKeyT& conn_key );

template < class ConnStructT >
__device__ __forceinline__ inode_t getConnTarget( ConnStructT& conn_struct );

template < class ConnKeyT, class ConnStructT >
__device__ __forceinline__ int getConnPort( ConnKeyT& conn_key, ConnStructT& conn_struct );

template < class ConnKeyT, class ConnStructT >
__device__ __forceinline__ int getConnSyn( ConnKeyT& conn_key, ConnStructT& conn_struct );

template < class ConnKeyT, class ConnStructT >
__device__ __forceinline__ bool getConnRemoteFlag( ConnKeyT& conn_key, ConnStructT& conn_struct );

template < class ConnKeyT, class ConnStructT >
__device__ __forceinline__ void clearConnRemoteFlag( ConnKeyT& conn_key, ConnStructT& conn_struct );

// Kernel: assign the same weight to n_conn connections of a block subarray
template < class ConnStructT >
__global__ void
setWeights( ConnStructT* conn_struct_subarray, float weight, int64_t n_conn )
{
  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
  if ( i_conn >= n_conn )
  {
    return;
  }
  conn_struct_subarray[ i_conn ].weight = weight;
}

// Kernel: assign per-connection weights from arr_val
template < class ConnStructT >
__global__ void
setWeights( ConnStructT* conn_struct_subarray, float* arr_val, int64_t n_conn )
{
  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
  if ( i_conn >= n_conn )
  {
    return;
  }
  conn_struct_subarray[ i_conn ].weight = arr_val[ i_conn ];
}

// Kernel: convert per-connection delays from time units to time steps
// (minimum 1 step) and pack them into the connection keys
template < class ConnKeyT >
__global__ void
setDelays( ConnKeyT* conn_key_subarray, float* arr_val, int64_t n_conn, float time_resolution )
{
  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
  if ( i_conn >= n_conn )
  {
    return;
  }
  int delay = ( int ) round( arr_val[ i_conn ] / time_resolution );
  delay = max( delay, 1 );
  setConnDelay< ConnKeyT >( conn_key_subarray[ i_conn ], delay );
}

// Kernel: same as above with a single delay value for all connections
template < class ConnKeyT >
__global__ void
setDelays( ConnKeyT* conn_key_subarray, float fdelay, int64_t n_conn, float time_resolution )
{
  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
  if ( i_conn >= n_conn )
  {
    return;
  }
  int delay = ( int ) round( fdelay / time_resolution );
  delay = max( delay, 1 );
  setConnDelay< ConnKeyT >( conn_key_subarray[ i_conn ], delay );
}

// Kernel: set the receptor port of n_conn connections
template < class ConnKeyT, class ConnStructT >
__global__ void
setPort( ConnKeyT* conn_key_subarray, ConnStructT* conn_struct_subarray, int port, int64_t n_conn )
{
  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
  if ( i_conn >= n_conn )
  {
    return;
  }
  setConnPort< ConnKeyT, ConnStructT >( conn_key_subarray[ i_conn ], conn_struct_subarray[ i_conn ], port );
}

// Kernel: set the synapse group of n_conn connections
template < class ConnKeyT, class ConnStructT >
__global__ void
setSynGroup( ConnKeyT* conn_key_subarray, ConnStructT* conn_struct_subarray, int syn_group, int64_t n_conn )
{
  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
  if ( i_conn >= n_conn )
  {
    return;
  }
  setConnSyn< ConnKeyT, ConnStructT >( conn_key_subarray[ i_conn ], conn_struct_subarray[ i_conn ], syn_group );
}

// Kernel: set receptor port and synapse group in a single pass
template < class ConnKeyT, class ConnStructT >
__global__ void
setPortSynGroup( ConnKeyT* conn_key_subarray,
  ConnStructT* conn_struct_subarray,
  int port,
  int syn_group,
  int64_t n_conn )
{
  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
  if ( i_conn >= n_conn )
  {
    return;
  }
  setConnPort< ConnKeyT, ConnStructT >( conn_key_subarray[ i_conn ], conn_struct_subarray[ i_conn ], port );
  setConnSyn< ConnKeyT, ConnStructT >( conn_key_subarray[ i_conn ], conn_struct_subarray[ i_conn ], syn_group );
}

__global__ void setSourceTargetIndexKernel( uint64_t n_src_tgt,
  inode_t n_source,
  inode_t n_target,
  uint64_t* d_src_tgt_arr,
  inode_t* d_src_arr,
  inode_t* d_tgt_arr );

__global__ void setConnGroupNum( inode_t n_compact,
  iconngroup_t* conn_group_num,
  iconngroup_t* conn_group_idx0_compact,
  inode_t* conn_group_source_compact );

__global__ void setConnGroupIConn0( int64_t n_block_conn,
  int* conn_group_iconn0_mask,
  iconngroup_t* conn_group_iconn0_mask_cumul,
  int64_t* conn_group_iconn0,
  int64_t i_conn0,
  iconngroup_t* offset );

// Single-thread kernel: accumulate *add_offset into *offset on the device
template < class T >
__global__ void
setConnGroupNewOffset( T* offset, T* add_offset )
{
  *offset = *offset + *add_offset;
}

// Kernel: mark with 1 the first connection of each (source, delay) group
// within a block; conn_key_subarray_prev points to the last key of the
// previous block (or NULL for the first block).
// NOTE(review): `!= NULL` predates the new clang-tidy modernize-use-nullptr
// check; candidate for `nullptr` in a follow-up.
template < class ConnKeyT >
__global__ void
buildConnGroupIConn0Mask( ConnKeyT* conn_key_subarray,
  ConnKeyT* conn_key_subarray_prev,
  int64_t n_block_conn,
  int* conn_group_iconn0_mask )
{
  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
  if ( i_conn >= n_block_conn )
  {
    return;
  }
  ConnKeyT val = conn_key_subarray[ i_conn ];
  ConnKeyT prev_val;
  inode_t prev_source;
  int prev_delay;
  if ( i_conn == 0 )
  {
    if ( conn_key_subarray_prev != NULL )
    {
      prev_val = *conn_key_subarray_prev;
      prev_source = getConnSource< ConnKeyT >( prev_val );
      prev_delay = getConnDelay< ConnKeyT >( prev_val );
    }
    else
    {
      prev_source = 0;
      prev_delay = -1; // just to ensure it is different
    }
  }
else + { + prev_val = conn_key_subarray[ i_conn - 1 ]; + prev_source = getConnSource< ConnKeyT >( prev_val ); + prev_delay = getConnDelay< ConnKeyT >( prev_val ); + } + inode_t source = getConnSource< ConnKeyT >( val ); + int delay = getConnDelay< ConnKeyT >( val ); + if ( source != prev_source || delay != prev_delay ) + { + conn_group_iconn0_mask[ i_conn ] = 1; + } +} + +template < class ConnKeyT > +__global__ void +setConnGroupIdx0Compact( ConnKeyT* conn_key_subarray, + int64_t n_block_conn, + int* conn_group_idx0_mask, + iconngroup_t* conn_group_iconn0_mask_cumul, + inode_t* conn_group_idx0_mask_cumul, + iconngroup_t* conn_group_idx0_compact, + inode_t* conn_group_source_compact, + iconngroup_t* iconn0_offset, + inode_t* idx0_offset ) +{ + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn > n_block_conn ) + { + return; + } + if ( i_conn < n_block_conn && conn_group_idx0_mask[ i_conn ] == 0 ) + { + return; + } + iconngroup_t i_group = conn_group_iconn0_mask_cumul[ i_conn ] + *iconn0_offset; + inode_t i_source_compact = conn_group_idx0_mask_cumul[ i_conn ] + *idx0_offset; + conn_group_idx0_compact[ i_source_compact ] = i_group; + if ( i_conn < n_block_conn ) + { + // int source = conn_key_subarray[i_conn] >> MaxPortSynNBits; + inode_t source = getConnSource< ConnKeyT >( conn_key_subarray[ i_conn ] ); + conn_group_source_compact[ i_source_compact ] = source; + } +} + +template < class ConnKeyT > +__global__ void +getConnGroupDelay( int64_t block_size, + ConnKeyT** conn_key_array, + int64_t* conn_group_iconn0, + int* conn_group_delay, + iconngroup_t conn_group_num ) +{ + iconngroup_t conn_group_idx = threadIdx.x + blockIdx.x * blockDim.x; + if ( conn_group_idx >= conn_group_num ) + { + return; + } + int64_t i_conn = conn_group_iconn0[ conn_group_idx ]; + int i_block = ( int ) ( i_conn / block_size ); + int64_t i_block_conn = i_conn % block_size; + ConnKeyT& conn_key = conn_key_array[ i_block ][ i_block_conn ]; + conn_group_delay[ conn_group_idx ] 
= getConnDelay( conn_key ); +} + +template < class ConnKeyT > +__global__ void +buildConnGroupMask( ConnKeyT* conn_key_subarray, + ConnKeyT* conn_key_subarray_prev, + int64_t n_block_conn, + int* conn_group_iconn0_mask, + int* conn_group_idx0_mask ) +{ + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_block_conn ) + { + return; + } + ConnKeyT val = conn_key_subarray[ i_conn ]; + ConnKeyT prev_val; + inode_t prev_source; + int prev_delay; + if ( i_conn == 0 ) + { + if ( conn_key_subarray_prev != NULL ) + { + prev_val = *conn_key_subarray_prev; + // prev_source = prev_val >> MaxPortSynNBits; + prev_source = getConnSource< ConnKeyT >( prev_val ); + prev_delay = getConnDelay< ConnKeyT >( prev_val ); + } + else + { + prev_source = 0; + prev_delay = -1; // just to ensure it is different + } + } + else + { + prev_val = conn_key_subarray[ i_conn - 1 ]; + // prev_source = prev_val >> MaxPortSynNBits; + prev_source = getConnSource< ConnKeyT >( prev_val ); + prev_delay = getConnDelay< ConnKeyT >( prev_val ); + } + // int source = val >> MaxPortSynNBits; + inode_t source = getConnSource< ConnKeyT >( val ); + if ( source != prev_source || prev_delay < 0 ) + { + conn_group_iconn0_mask[ i_conn ] = 1; + conn_group_idx0_mask[ i_conn ] = 1; + } + else + { + int delay = getConnDelay< ConnKeyT >( val ); + if ( delay != prev_delay ) + { + conn_group_iconn0_mask[ i_conn ] = 1; + } + } +} + +__device__ __forceinline__ inode_t +getNodeIndex( inode_t i_node_0, inode_t i_node_rel ) +{ + return i_node_0 + i_node_rel; +} + +__device__ __forceinline__ inode_t +getNodeIndex( inode_t* i_node_0, inode_t i_node_rel ) +{ + return *( i_node_0 + i_node_rel ); +} + +template < class T, class ConnKeyT > +__global__ void +setSource( ConnKeyT* conn_key_subarray, uint* rand_val, int64_t n_conn, T source, inode_t n_source ) +{ + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_conn ) + { + return; + } + inode_t i_source = getNodeIndex( source, rand_val[ 
i_conn ] % n_source ); + setConnSource< ConnKeyT >( conn_key_subarray[ i_conn ], i_source ); +} + +template < class T > +__global__ void +setSource( inode_t* conn_source_ids, uint* rand_val, int64_t n_conn, T source, inode_t n_source ) +{ + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_conn ) + { + return; + } + inode_t i_source = getNodeIndex( source, rand_val[ i_conn ] % n_source ); + conn_source_ids[ i_conn ] = i_source; +} + +template < class T, class ConnStructT > +__global__ void +setTarget( ConnStructT* conn_struct_subarray, uint* rand_val, int64_t n_conn, T target, inode_t n_target ) +{ + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_conn ) + { + return; + } + inode_t i_target = getNodeIndex( target, rand_val[ i_conn ] % n_target ); + setConnTarget< ConnStructT >( conn_struct_subarray[ i_conn ], i_target ); +} + +template < class T1, class T2, class ConnKeyT, class ConnStructT > +__global__ void +setOneToOneSourceTarget( ConnKeyT* conn_key_subarray, + ConnStructT* conn_struct_subarray, + int64_t n_block_conn, + int64_t n_prev_conn, + T1 source, + T2 target ) +{ + int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_block_conn >= n_block_conn ) + { + return; + } + int64_t i_conn = n_prev_conn + i_block_conn; + inode_t i_source = getNodeIndex( source, ( int ) ( i_conn ) ); + inode_t i_target = getNodeIndex( target, ( int ) ( i_conn ) ); + setConnSource< ConnKeyT >( conn_key_subarray[ i_block_conn ], i_source ); + setConnTarget< ConnStructT >( conn_struct_subarray[ i_block_conn ], i_target ); +} + +template < class T > +__global__ void +setOneToOneSource( inode_t* conn_source_ids, int64_t n_block_conn, int64_t n_prev_conn, T source ) +{ + int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_block_conn >= n_block_conn ) + { + return; + } + int64_t i_conn = n_prev_conn + i_block_conn; + inode_t i_source = getNodeIndex( source, ( int ) ( i_conn ) ); + conn_source_ids[ 
i_conn ] = i_source; +} + +template < class T1, class T2, class ConnKeyT, class ConnStructT > +__global__ void +setAllToAllSourceTarget( ConnKeyT* conn_key_subarray, + ConnStructT* conn_struct_subarray, + int64_t n_block_conn, + int64_t n_prev_conn, + T1 source, + inode_t n_source, + T2 target, + inode_t n_target ) +{ + int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_block_conn >= n_block_conn ) + { + return; + } + int64_t i_conn = n_prev_conn + i_block_conn; + inode_t i_source = getNodeIndex( source, ( int ) ( i_conn / n_target ) ); + inode_t i_target = getNodeIndex( target, ( int ) ( i_conn % n_target ) ); + setConnSource< ConnKeyT >( conn_key_subarray[ i_block_conn ], i_source ); + setConnTarget< ConnStructT >( conn_struct_subarray[ i_block_conn ], i_target ); +} + +template < class T1 > +__global__ void +setAllToAllSource( inode_t* conn_source_ids, + int64_t n_block_conn, + int64_t n_prev_conn, + T1 source, + inode_t n_source, + inode_t n_target ) +{ + int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_block_conn >= n_block_conn ) + { + return; + } + int64_t i_conn = n_prev_conn + i_block_conn; + inode_t i_source = getNodeIndex( source, ( int ) ( i_conn / n_target ) ); + conn_source_ids[ i_conn ] = i_source; +} + +template < class T, class ConnStructT > +__global__ void +setIndegreeTarget( ConnStructT* conn_struct_subarray, + int64_t n_block_conn, + int64_t n_prev_conn, + T target, + int indegree ) +{ + int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_block_conn >= n_block_conn ) + { + return; + } + int64_t i_conn = n_prev_conn + i_block_conn; + inode_t i_target = getNodeIndex( target, ( int ) ( i_conn / indegree ) ); + setConnTarget< ConnStructT >( conn_struct_subarray[ i_block_conn ], i_target ); +} + +template < class T, class ConnKeyT > +__global__ void +setOutdegreeSource( ConnKeyT* conn_key_subarray, int64_t n_block_conn, int64_t n_prev_conn, T source, int outdegree ) +{ + int64_t 
i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_block_conn >= n_block_conn ) + { + return; + } + int64_t i_conn = n_prev_conn + i_block_conn; + inode_t i_source = getNodeIndex( source, ( int ) ( i_conn / outdegree ) ); + setConnSource< ConnKeyT >( conn_key_subarray[ i_block_conn ], i_source ); +} + +template < class T > +__global__ void +setOutdegreeSource( inode_t* conn_source_ids, int64_t n_block_conn, int64_t n_prev_conn, T source, int outdegree ) +{ + int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_block_conn >= n_block_conn ) + { + return; + } + int64_t i_conn = n_prev_conn + i_block_conn; + inode_t i_source = getNodeIndex( source, ( int ) ( i_conn / outdegree ) ); + conn_source_ids[ i_conn ] = i_source; +} + +// Count number of connections per source-target couple +template < class ConnKeyT, class ConnStructT > +__global__ void +countConnectionsKernel( int64_t n_conn, + inode_t n_source, + inode_t n_target, + uint64_t* src_tgt_arr, + uint64_t* src_tgt_conn_num, + int syn_group ) +{ + int64_t i_conn = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x; + if ( i_conn >= n_conn ) + { + return; + } + + int i_block = ( int ) ( i_conn / ConnBlockSize ); + int64_t i_block_conn = i_conn % ConnBlockSize; + ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ]; + ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ]; + // if (syn_group==-1 || conn.syn_group == syn_group) { + int syn_group1 = getConnSyn< ConnKeyT, ConnStructT >( conn_key, conn_struct ); + if ( syn_group == -1 || ( syn_group1 == syn_group ) ) + { + // First get source and target node index + inode_t i_target = getConnTarget< ConnStructT >( conn_struct ); + inode_t i_source = getConnSource< ConnKeyT >( conn_key ); + uint64_t i_src_tgt = ( ( int64_t ) i_source << 32 ) | i_target; + uint64_t i_arr = locate( i_src_tgt, src_tgt_arr, n_source * n_target ); + if ( src_tgt_arr[ i_arr ] == i_src_tgt ) + { + // 
printf("i_conn %lld i_source %d i_target %d i_src_tgt %lld " + // "i_arr %lld\n", i_conn, i_source, i_target, i_src_tgt, i_arr); + // (atomic)increase the number of connections for source-target couple + atomicAdd( ( unsigned long long* ) &src_tgt_conn_num[ i_arr ], 1 ); + } + } +} + +// Fill array of connection indexes +template < class ConnKeyT, class ConnStructT > +__global__ void +setConnectionsIndexKernel( int64_t n_conn, + inode_t n_source, + inode_t n_target, + uint64_t* src_tgt_arr, + uint64_t* src_tgt_conn_num, + uint64_t* src_tgt_conn_cumul, + int syn_group, + int64_t* conn_ids ) +{ + int64_t i_conn = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x; + if ( i_conn >= n_conn ) + { + return; + } + + int i_block = ( int ) ( i_conn / ConnBlockSize ); + int64_t i_block_conn = i_conn % ConnBlockSize; + ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ]; + ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ]; + // if (syn_group==-1 || conn.syn_group == syn_group) { + int syn_group1 = getConnSyn< ConnKeyT, ConnStructT >( conn_key, conn_struct ); + if ( syn_group == -1 || ( syn_group1 == syn_group ) ) + { + // First get source and target node index + inode_t i_target = getConnTarget< ConnStructT >( conn_struct ); + inode_t i_source = getConnSource< ConnKeyT >( conn_key ); + uint64_t i_src_tgt = ( ( int64_t ) i_source << 32 ) | i_target; + uint64_t i_arr = locate( i_src_tgt, src_tgt_arr, n_source * n_target ); + if ( src_tgt_arr[ i_arr ] == i_src_tgt ) + { + // printf("i_conn %lld i_source %d i_target %d i_src_tgt %lld " + // "i_arr %lld\n", i_conn, i_source, i_target, i_src_tgt, i_arr); + // (atomic)increase the number of connections for source-target couple + uint64_t pos = atomicAdd( ( unsigned long long* ) &src_tgt_conn_num[ i_arr ], 1 ); + // printf("pos %lld src_tgt_conn_cumul[i_arr] %lld\n", + // pos, src_tgt_conn_cumul[i_arr]); + conn_ids[ src_tgt_conn_cumul[ i_arr ] + pos ] = i_conn; + } 
  }
}

//////////////////////////////////////////////////////////////////////
// CUDA Kernel that gets all parameters of an array of n_conn connections,
// identified by the indexes conn_ids[i], and puts them in the arrays
// i_source, i_target, port, syn_group, delay, weight
//////////////////////////////////////////////////////////////////////
template < class ConnKeyT, class ConnStructT >
__global__ void
getConnectionStatusKernel( int64_t* conn_ids,
  int64_t n_conn,
  inode_t* source,
  inode_t* target,
  int* port,
  int* syn_group,
  float* delay,
  float* weight )
{
  int64_t i_arr = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x;
  if ( i_arr >= n_conn )
  {
    return;
  }

  // get connection index, connection block index and index within block
  int64_t i_conn = conn_ids[ i_arr ];
  int i_block = ( int ) ( i_conn / ConnBlockSize );
  int64_t i_block_conn = i_conn % ConnBlockSize;
  // get connection structure
  ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ];
  ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ];
  // Get source, target, port, synaptic group and delay
  inode_t i_source = getConnSource< ConnKeyT >( conn_key );
  inode_t i_target = getConnTarget< ConnStructT >( conn_struct );
  int i_port = getConnPort< ConnKeyT, ConnStructT >( conn_key, conn_struct );
  int i_syn_group = getConnSyn< ConnKeyT, ConnStructT >( conn_key, conn_struct );
  int i_delay = getConnDelay< ConnKeyT >( conn_key );
  source[ i_arr ] = i_source;
  target[ i_arr ] = i_target;
  port[ i_arr ] = i_port;
  // Get weight and synapse group
  weight[ i_arr ] = conn_struct.weight;
  syn_group[ i_arr ] = i_syn_group;
  // delay is stored in time steps; convert back to simulation time units
  delay[ i_arr ] = NESTGPUTimeResolution * i_delay;
}

//////////////////////////////////////////////////////////////////////
// CUDA Kernel that gets a float parameter of an array of n_conn connections,
// identified by the indexes conn_ids[i], and puts it in the array
// param_arr
//////////////////////////////////////////////////////////////////////
template < class ConnKeyT, class ConnStructT >
__global__ void
getConnectionFloatParamKernel( int64_t* conn_ids, int64_t n_conn, float* param_arr, int i_param )
{
  int64_t i_arr = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x;
  if ( i_arr >= n_conn )
  {
    return;
  }

  // get connection index, connection block index and index within block
  int64_t i_conn = conn_ids[ i_arr ];
  int i_block = ( int ) ( i_conn / ConnBlockSize );
  int64_t i_block_conn = i_conn % ConnBlockSize;
  // get connection structure
  ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ];
  ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ];
  // i_param is one of ConnectionFloatParamIndexes; unknown values are ignored
  switch ( i_param )
  {
  case i_weight_param:
  {
    param_arr[ i_arr ] = conn_struct.weight;
    break;
  }
  case i_delay_param:
  {
    // Get joined source-delay parameter, then delay
    int i_delay = getConnDelay< ConnKeyT >( conn_key );
    param_arr[ i_arr ] = NESTGPUTimeResolution * i_delay;
    break;
  }
  }
}

template < class ConnKeyT, class ConnStructT >
//////////////////////////////////////////////////////////////////////
// CUDA Kernel that gets an integer parameter of an array of n_conn connections,
// identified by the indexes conn_ids[i], and puts it in the array
// param_arr
//////////////////////////////////////////////////////////////////////
__global__ void
getConnectionIntParamKernel( int64_t* conn_ids, int64_t n_conn, int* param_arr, int i_param )
{
  int64_t i_arr = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x;
  if ( i_arr >= n_conn )
  {
    return;
  }

  // get connection index, connection block index and index within block
  int64_t i_conn = conn_ids[ i_arr ];
  int i_block = ( int ) ( i_conn / ConnBlockSize );
  int64_t i_block_conn = i_conn % ConnBlockSize;
  // get connection structure
  ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ];
  ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ];
  // i_param is one of ConnectionIntParamIndexes; unknown values are ignored
  switch ( i_param )
  {
  case i_source_param:
  {
    inode_t i_source = getConnSource< ConnKeyT >( conn_key );
    param_arr[ i_arr ] = i_source;
    break;
  }
  case i_target_param:
  {
    inode_t i_target = getConnTarget< ConnStructT >( conn_struct );
    param_arr[ i_arr ] = i_target;
    break;
  }
  case i_port_param:
  {
    int i_port = getConnPort< ConnKeyT, ConnStructT >( conn_key, conn_struct );
    param_arr[ i_arr ] = i_port;
    break;
  }
  case i_syn_group_param:
  {
    // Get synapse group
    int i_syn_group = getConnSyn< ConnKeyT, ConnStructT >( conn_key, conn_struct );
    param_arr[ i_arr ] = i_syn_group;
    break;
  }
  }
}

template < class ConnStructT >
//////////////////////////////////////////////////////////////////////
// CUDA Kernel that sets a float parameter of an array of n_conn connections,
// identified by the indexes conn_ids[i], using values from the array
// param_arr
//////////////////////////////////////////////////////////////////////
__global__ void
setConnectionFloatParamKernel( int64_t* conn_ids, int64_t n_conn, float* param_arr, int i_param )
{
  int64_t i_arr = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x;
  if ( i_arr >= n_conn )
  {
    return;
  }

  // get connection index, connection block index and index within block
  int64_t i_conn = conn_ids[ i_arr ];
  int i_block = ( int ) ( i_conn / ConnBlockSize );
  int64_t i_block_conn = i_conn % ConnBlockSize;
  // get connection structure
  ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ];
  // only weight is settable here; delay is packed in the key and immutable
  switch ( i_param )
  {
  case i_weight_param:
  {
    conn_struct.weight = param_arr[ i_arr ];
    break;
  }
  }
}

template < class ConnStructT >
//////////////////////////////////////////////////////////////////////
// CUDA Kernel that sets a float parameter of an array of n_conn connections,
// identified by the indexes conn_ids[i],
to the value val +////////////////////////////////////////////////////////////////////// +__global__ void +setConnectionFloatParamKernel( int64_t* conn_ids, int64_t n_conn, float val, int i_param ) +{ + int64_t i_arr = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x; + if ( i_arr >= n_conn ) + { + return; + } + + // get connection index, connection block index and index within block + int64_t i_conn = conn_ids[ i_arr ]; + int i_block = ( int ) ( i_conn / ConnBlockSize ); + int64_t i_block_conn = i_conn % ConnBlockSize; + // get connection structure + ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ]; + switch ( i_param ) + { + case i_weight_param: + { + conn_struct.weight = val; + break; + } + } +} + +template < class ConnKeyT, class ConnStructT > +////////////////////////////////////////////////////////////////////// +// CUDA Kernel that sets an integer parameter of an array of n_conn connections, +// identified by the indexes conn_ids[i], using values from the array +// param_arr +////////////////////////////////////////////////////////////////////// +__global__ void +setConnectionIntParamKernel( int64_t* conn_ids, int64_t n_conn, int* param_arr, int i_param ) +{ + int64_t i_arr = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x; + if ( i_arr >= n_conn ) + { + return; + } + + // get connection index, connection block index and index within block + int64_t i_conn = conn_ids[ i_arr ]; + int i_block = ( int ) ( i_conn / ConnBlockSize ); + int64_t i_block_conn = i_conn % ConnBlockSize; + // get connection structure + ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ]; + ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ]; + switch ( i_param ) + { + case i_target_param: + { + setConnTarget< ConnStructT >( conn_struct, param_arr[ i_arr ] ); + break; + } + case i_port_param: + { + setConnPort< ConnKeyT, ConnStructT >( conn_key, conn_struct, param_arr[ i_arr 
] ); + break; + } + case i_syn_group_param: + { + setConnSyn< ConnKeyT, ConnStructT >( conn_key, conn_struct, param_arr[ i_arr ] ); + break; + } + } +} + +////////////////////////////////////////////////////////////////////// +// CUDA Kernel that sets an integer parameter of an array of n_conn connections, +// identified by the indexes conn_ids[i], to the value val +////////////////////////////////////////////////////////////////////// +template < class ConnKeyT, class ConnStructT > +__global__ void +setConnectionIntParamKernel( int64_t* conn_ids, int64_t n_conn, int val, int i_param ) +{ + int64_t i_arr = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x; + if ( i_arr >= n_conn ) + { + return; + } + + // get connection index, connection block index and index within block + int64_t i_conn = conn_ids[ i_arr ]; + int i_block = ( int ) ( i_conn / ConnBlockSize ); + int64_t i_block_conn = i_conn % ConnBlockSize; + // get connection structure + ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ]; + ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ]; + switch ( i_param ) + { + case i_target_param: + { + setConnTarget< ConnStructT >( conn_struct, val ); + break; + } + case i_port_param: + { + setConnPort< ConnKeyT, ConnStructT >( conn_key, conn_struct, val ); + break; + } + case i_syn_group_param: + { + setConnSyn< ConnKeyT, ConnStructT >( conn_key, conn_struct, val ); + break; + } + } +} + +/* +// max delay functor +struct MaxDelay +{ + template + __device__ __forceinline__ + //uint operator()(const uint &source_delay_a, const uint &source_delay_b) + //const { + ConnKeyT operator()(const ConnKeyT &conn_key_a, + const ConnKeyT &conn_key_b) const { + int i_delay_a = getConnDelay(conn_key_a); + int i_delay_b = getConnDelay(conn_key_b); + return (i_delay_b > i_delay_a) ? 
i_delay_b : i_delay_a; + } +}; +*/ + +// max delay functor +template < class ConnKeyT > +struct MaxDelay +{ + __device__ __forceinline__ + // uint operator()(const uint &source_delay_a, const uint &source_delay_b) + // const { + ConnKeyT + operator()( const ConnKeyT& conn_key_a, const ConnKeyT& conn_key_b ) const + { + int i_delay_a = getConnDelay< ConnKeyT >( conn_key_a ); + int i_delay_b = getConnDelay< ConnKeyT >( conn_key_b ); + // printf("conn_key_a: %lu\tconn_key_b: %lu\ti_delay_a: %d\ti_delay_b: + // %d\n", + // conn_key_a, conn_key_b, i_delay_a, i_delay_b); + // return (i_delay_b > i_delay_a) ? i_delay_b : i_delay_a; + + return ( i_delay_b > i_delay_a ) ? conn_key_b : conn_key_a; + } +}; + +template < class ConnKeyT > +__global__ void +poissGenSubstractFirstNodeIndexKernel( int64_t n_conn, ConnKeyT* poiss_key_array, int i_node_0 ) +{ + int64_t blockId = ( int64_t ) blockIdx.y * gridDim.x + blockIdx.x; + int64_t i_conn_rel = blockId * blockDim.x + threadIdx.x; + if ( i_conn_rel >= n_conn ) + { + return; + } + ConnKeyT& conn_key = poiss_key_array[ i_conn_rel ]; + int i_source_rel = getConnSource< ConnKeyT >( conn_key ) - i_node_0; + setConnSource< ConnKeyT >( conn_key, i_source_rel ); +} + +template < class ConnKeyT, class ConnStructT > +__global__ void +sendDirectSpikeKernel( curandState* curand_state, + long long time_idx, + float* mu_arr, + ConnKeyT* poiss_key_array, + int64_t n_conn, + int64_t i_conn_0, + int64_t block_size, + int n_node, + int max_delay ) +{ + int64_t blockId = ( int64_t ) blockIdx.y * gridDim.x + blockIdx.x; + int64_t i_conn_rel = blockId * blockDim.x + threadIdx.x; + if ( i_conn_rel >= n_conn ) + { + return; + } + ConnKeyT& conn_key = poiss_key_array[ i_conn_rel ]; + int i_source = getConnSource< ConnKeyT >( conn_key ); + int i_delay = getConnDelay< ConnKeyT >( conn_key ); + int id = ( int ) ( ( time_idx - i_delay + 1 ) % max_delay ); + + if ( id < 0 ) + { + return; + } + + float mu = mu_arr[ id * n_node + i_source ]; + int n = 
curand_poisson( curand_state + i_conn_rel, mu ); + if ( n > 0 ) + { + int64_t i_conn = i_conn_0 + i_conn_rel; + int i_block = ( int ) ( i_conn / block_size ); + int64_t i_block_conn = i_conn % block_size; + ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ]; + + int i_target = getConnTarget< ConnStructT >( conn_struct ); + int port = getConnPort< ConnKeyT, ConnStructT >( conn_key, conn_struct ); + float weight = conn_struct.weight; + + int i_group = NodeGroupMap[ i_target ]; + int i = port * NodeGroupArray[ i_group ].n_node_ + i_target - NodeGroupArray[ i_group ].i_node_0_; + double d_val = ( double ) ( weight * n ); + atomicAddDouble( &NodeGroupArray[ i_group ].get_spike_array_[ i ], d_val ); + } +} +// Count number of reverse connections per target node +template < class ConnKeyT, class ConnStructT > +__global__ void +countRevConnectionsKernel( int64_t n_conn, int64_t* target_rev_connection_size_64 ) +{ + int64_t i_conn = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x; + if ( i_conn >= n_conn ) + { + return; + } + + uint i_block = ( uint ) ( i_conn / ConnBlockSize ); + int64_t i_block_conn = i_conn % ConnBlockSize; + ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ]; + ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ]; + + // TO BE IMPROVED BY CHECKING IF THE SYNAPSE TYPE OF THE GROUP + // REQUIRES REVERSE CONNECTION + // - Check syn_group of all connections. 
+ // - If syn_group>0 must create a reverse connection: + uint syn_group = getConnSyn< ConnKeyT, ConnStructT >( conn_key, conn_struct ); + if ( syn_group > 0 ) + { + // First get target node index + uint i_target = getConnTarget< ConnStructT >( conn_struct ); + // (atomic)increase the number of reverse connections for target + atomicAdd( ( unsigned long long* ) &target_rev_connection_size_64[ i_target ], 1 ); + } +} + +// Fill array of reverse connection indexes +template < class ConnKeyT, class ConnStructT > +__global__ void +setRevConnectionsIndexKernel( int64_t n_conn, int* target_rev_connection_size, int64_t** target_rev_connection ) +{ + int64_t i_conn = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x; + if ( i_conn >= n_conn ) + { + return; + } + + uint i_block = ( uint ) ( i_conn / ConnBlockSize ); + int64_t i_block_conn = i_conn % ConnBlockSize; + ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ]; + ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ]; + + // TO BE IMPROVED BY CHECKING IF THE SYNAPSE TYPE OF THE GROUP + // REQUIRES REVERSE CONNECTION + // - Check syn_group of all connections. 
+ // - If syn_group>0 must create a reverse connection: + uint syn_group = getConnSyn< ConnKeyT, ConnStructT >( conn_key, conn_struct ); + if ( syn_group > 0 ) + { + // First get target node index + uint i_target = getConnTarget< ConnStructT >( conn_struct ); + // (atomic)increase the number of reverse connections for target + int pos = atomicAdd( &target_rev_connection_size[ i_target ], 1 ); + // Evaluate the pointer to the rev connection position in the + // array of reverse connection indexes + int64_t* rev_conn_pt = target_rev_connection[ i_target ] + pos; + // Fill it with the connection index + *rev_conn_pt = i_conn; + } +} + +__global__ void revConnectionInitKernel( int64_t* rev_conn, int* target_rev_conn_size, int64_t** target_rev_conn ); + +__global__ void setConnectionSpikeTime( unsigned int n_conn, unsigned short time_idx ); + +__global__ void +deviceRevSpikeInit( unsigned int* rev_spike_num, unsigned int* rev_spike_target, int* rev_spike_n_conn ); + +__global__ void setTargetRevConnectionsPtKernel( int n_spike_buffer, + int64_t* target_rev_connection_cumul, + int64_t** target_rev_connection, + int64_t* rev_connections ); + +__global__ void resetConnectionSpikeTimeUpKernel( unsigned int n_conn ); + +__global__ void resetConnectionSpikeTimeDownKernel( unsigned int n_conn ); + +__global__ void connectCalibrateKernel( iconngroup_t* conn_group_idx0, + int64_t* conn_group_iconn0, + int* conn_group_delay, + int64_t block_size, + void* conn_key_array, + void* conn_struct_array, + unsigned short* conn_spike_time ); + +// template +// ConnectionTemplate::ConnectionTemplate() +//{ +// init(); +// } -struct connection_struct +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::init() { - int target_port_syn; - float weight; - // unsigned char syn_group; -}; + ///////////////////////////////////////////////// + // member variables initialization + distribution_ = NULL; -extern uint h_MaxNodeNBits; -extern __device__ uint 
MaxNodeNBits; + conn_block_size_ = 10000000; -extern uint h_MaxPortSynNBits; -extern __device__ uint MaxPortSynNBits; + n_conn_ = 0; -extern uint h_MaxSynNBits; -extern __device__ uint MaxSynNBits; + d_conn_storage_ = NULL; -extern uint h_PortSynMask; -extern __device__ uint PortSynMask; + time_resolution_ = 0.1; -extern uint h_SynMask; -extern __device__ uint SynMask; + d_conn_group_idx0_ = NULL; -extern uint *d_ConnGroupIdx0; -extern __device__ uint *ConnGroupIdx0; + d_conn_group_iconn0_ = NULL; -extern int64_t *d_ConnGroupIConn0; -extern __device__ int64_t *ConnGroupIConn0; + d_conn_group_delay_ = NULL; -//extern uint *d_ConnGroupDelay; -extern __device__ uint *ConnGroupDelay; + tot_conn_group_num_ = 0; -extern uint tot_conn_group_num; + max_delay_num_ = 0; -extern int64_t NConn; + d_conn_key_array_ = NULL; -extern int64_t h_ConnBlockSize; -extern __device__ int64_t ConnBlockSize; + d_conn_struct_array_ = NULL; -extern uint h_MaxDelayNum; + d_conn_source_ids_ = NULL; -// it seems that there is no relevant advantage in using a constant array -// however better to keep this option ready and commented -extern std::vector KeySubarray; -extern uint** d_SourceDelayArray; -extern __device__ uint** SourceDelayArray; -//extern __constant__ uint* SourceDelayArray[]; + conn_source_ids_size_ = 0; -extern std::vector ConnectionSubarray; -extern connection_struct** d_ConnectionArray; -extern __device__ connection_struct** ConnectionArray; -//extern __constant__ connection_struct* ConnectionArray[]; + ////////////////////////////////////////////////// + // Remote-connection-related member variables + ////////////////////////////////////////////////// + this_host_ = 0; -int setMaxNodeNBits(int max_node_nbits); + n_hosts_ = 1; -int setMaxSynNBits(int max_syn_nbits); + n_image_nodes_ = 0; -int allocateNewBlocks(std::vector &key_subarray, - std::vector &conn_subarray, - int64_t block_size, uint new_n_block); + // The arrays that map remote source nodes to local spike buffers + // 
are organized in blocks having block size: + node_map_block_size_ = 100000; -int freeConnectionKey(std::vector &key_subarray); + // number of elements in the map for each source host + // n_remote_source_node_map[i_source_host] + // with i_source_host = 0, ..., n_hosts-1 excluding this host itself + d_n_remote_source_node_map_ = NULL; -int setConnectionWeights(curandGenerator_t &gen, void *d_storage, - connection_struct *conn_subarray, int64_t n_conn, - SynSpec &syn_spec); + d_local_spike_buffer_map_ = NULL; -int setConnectionDelays(curandGenerator_t &gen, void *d_storage, - uint *key_subarray, int64_t n_conn, - SynSpec &syn_spec, float time_resolution); + // Arrays that map local source nodes to remote spike buffers + // number of elements in the map for each target host + // n_local_source_node_map[i_target_host] + // with i_target_host = 0, ..., n_hosts-1 excluding this host itself + d_n_local_source_node_map_ = NULL; -__global__ void setPort(connection_struct *conn_subarray, uint port, - int64_t n_conn); + // local_source_node_map[i_target_host][i_block][i] + d_local_source_node_map_ = NULL; -__global__ void setSynGroup(connection_struct *conn_subarray, - unsigned char syn_group, int64_t n_conn); + // number of remote target hosts on which each local node + // has outgoing connections + d_n_target_hosts_ = NULL; // [n_nodes] + // target hosts for the node i_node + d_node_target_hosts_ = NULL; // [i_node] + // target host map indexes for the node i_node + d_node_target_host_i_map_ = NULL; // [i_node] -int organizeConnections(float time_resolution, uint n_node, int64_t n_conn, - int64_t block_size, - std::vector &key_subarray, - std::vector &conn_subarray); + // Boolean array with one boolean value for each connection rule + // - true if the rule always creates at least one outgoing connection + // from each source node (one_to_one, all_to_all, fixed_outdegree) + // - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli) + use_all_source_nodes_ 
= NULL; // [n_connection_rules]: + ////////////////////////////////////////////////// + // reverse-connection-related member variables + ////////////////////////////////////////////////// + rev_conn_flag_ = false; + spike_time_flag_ = false; + d_conn_spike_time_ = NULL; -int ConnectInit(); + n_rev_conn_ = 0; + d_rev_spike_num_ = NULL; + d_rev_spike_target_ = NULL; + d_rev_spike_n_conn_ = NULL; + d_rev_conn_ = NULL; //[i] i=0,..., n_rev_conn_ - 1; + d_target_rev_conn_size_ = NULL; //[i] i=0,..., n_neuron-1; + d_target_rev_conn_ = NULL; //[i][j] j=0,...,rev_conn_size_[i]-1 -__device__ __forceinline__ -uint GetNodeIndex(int i_node_0, int i_node_rel) + initConnRandomGenerator(); + + return 0; +} + +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::calibrate() { - return i_node_0 + i_node_rel; + if ( conn_source_ids_size_ > 0 && d_conn_source_ids_ != NULL ) + { + CUDAFREECTRL( "d_conn_source_ids_", d_conn_source_ids_ ); + } + + if ( spike_time_flag_ ) + { + CUDAMALLOCCTRL( "&d_conn_spike_time_", &d_conn_spike_time_, n_conn_ * sizeof( unsigned short ) ); + } + + connectCalibrateKernel<<< 1, 1 >>>( d_conn_group_idx0_, + d_conn_group_iconn0_, + d_conn_group_delay_, + conn_block_size_, + d_conn_key_array_, + d_conn_struct_array_, + d_conn_spike_time_ ); + DBGCUDASYNC; + + return 0; } -__device__ __forceinline__ -uint GetNodeIndex(int *i_node_0, int i_node_rel) +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::allocateNewBlocks( int new_n_block ) { - return *(i_node_0 + i_node_rel); + // Allocating GPU memory for new connection blocks + // allocate new blocks if needed + for ( int ib = conn_key_vect_.size(); ib < new_n_block; ib++ ) + { + ConnKeyT* d_key_pt; + ConnStructT* d_connection_pt; + // allocate GPU memory for new blocks + CUDAMALLOCCTRL( "&d_key_pt", &d_key_pt, conn_block_size_ * sizeof( ConnKeyT ) ); + CUDAMALLOCCTRL( "&d_connection_pt", &d_connection_pt, 
conn_block_size_ * sizeof( ConnStructT ) ); + conn_key_vect_.push_back( d_key_pt ); + conn_struct_vect_.push_back( d_connection_pt ); + } + + return 0; } -template -__global__ void setSource(uint *key_subarray, uint *rand_val, - int64_t n_conn, T source, uint n_source) +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::freeConnectionKey() { - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) return; - key_subarray[i_conn] = GetNodeIndex(source, rand_val[i_conn]%n_source); + for ( uint ib = 0; ib < conn_key_vect_.size(); ib++ ) + { + ConnKeyT* d_key_pt = conn_key_vect_[ ib ]; + if ( d_key_pt != NULL ) + { + CUDAFREECTRL( "d_key_pt", d_key_pt ); + } + } + return 0; } -template -__global__ void setTarget(connection_struct *conn_subarray, uint *rand_val, - int64_t n_conn, T target, uint n_target) +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setConnectionWeights( curandGenerator_t& gen, + void* d_storage, + ConnStructT* conn_struct_subarray, + int64_t n_conn, + SynSpec& syn_spec ) { - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) return; - conn_subarray[i_conn].target_port_syn = - GetNodeIndex(target, rand_val[i_conn]%n_target); + if ( syn_spec.weight_distr_ >= DISTR_TYPE_ARRAY // probability distribution + && syn_spec.weight_distr_ < N_DISTR_TYPE ) + { // or array + if ( syn_spec.weight_distr_ == DISTR_TYPE_ARRAY ) + { + gpuErrchk( + cudaMemcpy( d_storage, syn_spec.weight_h_array_pt_, n_conn * sizeof( float ), cudaMemcpyHostToDevice ) ); + } + else if ( syn_spec.weight_distr_ == DISTR_TYPE_NORMAL_CLIPPED ) + { + CURAND_CALL( curandGenerateUniform( gen, ( float* ) d_storage, n_conn ) ); + randomNormalClipped( ( float* ) d_storage, + n_conn, + syn_spec.weight_mu_, + syn_spec.weight_sigma_, + syn_spec.weight_low_, + syn_spec.weight_high_ ); + } + else if ( syn_spec.weight_distr_ == DISTR_TYPE_NORMAL ) + { + float 
low = syn_spec.weight_mu_ - 5.0 * syn_spec.weight_sigma_; + float high = syn_spec.weight_mu_ + 5.0 * syn_spec.weight_sigma_; + CURAND_CALL( curandGenerateUniform( gen, ( float* ) d_storage, n_conn ) ); + randomNormalClipped( ( float* ) d_storage, n_conn, syn_spec.weight_mu_, syn_spec.weight_sigma_, low, high ); + } + else + { + throw ngpu_exception( "Invalid connection weight distribution type" ); + } + setWeights< ConnStructT > <<< ( n_conn + 1023 ) / 1024, 1024 >>>( + conn_struct_subarray, ( float* ) d_storage, n_conn ); + DBGCUDASYNC; + } + else + { + setWeights< ConnStructT > <<< ( n_conn + 1023 ) / 1024, 1024 >>>( conn_struct_subarray, syn_spec.weight_, n_conn ); + DBGCUDASYNC; + } + + return 0; } -template -__global__ void setOneToOneSourceTarget(uint *key_subarray, - connection_struct *conn_subarray, - int64_t n_block_conn, - int64_t n_prev_conn, - T1 source, T2 target) +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setConnectionDelays( curandGenerator_t& gen, + void* d_storage, + ConnKeyT* conn_key_subarray, + int64_t n_conn, + SynSpec& syn_spec ) { - int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_block_conn>=n_block_conn) return; - int64_t i_conn = n_prev_conn + i_block_conn; - uint i_source = GetNodeIndex(source, (int)(i_conn)); - uint i_target = GetNodeIndex(target, (int)(i_conn)); - key_subarray[i_block_conn] = i_source; - conn_subarray[i_block_conn].target_port_syn = i_target; + if ( syn_spec.delay_distr_ >= DISTR_TYPE_ARRAY // probability distribution + && syn_spec.delay_distr_ < N_DISTR_TYPE ) + { // or array + if ( syn_spec.delay_distr_ == DISTR_TYPE_ARRAY ) + { + gpuErrchk( + cudaMemcpy( d_storage, syn_spec.delay_h_array_pt_, n_conn * sizeof( float ), cudaMemcpyHostToDevice ) ); + } + else if ( syn_spec.delay_distr_ == DISTR_TYPE_NORMAL_CLIPPED ) + { + CURAND_CALL( curandGenerateUniform( gen, ( float* ) d_storage, n_conn ) ); + randomNormalClipped( ( float* ) d_storage, + 
n_conn, + syn_spec.delay_mu_, + syn_spec.delay_sigma_, + syn_spec.delay_low_, + syn_spec.delay_high_ ); + } + else if ( syn_spec.delay_distr_ == DISTR_TYPE_NORMAL ) + { + float low = syn_spec.delay_mu_ - 5.0 * syn_spec.delay_sigma_; + float high = syn_spec.delay_mu_ + 5.0 * syn_spec.delay_sigma_; + CURAND_CALL( curandGenerateUniform( gen, ( float* ) d_storage, n_conn ) ); + randomNormalClipped( ( float* ) d_storage, + n_conn, + syn_spec.delay_mu_, + syn_spec.delay_sigma_, + syn_spec.delay_low_, + syn_spec.delay_high_ ); + } + else + { + throw ngpu_exception( "Invalid connection delay distribution type" ); + } + + setDelays< ConnKeyT > <<< ( n_conn + 1023 ) / 1024, 1024 >>>( + conn_key_subarray, ( float* ) d_storage, n_conn, time_resolution_ ); + DBGCUDASYNC; + } + else + { + setDelays< ConnKeyT > <<< ( n_conn + 1023 ) / 1024, 1024 >>>( + conn_key_subarray, syn_spec.delay_, n_conn, time_resolution_ ); + DBGCUDASYNC; + } + return 0; } -template -__global__ void setAllToAllSourceTarget(uint *key_subarray, - connection_struct *conn_subarray, - int64_t n_block_conn, - int64_t n_prev_conn, - T1 source, uint n_source, - T2 target, uint n_target) +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::organizeConnections( inode_t n_node ) { - int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_block_conn>=n_block_conn) return; - int64_t i_conn = n_prev_conn + i_block_conn; - uint i_source = GetNodeIndex(source, (int)(i_conn / n_target)); - uint i_target = GetNodeIndex(target, (int)(i_conn % n_target)); - key_subarray[i_block_conn] = i_source; - conn_subarray[i_block_conn].target_port_syn = i_target; + timeval startTV; + timeval endTV; + CUDASYNC; + gettimeofday( &startTV, NULL ); + + if ( d_conn_storage_ != NULL ) + { + CUDAFREECTRL( "d_conn_storage_", d_conn_storage_ ); + } + + if ( n_conn_ > 0 ) + { + printf( "Allocating auxiliary GPU memory...\n" ); + int64_t sort_storage_bytes = 0; + void* d_sort_storage = 
NULL; + copass_sort::sort< ConnKeyT, ConnStructT >( + conn_key_vect_.data(), conn_struct_vect_.data(), n_conn_, conn_block_size_, d_sort_storage, sort_storage_bytes ); + printf( "storage bytes: %ld\n", sort_storage_bytes ); + CUDAMALLOCCTRL( "&d_sort_storage", &d_sort_storage, sort_storage_bytes ); + + printf( "Sorting...\n" ); + copass_sort::sort< ConnKeyT, ConnStructT >( + conn_key_vect_.data(), conn_struct_vect_.data(), n_conn_, conn_block_size_, d_sort_storage, sort_storage_bytes ); + CUDAFREECTRL( "d_sort_storage", d_sort_storage ); + + size_t storage_bytes = 0; + size_t storage_bytes1 = 0; + void* d_storage = NULL; + printf( "Indexing connection groups...\n" ); + // It is important to separate number of allocated blocks + // (determined by conn_key_vect_.size()) from number of blocks + // on which there are connections, which is determined by n_conn_ + // number of used connection blocks + int k = ( n_conn_ - 1 ) / conn_block_size_ + 1; + + // it seems that there is no relevant advantage in using a constant array + // however better to keep this option ready and commented + // gpuErrchk(cudaMemcpyToSymbol(ConnKeyArray, conn_key_vect_.data(), + // k*sizeof(ConnKeyT*))); + //, cudaMemcpyHostToDevice)); + // gpuErrchk(cudaMemcpyToSymbol(ConnStructArray, conn_struct_vect_.data(), + // k*sizeof(ConnStructT*))); + //, cudaMemcpyHostToDevice)); + + CUDAMALLOCCTRL( "&d_conn_key_array_", &d_conn_key_array_, k * sizeof( ConnKeyT* ) ); + gpuErrchk( + cudaMemcpy( d_conn_key_array_, conn_key_vect_.data(), k * sizeof( ConnKeyT* ), cudaMemcpyHostToDevice ) ); + + CUDAMALLOCCTRL( "&d_conn_struct_array_", &d_conn_struct_array_, k * sizeof( ConnStructT* ) ); + gpuErrchk( cudaMemcpy( + d_conn_struct_array_, conn_struct_vect_.data(), k * sizeof( ConnStructT* ), cudaMemcpyHostToDevice ) ); + + ////////////////////////////////////////////////////////////////////// + + int* d_conn_group_iconn0_mask; + CUDAMALLOCCTRL( "&d_conn_group_iconn0_mask", &d_conn_group_iconn0_mask, 
conn_block_size_ * sizeof( int ) ); + + iconngroup_t* d_conn_group_iconn0_mask_cumul; + CUDAMALLOCCTRL( "&d_conn_group_iconn0_mask_cumul", + &d_conn_group_iconn0_mask_cumul, + ( conn_block_size_ + 1 ) * sizeof( iconngroup_t ) ); + + int* d_conn_group_idx0_mask; + CUDAMALLOCCTRL( "&d_conn_group_idx0_mask", &d_conn_group_idx0_mask, conn_block_size_ * sizeof( int ) ); + + inode_t* d_conn_group_idx0_mask_cumul; + CUDAMALLOCCTRL( + "&d_conn_group_idx0_mask_cumul", &d_conn_group_idx0_mask_cumul, ( conn_block_size_ + 1 ) * sizeof( inode_t ) ); + + iconngroup_t* d_conn_group_idx0_compact; + int64_t reserve_size = n_node < conn_block_size_ ? n_node : conn_block_size_; + CUDAMALLOCCTRL( + "&d_conn_group_idx0_compact", &d_conn_group_idx0_compact, ( reserve_size + 1 ) * sizeof( iconngroup_t ) ); + + inode_t* d_conn_group_source_compact; + CUDAMALLOCCTRL( "&d_conn_group_source_compact", &d_conn_group_source_compact, reserve_size * sizeof( inode_t ) ); + + iconngroup_t* d_iconn0_offset; + CUDAMALLOCCTRL( "&d_iconn0_offset", &d_iconn0_offset, sizeof( iconngroup_t ) ); + gpuErrchk( cudaMemset( d_iconn0_offset, 0, sizeof( iconngroup_t ) ) ); + inode_t* d_idx0_offset; + CUDAMALLOCCTRL( "&d_idx0_offset", &d_idx0_offset, sizeof( inode_t ) ); + gpuErrchk( cudaMemset( d_idx0_offset, 0, sizeof( inode_t ) ) ); + + ConnKeyT* conn_key_subarray_prev = NULL; + for ( int ib = 0; ib < k; ib++ ) + { + int64_t n_block_conn = ib < ( k - 1 ) ? 
conn_block_size_ : n_conn_ - conn_block_size_ * ( k - 1 ); + gpuErrchk( cudaMemset( d_conn_group_iconn0_mask, 0, n_block_conn * sizeof( int ) ) ); + buildConnGroupIConn0Mask< ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ], conn_key_subarray_prev, n_block_conn, d_conn_group_iconn0_mask ); + CUDASYNC; + + conn_key_subarray_prev = conn_key_vect_[ ib ] + conn_block_size_ - 1; + + if ( ib == 0 ) + { + // Determine temporary device storage requirements for prefix sum + //// + cub::DeviceScan::ExclusiveSum( + NULL, storage_bytes, d_conn_group_iconn0_mask, d_conn_group_iconn0_mask_cumul, n_block_conn + 1 ); + //// + // Allocate temporary storage for prefix sum + CUDAMALLOCCTRL( "&d_storage", &d_storage, storage_bytes ); + } + // Run exclusive prefix sum + //// + cub::DeviceScan::ExclusiveSum( + d_storage, storage_bytes, d_conn_group_iconn0_mask, d_conn_group_iconn0_mask_cumul, n_block_conn + 1 ); + //// + setConnGroupNewOffset<<< 1, 1 >>>( d_iconn0_offset, d_conn_group_iconn0_mask_cumul + n_block_conn ); + + CUDASYNC; + } + gpuErrchk( cudaMemcpy( &tot_conn_group_num_, d_iconn0_offset, sizeof( iconngroup_t ), cudaMemcpyDeviceToHost ) ); + printf( "Total number of connection groups: %d\n", tot_conn_group_num_ ); + + if ( tot_conn_group_num_ > 0 ) + { + iconngroup_t* d_conn_group_num; + CUDAMALLOCCTRL( "&d_conn_group_num", &d_conn_group_num, n_node * sizeof( iconngroup_t ) ); + gpuErrchk( cudaMemset( d_conn_group_num, 0, sizeof( iconngroup_t ) ) ); + + ConnKeyT* conn_key_subarray_prev = NULL; + gpuErrchk( cudaMemset( d_iconn0_offset, 0, sizeof( iconngroup_t ) ) ); + + CUDAMALLOCCTRL( "&d_conn_group_iconn0_", &d_conn_group_iconn0_, ( tot_conn_group_num_ + 1 ) * sizeof( int64_t ) ); + + inode_t n_compact = 0; + for ( int ib = 0; ib < k; ib++ ) + { + int64_t n_block_conn = ib < ( k - 1 ) ? 
conn_block_size_ : n_conn_ - conn_block_size_ * ( k - 1 ); + gpuErrchk( cudaMemset( d_conn_group_iconn0_mask, 0, n_block_conn * sizeof( int ) ) ); + gpuErrchk( cudaMemset( d_conn_group_idx0_mask, 0, n_block_conn * sizeof( int ) ) ); + buildConnGroupMask< ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( conn_key_vect_[ ib ], + conn_key_subarray_prev, + n_block_conn, + d_conn_group_iconn0_mask, + d_conn_group_idx0_mask ); + CUDASYNC; + + conn_key_subarray_prev = conn_key_vect_[ ib ] + conn_block_size_ - 1; + + // Run exclusive prefix sum + //// + cub::DeviceScan::ExclusiveSum( + d_storage, storage_bytes, d_conn_group_iconn0_mask, d_conn_group_iconn0_mask_cumul, n_block_conn + 1 ); + DBGCUDASYNC; + cub::DeviceScan::ExclusiveSum( + d_storage, storage_bytes, d_conn_group_idx0_mask, d_conn_group_idx0_mask_cumul, n_block_conn + 1 ); + //// + + DBGCUDASYNC; + int64_t i_conn0 = conn_block_size_ * ib; + setConnGroupIConn0<<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( n_block_conn, + d_conn_group_iconn0_mask, + d_conn_group_iconn0_mask_cumul, + d_conn_group_iconn0_, + i_conn0, + d_iconn0_offset ); + CUDASYNC; + + setConnGroupIdx0Compact< ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( conn_key_vect_[ ib ], + n_block_conn, + d_conn_group_idx0_mask, + d_conn_group_iconn0_mask_cumul, + d_conn_group_idx0_mask_cumul, + d_conn_group_idx0_compact, + d_conn_group_source_compact, + d_iconn0_offset, + d_idx0_offset ); + CUDASYNC; + + inode_t n_block_compact; + gpuErrchk( cudaMemcpy( + &n_block_compact, d_conn_group_idx0_mask_cumul + n_block_conn, sizeof( inode_t ), cudaMemcpyDeviceToHost ) ); + // std::cout << "number of nodes with outgoing connections " + //"in block " << ib << ": " << n_block_compact << "\n"; + n_compact += n_block_compact; + + setConnGroupNewOffset<<< 1, 1 >>>( d_iconn0_offset, d_conn_group_iconn0_mask_cumul + n_block_conn ); + setConnGroupNewOffset<<< 1, 1 >>>( d_idx0_offset, d_conn_group_idx0_mask_cumul + n_block_conn ); + CUDASYNC; + } + 
gpuErrchk( + cudaMemcpy( d_conn_group_iconn0_ + tot_conn_group_num_, &n_conn_, sizeof( int64_t ), cudaMemcpyHostToDevice ) ); + + setConnGroupNum<<< ( n_compact + 1023 ) / 1024, 1024 >>>( + n_compact, d_conn_group_num, d_conn_group_idx0_compact, d_conn_group_source_compact ); + CUDASYNC; + + CUDAMALLOCCTRL( "&d_conn_group_idx0_", &d_conn_group_idx0_, ( n_node + 1 ) * sizeof( iconngroup_t ) ); + storage_bytes1 = 0; + + // Determine temporary device storage requirements for prefix sum + //// + cub::DeviceScan::ExclusiveSum( NULL, storage_bytes1, d_conn_group_num, d_conn_group_idx0_, n_node + 1 ); + //// + + if ( storage_bytes1 > storage_bytes ) + { + storage_bytes = storage_bytes1; + CUDAFREECTRL( "d_storage", d_storage ); + // Allocate temporary storage for prefix sum + CUDAMALLOCCTRL( "&d_storage", &d_storage, storage_bytes ); + } + // Run exclusive prefix sum + //// + cub::DeviceScan::ExclusiveSum( d_storage, storage_bytes, d_conn_group_num, d_conn_group_idx0_, n_node + 1 ); + //// + + // find maxumum number of connection groups (delays) over all neurons + int* d_max_delay_num; + CUDAMALLOCCTRL( "&d_max_delay_num", &d_max_delay_num, sizeof( int ) ); + + storage_bytes1 = 0; + // Determine temporary device storage requirements + //// + cub::DeviceReduce::Max( NULL, storage_bytes1, d_conn_group_num, d_max_delay_num, n_node ); + //// + + if ( storage_bytes1 > storage_bytes ) + { + storage_bytes = storage_bytes1; + CUDAFREECTRL( "d_storage", d_storage ); + // Allocate temporary storage for prefix sum + CUDAMALLOCCTRL( "&d_storage", &d_storage, storage_bytes ); + } + + // Run maximum search + //// + cub::DeviceReduce::Max( d_storage, storage_bytes, d_conn_group_num, d_max_delay_num, n_node ); + //// + + CUDASYNC; + gpuErrchk( cudaMemcpy( &max_delay_num_, d_max_delay_num, sizeof( int ), cudaMemcpyDeviceToHost ) ); + CUDAFREECTRL( "d_max_delay_num", d_max_delay_num ); + + printf( + "Maximum number of connection groups (delays)" + " over all nodes: %d\n", + max_delay_num_ 
); + + /////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////// + CUDAFREECTRL( "d_storage", d_storage ); // free temporary allocated storage + CUDAFREECTRL( "d_conn_group_iconn0_mask", d_conn_group_iconn0_mask ); + CUDAFREECTRL( "d_conn_group_iconn0_mask_cumul", d_conn_group_iconn0_mask_cumul ); + CUDAFREECTRL( "d_iconn0_offset", d_iconn0_offset ); + CUDAFREECTRL( "d_conn_group_idx0_mask", d_conn_group_idx0_mask ); + CUDAFREECTRL( "d_conn_group_idx0_mask_cumul", d_conn_group_idx0_mask_cumul ); + CUDAFREECTRL( "d_idx0_offset", d_idx0_offset ); + CUDAFREECTRL( "d_conn_group_idx0_compact", d_conn_group_idx0_compact ); + CUDAFREECTRL( "d_conn_group_num", d_conn_group_num ); + +#ifndef OPTIMIZE_FOR_MEMORY + CUDAMALLOCCTRL( "&d_conn_group_delay_", &d_conn_group_delay_, tot_conn_group_num_ * sizeof( int ) ); + + getConnGroupDelay< ConnKeyT > <<< ( tot_conn_group_num_ + 1023 ) / 1024, 1024 >>>( + conn_block_size_, d_conn_key_array_, d_conn_group_iconn0_, d_conn_group_delay_, tot_conn_group_num_ ); + DBGCUDASYNC; +#endif + } + else + { + throw ngpu_exception( + "Number of connections groups must be positive " + "for number of connections > 0" ); + } + } + else + { + gpuErrchk( cudaMemset( d_conn_group_idx0_, 0, ( n_node + 1 ) * sizeof( iconngroup_t ) ) ); + max_delay_num_ = 0; + } + + gettimeofday( &endTV, NULL ); + long time = + ( long ) ( ( endTV.tv_sec * 1000000.0 + endTV.tv_usec ) - ( startTV.tv_sec * 1000000.0 + startTV.tv_usec ) ); + printf( "%-40s%.2f ms\n", "Time: ", ( double ) time / 1000. 
); + printf( "Done\n" ); + + return 0; } -template -__global__ void setIndegreeTarget(connection_struct *conn_subarray, - int64_t n_block_conn, - int64_t n_prev_conn, - T target, uint indegree) +template < class ConnKeyT, class ConnStructT > +template < class T1, class T2 > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::_Connect( T1 source, + inode_t n_source, + T2 target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { - int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_block_conn>=n_block_conn) return; - int64_t i_conn = n_prev_conn + i_block_conn; - uint i_target = GetNodeIndex(target, (int)(i_conn / indegree)); - conn_subarray[i_block_conn].target_port_syn = i_target; + return _Connect( conn_random_generator_[ this_host_ ][ this_host_ ], + source, + n_source, + target, + n_target, + conn_spec, + syn_spec, + false ); } -template -__global__ void setOutdegreeSource(uint *key_subarray, - int64_t n_block_conn, - int64_t n_prev_conn, - T source, uint outdegree) +template < class ConnKeyT, class ConnStructT > +template < class T1, class T2 > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::_Connect( curandGenerator_t& gen, + T1 source, + inode_t n_source, + T2 target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec, + bool remote_source_flag ) { - int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_block_conn>=n_block_conn) return; - int64_t i_conn = n_prev_conn + i_block_conn; - uint i_source = GetNodeIndex(source, (int)(i_conn / outdegree)); - key_subarray[i_block_conn] = i_source; + if ( d_conn_storage_ == NULL ) + { + CUDAMALLOCCTRL( "&d_conn_storage_", &d_conn_storage_, conn_block_size_ * sizeof( uint ) ); + } + + //////////////////////// + // TEMPORARY, TO BE IMPROVED + if ( syn_spec.syn_group_ >= 1 ) + { + spike_time_flag_ = true; + rev_conn_flag_ = true; + } + + switch ( conn_spec.rule_ ) + { + case ONE_TO_ONE: + if ( n_source != n_target ) + { + throw ngpu_exception( + "Number of 
source and target nodes must be equal " + "for the one-to-one connection rule" ); + } + return connectOneToOne< T1, T2 >( gen, source, target, n_source, syn_spec, remote_source_flag ); + break; + + case ALL_TO_ALL: + return connectAllToAll< T1, T2 >( gen, source, n_source, target, n_target, syn_spec, remote_source_flag ); + break; + case FIXED_TOTAL_NUMBER: + return connectFixedTotalNumber< T1, T2 >( + gen, source, n_source, target, n_target, conn_spec.total_num_, syn_spec, remote_source_flag ); + break; + case FIXED_INDEGREE: + return connectFixedIndegree< T1, T2 >( + gen, source, n_source, target, n_target, conn_spec.indegree_, syn_spec, remote_source_flag ); + break; + case FIXED_OUTDEGREE: + return connectFixedOutdegree< T1, T2 >( + gen, source, n_source, target, n_target, conn_spec.outdegree_, syn_spec, remote_source_flag ); + break; + default: + throw ngpu_exception( "Unknown connection rule" ); + } + return 0; } -template -int connect_one_to_one(curandGenerator_t &gen, - void *d_storage, float time_resolution, - std::vector &key_subarray, - std::vector &conn_subarray, - int64_t &n_conn, int64_t block_size, - T1 source, T2 target, int n_node, - SynSpec &syn_spec) +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::reallocConnSourceIds( int64_t n_conn ) { - uint64_t old_n_conn = n_conn; - uint64_t n_new_conn = n_node; - n_conn += n_new_conn; // new number of connections - uint new_n_block = (uint)((n_conn + block_size - 1) / block_size); + if ( n_conn < conn_source_ids_size_ ) + { + return 0; + } + if ( conn_source_ids_size_ > 0 && d_conn_source_ids_ != NULL ) + { + CUDAFREECTRL( "d_conn_source_ids_", d_conn_source_ids_ ); + } + CUDAMALLOCCTRL( "&d_conn_source_ids_", &d_conn_source_ids_, n_conn * sizeof( inode_t ) ); + conn_source_ids_size_ = n_conn; + + return 0; +} - allocateNewBlocks(key_subarray, conn_subarray, block_size, new_n_block); +template < class ConnKeyT, class ConnStructT > +template < class T1, 
class T2 > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::connectOneToOne( curandGenerator_t& src_gen, + T1 source, + T2 target, + inode_t n_node, + SynSpec& syn_spec, + bool remote_source_flag ) +{ + int64_t old_n_conn = n_conn_; + int64_t n_new_conn = n_node; + n_conn_ += n_new_conn; // new number of connections + int new_n_block = ( int ) ( ( n_conn_ + conn_block_size_ - 1 ) / conn_block_size_ ); + if ( remote_source_flag ) + { + reallocConnSourceIds( n_new_conn ); + } + else + { + allocateNewBlocks( new_n_block ); + } - //printf("Generating connections with one-to-one rule...\n"); + // printf("Generating connections with one-to-one rule...\n"); int64_t n_prev_conn = 0; - uint ib0 = (uint)(old_n_conn / block_size); - for (uint ib=ib0; ib <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + d_conn_source_ids_, n_block_conn, n_prev_conn, source ); + DBGCUDASYNC; + } + else + { + setOneToOneSourceTarget< T1, T2, ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, source, target ); + DBGCUDASYNC; + setConnectionWeights( + local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); + DBGCUDASYNC; + setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); + DBGCUDASYNC; + // CUDASYNC; } - setOneToOneSourceTarget<<<(n_block_conn+1023)/1024, 1024>>> - (key_subarray[ib] + i_conn0, conn_subarray[ib] + i_conn0, - n_block_conn, n_prev_conn, source, target); - DBGCUDASYNC - setConnectionWeights(gen, d_storage, 
conn_subarray[ib] + i_conn0, - n_block_conn, syn_spec); - - setConnectionDelays(gen, d_storage, key_subarray[ib] + i_conn0, - n_block_conn, syn_spec, time_resolution); + n_prev_conn += n_block_conn; + } - setPort<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, syn_spec.port_, n_block_conn); - DBGCUDASYNC - setSynGroup<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, syn_spec.syn_group_, n_block_conn); - DBGCUDASYNC + return 0; +} +template < class ConnKeyT, class ConnStructT > +template < class T1, class T2 > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::connectAllToAll( curandGenerator_t& src_gen, + T1 source, + inode_t n_source, + T2 target, + inode_t n_target, + SynSpec& syn_spec, + bool remote_source_flag ) +{ + int64_t old_n_conn = n_conn_; + int64_t n_new_conn = n_source * n_target; + n_conn_ += n_new_conn; // new number of connections + int new_n_block = ( int ) ( ( n_conn_ + conn_block_size_ - 1 ) / conn_block_size_ ); + + if ( remote_source_flag ) + { + reallocConnSourceIds( n_new_conn ); + } + else + { + allocateNewBlocks( new_n_block ); + } + // printf("Generating connections with all-to-all rule...\n"); + int64_t n_prev_conn = 0; + int ib0 = ( int ) ( old_n_conn / conn_block_size_ ); + for ( int ib = ib0; ib < new_n_block; ib++ ) + { + int64_t n_block_conn; // number of connections in a block + int64_t i_conn0; // index of first connection in a block + if ( new_n_block == ib0 + 1 ) + { // all connections are in the same block + i_conn0 = old_n_conn % conn_block_size_; + n_block_conn = n_new_conn; + } + else if ( ib == ib0 ) + { // first block + i_conn0 = old_n_conn % conn_block_size_; + n_block_conn = conn_block_size_ - i_conn0; + } + else if ( ib == new_n_block - 1 ) + { // last block + i_conn0 = 0; + n_block_conn = ( n_conn_ - 1 ) % conn_block_size_ + 1; + } + else + { + i_conn0 = 0; + n_block_conn = conn_block_size_; + } + if ( remote_source_flag ) + { + setAllToAllSource< T1 > <<< ( n_block_conn + 1023 ) / 
1024, 1024 >>>( + d_conn_source_ids_, n_block_conn, n_prev_conn, source, n_source, n_target ); + DBGCUDASYNC; + } + else + { + setAllToAllSourceTarget< T1, T2, ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, + conn_struct_vect_[ ib ] + i_conn0, + n_block_conn, + n_prev_conn, + source, + n_source, + target, + n_target ); + DBGCUDASYNC; + setConnectionWeights( + local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); + DBGCUDASYNC; + setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); + DBGCUDASYNC; + } n_prev_conn += n_block_conn; } return 0; } +template < class ConnKeyT, class ConnStructT > +template < class T1, class T2 > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedTotalNumber( curandGenerator_t& src_gen, + T1 source, + inode_t n_source, + T2 target, + inode_t n_target, + int64_t total_num, + SynSpec& syn_spec, + bool remote_source_flag ) +{ + if ( total_num == 0 ) + { + return 0; + } + int64_t old_n_conn = n_conn_; + int64_t n_new_conn = total_num; + n_conn_ += n_new_conn; // new number of connections + int new_n_block = ( int ) ( ( n_conn_ + conn_block_size_ - 1 ) / conn_block_size_ ); + + if ( remote_source_flag ) + { + reallocConnSourceIds( n_new_conn ); + } + else + { + allocateNewBlocks( new_n_block ); + } + // printf("Generating connections with fixed_total_number rule...\n"); + int ib0 = ( int ) ( old_n_conn / conn_block_size_ ); + for ( int ib = ib0; ib < new_n_block; ib++ ) + { + int64_t n_block_conn; // number of 
connections in a block + int64_t i_conn0; // index of first connection in a block + if ( new_n_block == ib0 + 1 ) + { // all connections are in the same block + i_conn0 = old_n_conn % conn_block_size_; + n_block_conn = n_new_conn; + } + else if ( ib == ib0 ) + { // first block + i_conn0 = old_n_conn % conn_block_size_; + n_block_conn = conn_block_size_ - i_conn0; + } + else if ( ib == new_n_block - 1 ) + { // last block + i_conn0 = 0; + n_block_conn = ( n_conn_ - 1 ) % conn_block_size_ + 1; + } + else + { + i_conn0 = 0; + n_block_conn = conn_block_size_; + } + // generate random source index in range 0 - n_neuron + CURAND_CALL( curandGenerate( src_gen, ( uint* ) d_conn_storage_, n_block_conn ) ); + if ( remote_source_flag ) + { + setSource< T1 > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + d_conn_source_ids_, ( uint* ) d_conn_storage_, n_block_conn, source, n_source ); + DBGCUDASYNC; + } + else + { + setSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, source, n_source ); + DBGCUDASYNC; + + // generate random target index in range 0 - n_neuron + CURAND_CALL( curandGenerate( local_rnd_gen_, ( uint* ) d_conn_storage_, n_block_conn ) ); + setTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_struct_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, target, n_target ); + DBGCUDASYNC; + + setConnectionWeights( + local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); + DBGCUDASYNC; + setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, 
conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); + DBGCUDASYNC; + } + } + return 0; +} - -template -int connect_all_to_all(curandGenerator_t &gen, - void *d_storage, float time_resolution, - std::vector &key_subarray, - std::vector &conn_subarray, - int64_t &n_conn, int64_t block_size, - T1 source, int n_source, - T2 target, int n_target, - SynSpec &syn_spec) +template < class ConnKeyT, class ConnStructT > +template < class T1, class T2 > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedIndegree( curandGenerator_t& src_gen, + T1 source, + inode_t n_source, + T2 target, + inode_t n_target, + int indegree, + SynSpec& syn_spec, + bool remote_source_flag ) { - uint64_t old_n_conn = n_conn; - uint64_t n_new_conn = n_source*n_target; - n_conn += n_new_conn; // new number of connections - uint new_n_block = (uint)((n_conn + block_size - 1) / block_size); - - allocateNewBlocks(key_subarray, conn_subarray, block_size, new_n_block); + if ( indegree <= 0 ) + { + return 0; + } + int64_t old_n_conn = n_conn_; + int64_t n_new_conn = n_target * indegree; + n_conn_ += n_new_conn; // new number of connections + int new_n_block = ( int ) ( ( n_conn_ + conn_block_size_ - 1 ) / conn_block_size_ ); + + if ( remote_source_flag ) + { + reallocConnSourceIds( n_new_conn ); + } + else + { + allocateNewBlocks( new_n_block ); + } - //printf("Generating connections with all-to-all rule...\n"); + // printf("Generating connections with fixed_indegree rule...\n"); int64_t n_prev_conn = 0; - uint ib0 = (uint)(old_n_conn / block_size); - for (uint ib=ib0; ib <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + d_conn_source_ids_, ( uint* ) d_conn_storage_, n_block_conn, source, n_source ); + DBGCUDASYNC; + } + else + { + setSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, source, n_source ); + DBGCUDASYNC; + + setIndegreeTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) 
/ 1024, 1024 >>>( + conn_struct_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, target, indegree ); + DBGCUDASYNC; + + setConnectionWeights( + local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); + DBGCUDASYNC; + setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); + DBGCUDASYNC; } - - setAllToAllSourceTarget<<<(n_block_conn+1023)/1024, 1024>>> - (key_subarray[ib] + i_conn0, conn_subarray[ib] + i_conn0, - n_block_conn, n_prev_conn, source, n_source, target, n_target); - DBGCUDASYNC - setConnectionWeights(gen, d_storage, conn_subarray[ib] + i_conn0, - n_block_conn, syn_spec); - - setConnectionDelays(gen, d_storage, key_subarray[ib] + i_conn0, - n_block_conn, syn_spec, time_resolution); - - setPort<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, syn_spec.port_, n_block_conn); - DBGCUDASYNC - setSynGroup<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, syn_spec.syn_group_, n_block_conn); - DBGCUDASYNC - n_prev_conn += n_block_conn; } return 0; } - -template -int connect_fixed_total_number(curandGenerator_t &gen, - void *d_storage, float time_resolution, - std::vector &key_subarray, - std::vector &conn_subarray, - int64_t &n_conn, int64_t block_size, - int64_t total_num, T1 source, int n_source, - T2 target, int n_target, - SynSpec &syn_spec) +template < class ConnKeyT, class ConnStructT > +template < class T1, class T2 > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedOutdegree( curandGenerator_t& src_gen, + T1 source, + inode_t n_source, + T2 
target, + inode_t n_target, + int outdegree, + SynSpec& syn_spec, + bool remote_source_flag ) { - if (total_num==0) return 0; - uint64_t old_n_conn = n_conn; - uint64_t n_new_conn = total_num; - n_conn += n_new_conn; // new number of connections - uint new_n_block = (uint)((n_conn + block_size - 1) / block_size); - - allocateNewBlocks(key_subarray, conn_subarray, block_size, new_n_block); + if ( outdegree <= 0 ) + { + return 0; + } + int64_t old_n_conn = n_conn_; + int64_t n_new_conn = n_source * outdegree; + n_conn_ += n_new_conn; // new number of connections + int new_n_block = ( int ) ( ( n_conn_ + conn_block_size_ - 1 ) / conn_block_size_ ); + + if ( remote_source_flag ) + { + reallocConnSourceIds( n_new_conn ); + } + else + { + allocateNewBlocks( new_n_block ); + } - //printf("Generating connections with fixed_total_number rule...\n"); - uint ib0 = (uint)(old_n_conn / block_size); - for (uint ib=ib0; ib <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + d_conn_source_ids_, n_block_conn, n_prev_conn, source, outdegree ); + DBGCUDASYNC; + } + else + { + setOutdegreeSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, source, outdegree ); + DBGCUDASYNC; + + // generate random target index in range 0 - n_neuron + CURAND_CALL( curandGenerate( local_rnd_gen_, ( uint* ) d_conn_storage_, n_block_conn ) ); + setTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_struct_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, target, n_target ); + DBGCUDASYNC; + + setConnectionWeights( + local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, 
n_block_conn ); + DBGCUDASYNC; + setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); + DBGCUDASYNC; } - // generate random source index in range 0 - n_neuron - CURAND_CALL(curandGenerate(gen, (uint*)d_storage, n_block_conn)); - //printf("old_n_conn: %ld\n", old_n_conn); - //printf("n_new_conn: %ld\n", n_new_conn); - //printf("new_n_block: %d\n", new_n_block); - //printf("ib: %d\n", ib); - //printf("n_block_conn: %ld\n", n_block_conn); - setSource<<<(n_block_conn+1023)/1024, 1024>>> - (key_subarray[ib] + i_conn0, (uint*)d_storage, n_block_conn, - source, n_source); - DBGCUDASYNC - // generate random target index in range 0 - n_neuron - CURAND_CALL(curandGenerate(gen, (uint*)d_storage, n_block_conn)); - setTarget<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, (uint*)d_storage, n_block_conn, - target, n_target); - DBGCUDASYNC + n_prev_conn += n_block_conn; + } - setConnectionWeights(gen, d_storage, conn_subarray[ib] + i_conn0, - n_block_conn, syn_spec); + return 0; +} - setConnectionDelays(gen, d_storage, key_subarray[ib] + i_conn0, - n_block_conn, syn_spec, time_resolution); +////////////////////////////////////////////////////////////////////// +// Get the float parameter param_name of an array of n_conn connections, +// identified by the indexes conn_ids[i], and put it in the array +// h_param_arr +// NOTE: host array should be pre-allocated to store n_conn elements +////////////////////////////////////////////////////////////////////// +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::getConnectionFloatParam( int64_t* conn_ids, + int64_t n_conn, + float* h_param_arr, + std::string param_name ) +{ + // Check if param_name is a connection float parameter + int i_param = getConnectionFloatParamIndex( param_name ); + if ( i_param < 0 ) + { + throw ngpu_exception( 
std::string( "Unrecognized connection float parameter " ) + param_name ); + } + if ( n_conn > 0 ) + { + // declare pointers to arrays in device memory + int64_t* d_conn_ids; + float* d_arr; + // allocate array of connection ids in device memory + // and copy the ids from host to device array + CUDAMALLOCCTRL( "&d_conn_ids", &d_conn_ids, n_conn * sizeof( int64_t ) ); + gpuErrchk( cudaMemcpy( d_conn_ids, conn_ids, n_conn * sizeof( int64_t ), cudaMemcpyHostToDevice ) ); + + // allocate connection parameter array in device memory + CUDAMALLOCCTRL( "&d_arr", &d_arr, n_conn * sizeof( float ) ); + + // launch kernel to get connection parameters + getConnectionFloatParamKernel< ConnKeyT, ConnStructT > <<< ( n_conn + 1023 ) / 1024, 1024 >>>( + d_conn_ids, n_conn, d_arr, i_param ); + + // copy connection parameter array from device to host memory + gpuErrchk( cudaMemcpy( h_param_arr, d_arr, n_conn * sizeof( float ), cudaMemcpyDeviceToHost ) ); + // free allocated device memory + CUDAFREECTRL( "d_conn_ids", d_conn_ids ); + CUDAFREECTRL( "d_arr", d_arr ); + } - setPort<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, syn_spec.port_, n_block_conn); - DBGCUDASYNC - setSynGroup<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, syn_spec.syn_group_, n_block_conn); - DBGCUDASYNC + return 0; +} +////////////////////////////////////////////////////////////////////// +// Get the integer parameter param_name of an array of n_conn connections, +// identified by the indexes conn_ids[i], and put it in the array +// h_param_arr +// NOTE: host array should be pre-allocated to store n_conn elements +////////////////////////////////////////////////////////////////////// +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::getConnectionIntParam( int64_t* conn_ids, + int64_t n_conn, + int* h_param_arr, + std::string param_name ) +{ + // Check if param_name is a connection integer parameter + int i_param = 
getConnectionIntParamIndex( param_name ); + if ( i_param < 0 ) + { + throw ngpu_exception( std::string( "Unrecognized connection " + "integer parameter " ) + + param_name ); + } + if ( n_conn > 0 ) + { + // declare pointers to arrays in device memory + int64_t* d_conn_ids; + int* d_arr; + // allocate array of connection ids in device memory + // and copy the ids from host to device array + CUDAMALLOCCTRL( "&d_conn_ids", &d_conn_ids, n_conn * sizeof( int64_t ) ); + gpuErrchk( cudaMemcpy( d_conn_ids, conn_ids, n_conn * sizeof( int64_t ), cudaMemcpyHostToDevice ) ); + + // allocate connection parameter array in device memory + CUDAMALLOCCTRL( "&d_arr", &d_arr, n_conn * sizeof( int ) ); + + // launch kernel to get connection parameters + getConnectionIntParamKernel< ConnKeyT, ConnStructT > <<< ( n_conn + 1023 ) / 1024, 1024 >>>( + d_conn_ids, n_conn, d_arr, i_param ); + + // copy connection parameter array from device to host memory + gpuErrchk( cudaMemcpy( h_param_arr, d_arr, n_conn * sizeof( int ), cudaMemcpyDeviceToHost ) ); + // free allocated device memory + CUDAFREECTRL( "d_conn_ids", d_conn_ids ); + CUDAFREECTRL( "d_arr", d_arr ); } return 0; } -template -int connect_fixed_indegree(curandGenerator_t &gen, - void *d_storage, float time_resolution, - std::vector &key_subarray, - std::vector &conn_subarray, - int64_t &n_conn, int64_t block_size, - int indegree, T1 source, int n_source, - T2 target, int n_target, - SynSpec &syn_spec) +////////////////////////////////////////////////////////////////////// +// Set the float parameter param_name of an array of n_conn connections, +// identified by the indexes conn_ids[i], to the value val +////////////////////////////////////////////////////////////////////// +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setConnectionFloatParam( int64_t* conn_ids, + int64_t n_conn, + float val, + std::string param_name ) { - if (indegree<=0) return 0; - uint64_t old_n_conn = n_conn; 
- uint64_t n_new_conn = n_target*indegree; - n_conn += n_new_conn; // new number of connections - uint new_n_block = (uint)((n_conn + block_size - 1) / block_size); + // Check if param_name is a connection float parameter + int i_param = getConnectionFloatParamIndex( param_name ); + if ( i_param < 0 ) + { + throw ngpu_exception( std::string( "Unrecognized connection float parameter " ) + param_name ); + } + if ( i_param == i_delay_param ) + { + throw ngpu_exception( "Connection delay cannot be modified" ); + } - allocateNewBlocks(key_subarray, conn_subarray, block_size, new_n_block); + if ( n_conn > 0 ) + { + // declare pointers to arrays in device memory + int64_t* d_conn_ids; + // allocate array of connection ids in device memory + // and copy the ids from host to device array + CUDAMALLOCCTRL( "&d_conn_ids", &d_conn_ids, n_conn * sizeof( int64_t ) ); + gpuErrchk( cudaMemcpy( d_conn_ids, conn_ids, n_conn * sizeof( int64_t ), cudaMemcpyHostToDevice ) ); + + // launch kernel to set connection parameters + setConnectionFloatParamKernel< ConnStructT > <<< ( n_conn + 1023 ) / 1024, 1024 >>>( + d_conn_ids, n_conn, val, i_param ); + // free allocated device memory + CUDAFREECTRL( "d_conn_ids", d_conn_ids ); + } - //printf("Generating connections with fixed_indegree rule...\n"); - int64_t n_prev_conn = 0; - uint ib0 = (uint)(old_n_conn / block_size); - for (uint ib=ib0; ib>> - (key_subarray[ib] + i_conn0, (uint*)d_storage, n_block_conn, - source, n_source); - DBGCUDASYNC + return 0; +} - setIndegreeTarget<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, n_block_conn, n_prev_conn, - target, indegree); - DBGCUDASYNC +////////////////////////////////////////////////////////////////////// +// Set the float parameter param_name of an array of n_conn connections, +// identified by the indexes conn_ids[i], using values from a distribution +// or from an array +////////////////////////////////////////////////////////////////////// +template < class ConnKeyT, 
class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setConnectionFloatParamDistr( int64_t* conn_ids, + int64_t n_conn, + std::string param_name ) +{ + // Check if param_name is a connection float parameter + int i_param = getConnectionFloatParamIndex( param_name ); + if ( i_param < 0 ) + { + throw ngpu_exception( std::string( "Unrecognized connection float parameter " ) + param_name ); + } + if ( i_param == i_delay_param ) + { + throw ngpu_exception( "Connection delay cannot be modified" ); + } - setConnectionWeights(gen, d_storage, conn_subarray[ib] + i_conn0, - n_block_conn, syn_spec); + if ( n_conn > 0 ) + { + // declare pointers to arrays in device memory + int64_t* d_conn_ids; + // allocate array of connection ids in device memory + // and copy the ids from host to device array + CUDAMALLOCCTRL( "&d_conn_ids", &d_conn_ids, n_conn * sizeof( int64_t ) ); + gpuErrchk( cudaMemcpy( d_conn_ids, conn_ids, n_conn * sizeof( int64_t ), cudaMemcpyHostToDevice ) ); + + // get values from array or distribution + float* d_arr = distribution_->getArray( conn_random_generator_[ this_host_ ][ this_host_ ], n_conn ); + // launch kernel to set connection parameters + setConnectionFloatParamKernel< ConnStructT > <<< ( n_conn + 1023 ) / 1024, 1024 >>>( + d_conn_ids, n_conn, d_arr, i_param ); + // free allocated device memory + CUDAFREECTRL( "d_conn_ids", d_conn_ids ); + CUDAFREECTRL( "d_arr", d_arr ); + } - setConnectionDelays(gen, d_storage, key_subarray[ib] + i_conn0, - n_block_conn, syn_spec, time_resolution); + return 0; +} - setPort<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, syn_spec.port_, n_block_conn); - DBGCUDASYNC - setSynGroup<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, syn_spec.syn_group_, n_block_conn); - DBGCUDASYNC +////////////////////////////////////////////////////////////////////// +// Set the integer parameter param_name of an array of n_conn connections, +// identified by the indexes 
conn_ids[i], using the values from the array +// h_param_arr +////////////////////////////////////////////////////////////////////// +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setConnectionIntParamArr( int64_t* conn_ids, + int64_t n_conn, + int* h_param_arr, + std::string param_name ) +{ + // Check if param_name is a connection int parameter + int i_param = getConnectionIntParamIndex( param_name ); + if ( i_param < 0 ) + { + throw ngpu_exception( std::string( "Unrecognized connection int parameter " ) + param_name ); + } + if ( i_param == i_source_param ) + { + throw ngpu_exception( "Connection source node cannot be modified" ); + } - n_prev_conn += n_block_conn; + if ( n_conn > 0 ) + { + // declare pointers to arrays in device memory + int64_t* d_conn_ids; + int* d_arr; + // allocate array of connection ids in device memory + // and copy the ids from host to device array + CUDAMALLOCCTRL( "&d_conn_ids", &d_conn_ids, n_conn * sizeof( int64_t ) ); + gpuErrchk( cudaMemcpy( d_conn_ids, conn_ids, n_conn * sizeof( int64_t ), cudaMemcpyHostToDevice ) ); + + // allocate connection parameter array in device memory + CUDAMALLOCCTRL( "&d_arr", &d_arr, n_conn * sizeof( int ) ); + + // copy connection parameter array from host to device memory + gpuErrchk( cudaMemcpy( d_arr, h_param_arr, n_conn * sizeof( int ), cudaMemcpyHostToDevice ) ); + + // launch kernel to set connection parameters + setConnectionIntParamKernel< ConnKeyT, ConnStructT > <<< ( n_conn + 1023 ) / 1024, 1024 >>>( + d_conn_ids, n_conn, d_arr, i_param ); + // free allocated device memory + CUDAFREECTRL( "d_conn_ids", d_conn_ids ); + CUDAFREECTRL( "d_arr", d_arr ); } return 0; } -template -int connect_fixed_outdegree(curandGenerator_t &gen, - void *d_storage, float time_resolution, - std::vector &key_subarray, - std::vector &conn_subarray, - int64_t &n_conn, int64_t block_size, - int outdegree, T1 source, int n_source, - T2 target, int n_target, - SynSpec 
&syn_spec) +////////////////////////////////////////////////////////////////////// +// Set the int parameter param_name of an array of n_conn connections, +// identified by the indexes conn_ids[i], to the value val +////////////////////////////////////////////////////////////////////// +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setConnectionIntParam( int64_t* conn_ids, + int64_t n_conn, + int val, + std::string param_name ) { - if (outdegree<=0) return 0; - uint64_t old_n_conn = n_conn; - uint64_t n_new_conn = n_source*outdegree; - n_conn += n_new_conn; // new number of connections - uint new_n_block = (uint)((n_conn + block_size - 1) / block_size); + // Check if param_name is a connection float parameter + int i_param = getConnectionIntParamIndex( param_name ); + if ( i_param < 0 ) + { + throw ngpu_exception( std::string( "Unrecognized connection int parameter " ) + param_name ); + } + if ( i_param == i_source_param ) + { + throw ngpu_exception( "Connection source node cannot be modified" ); + } - allocateNewBlocks(key_subarray, conn_subarray, block_size, new_n_block); + if ( n_conn > 0 ) + { + // declare pointers to arrays in device memory + int64_t* d_conn_ids; + // allocate array of connection ids in device memory + // and copy the ids from host to device array + CUDAMALLOCCTRL( "&d_conn_ids", &d_conn_ids, n_conn * sizeof( int64_t ) ); + gpuErrchk( cudaMemcpy( d_conn_ids, conn_ids, n_conn * sizeof( int64_t ), cudaMemcpyHostToDevice ) ); + + // launch kernel to set connection parameters + setConnectionIntParamKernel< ConnKeyT, ConnStructT > <<< ( n_conn + 1023 ) / 1024, 1024 >>>( + d_conn_ids, n_conn, val, i_param ); + // free allocated device memory + CUDAFREECTRL( "d_conn_ids", d_conn_ids ); + } - //printf("Generating connections with fixed_outdegree rule...\n"); - int64_t n_prev_conn = 0; - uint ib0 = (uint)(old_n_conn / block_size); - for (uint ib=ib0; ib +int64_t* +ConnectionTemplate< ConnKeyT, 
ConnStructT >::getConnections( inode_t* i_source_pt, + inode_t n_source, + inode_t* i_target_pt, + inode_t n_target, + int syn_group, + int64_t* n_conn ) +{ + int64_t* h_conn_ids = NULL; + int64_t* d_conn_ids = NULL; + uint64_t n_src_tgt = ( uint64_t ) n_source * n_target; + int64_t n_conn_ids = 0; + + if ( n_src_tgt > 0 ) + { + // std::cout << "n_src_tgt " << n_src_tgt << "n_source " << n_source + // << "n_target " << n_target << "\n"; + // sort source node index array in GPU memory + inode_t* d_src_arr = sortArray( i_source_pt, n_source ); + // sort target node index array in GPU memory + inode_t* d_tgt_arr = sortArray( i_target_pt, n_target ); + // Allocate array of combined source-target indexes (src_arr x tgt_arr) + uint64_t* d_src_tgt_arr; + CUDAMALLOCCTRL( "&d_src_tgt_arr", &d_src_tgt_arr, n_src_tgt * sizeof( uint64_t ) ); + // Fill it with combined source-target indexes + setSourceTargetIndexKernel<<< ( n_src_tgt + 1023 ) / 1024, 1024 >>>( + n_src_tgt, n_source, n_target, d_src_tgt_arr, d_src_arr, d_tgt_arr ); + // Allocate array of number of connections per source-target couple + // and initialize it to 0 + uint64_t* d_src_tgt_conn_num; + CUDAMALLOCCTRL( "&d_src_tgt_conn_num", &d_src_tgt_conn_num, ( n_src_tgt + 1 ) * sizeof( uint64_t ) ); + gpuErrchk( cudaMemset( d_src_tgt_conn_num, 0, ( n_src_tgt + 1 ) * sizeof( uint64_t ) ) ); + + // Count number of connections per source-target couple + countConnectionsKernel< ConnKeyT, ConnStructT > <<< ( n_conn_ + 1023 ) / 1024, 1024 >>>( + n_conn_, n_source, n_target, d_src_tgt_arr, d_src_tgt_conn_num, syn_group ); + // Evaluate exclusive sum of connections per source-target couple + // Allocate array for cumulative sum + uint64_t* d_src_tgt_conn_cumul; + CUDAMALLOCCTRL( "&d_src_tgt_conn_cumul", &d_src_tgt_conn_cumul, ( n_src_tgt + 1 ) * sizeof( uint64_t ) ); + // Determine temporary device storage requirements + void* d_storage = NULL; + size_t storage_bytes = 0; + //// + cub::DeviceScan::ExclusiveSum( d_storage, 
storage_bytes, d_src_tgt_conn_num, d_src_tgt_conn_cumul, n_src_tgt + 1 ); + //// + + // Allocate temporary storage + CUDAMALLOCCTRL( "&d_storage", &d_storage, storage_bytes ); + // Run exclusive prefix sum + //// + cub::DeviceScan::ExclusiveSum( d_storage, storage_bytes, d_src_tgt_conn_num, d_src_tgt_conn_cumul, n_src_tgt + 1 ); + //// + + CUDAFREECTRL( "d_storage", d_storage ); + + // The last element is the total number of required connection Ids + cudaMemcpy( &n_conn_ids, &d_src_tgt_conn_cumul[ n_src_tgt ], sizeof( int64_t ), cudaMemcpyDeviceToHost ); + + if ( n_conn_ids > 0 ) + { + // Allocate array of connection indexes + CUDAMALLOCCTRL( "&d_conn_ids", &d_conn_ids, n_conn_ids * sizeof( int64_t ) ); + // Set number of connections per source-target couple to 0 again + gpuErrchk( cudaMemset( d_src_tgt_conn_num, 0, ( n_src_tgt + 1 ) * sizeof( uint64_t ) ) ); + // Fill array of connection indexes + setConnectionsIndexKernel< ConnKeyT, ConnStructT > <<< ( n_conn_ + 1023 ) / 1024, 1024 >>>( + n_conn_, n_source, n_target, d_src_tgt_arr, d_src_tgt_conn_num, d_src_tgt_conn_cumul, syn_group, d_conn_ids ); + + /// check if allocating with new is more appropriate + h_conn_ids = ( int64_t* ) malloc( n_conn_ids * sizeof( int64_t ) ); + gpuErrchk( cudaMemcpy( h_conn_ids, d_conn_ids, n_conn_ids * sizeof( int64_t ), cudaMemcpyDeviceToHost ) ); + + CUDAFREECTRL( "d_src_tgt_arr", d_src_tgt_arr ); + CUDAFREECTRL( "d_src_tgt_conn_num", d_src_tgt_conn_num ); + CUDAFREECTRL( "d_src_tgt_conn_cumul", d_src_tgt_conn_cumul ); + CUDAFREECTRL( "d_conn_ids", d_conn_ids ); } - else { - i_conn0 = 0; - n_block_conn = block_size; + } + *n_conn = n_conn_ids; + + return h_conn_ids; +} + +////////////////////////////////////////////////////////////////////// +// Get all parameters of an array of n_conn connections, identified by +// the indexes conn_ids[i], and put them in the arrays +// i_source, i_target, port, syn_group, delay, weight +// NOTE: host arrays should be pre-allocated to store 
n_conn elements +////////////////////////////////////////////////////////////////////// +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::getConnectionStatus( int64_t* conn_ids, + int64_t n_conn, + inode_t* source, + inode_t* target, + int* port, + int* syn_group, + float* delay, + float* weight ) +{ + if ( n_conn > 0 ) + { + // declare pointers to arrays in device memory + int64_t* d_conn_ids; + inode_t* d_source; + inode_t* d_target; + int* d_port; + int* d_syn_group; + float* d_delay; + float* d_weight; + + // allocate array of connection ids in device memory + // and copy the ids from host to device array + CUDAMALLOCCTRL( "&d_conn_ids", &d_conn_ids, n_conn * sizeof( int64_t ) ); + gpuErrchk( cudaMemcpy( d_conn_ids, conn_ids, n_conn * sizeof( int64_t ), cudaMemcpyHostToDevice ) ); + + // allocate arrays of connection parameters in device memory + CUDAMALLOCCTRL( "&d_source", &d_source, n_conn * sizeof( inode_t ) ); + CUDAMALLOCCTRL( "&d_target", &d_target, n_conn * sizeof( inode_t ) ); + CUDAMALLOCCTRL( "&d_port", &d_port, n_conn * sizeof( int ) ); + CUDAMALLOCCTRL( "&d_syn_group", &d_syn_group, n_conn * sizeof( int ) ); + CUDAMALLOCCTRL( "&d_delay", &d_delay, n_conn * sizeof( float ) ); + CUDAMALLOCCTRL( "&d_weight", &d_weight, n_conn * sizeof( float ) ); + // host arrays + + // launch kernel to get connection parameters + getConnectionStatusKernel< ConnKeyT, ConnStructT > <<< ( n_conn + 1023 ) / 1024, 1024 >>>( + d_conn_ids, n_conn, d_source, d_target, d_port, d_syn_group, d_delay, d_weight ); + + // copy connection parameters from device to host memory + gpuErrchk( cudaMemcpy( source, d_source, n_conn * sizeof( inode_t ), cudaMemcpyDeviceToHost ) ); + + gpuErrchk( cudaMemcpy( target, d_target, n_conn * sizeof( inode_t ), cudaMemcpyDeviceToHost ) ); + gpuErrchk( cudaMemcpy( port, d_port, n_conn * sizeof( int ), cudaMemcpyDeviceToHost ) ); + gpuErrchk( cudaMemcpy( syn_group, d_syn_group, n_conn * sizeof( int ), 
cudaMemcpyDeviceToHost ) ); + gpuErrchk( cudaMemcpy( delay, d_delay, n_conn * sizeof( float ), cudaMemcpyDeviceToHost ) ); + gpuErrchk( cudaMemcpy( weight, d_weight, n_conn * sizeof( float ), cudaMemcpyDeviceToHost ) ); + } + + return 0; +} + +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::freeConnRandomGenerator() +{ + if ( conn_random_generator_.size() > 0 ) + { + for ( int i_host = 0; i_host < n_hosts_; i_host++ ) + { + for ( int j_host = 0; j_host < n_hosts_; j_host++ ) + { + CURAND_CALL( curandDestroyGenerator( conn_random_generator_[ i_host ][ j_host ] ) ); + } } + } - setOutdegreeSource<<<(n_block_conn+1023)/1024, 1024>>> - (key_subarray[ib] + i_conn0, n_block_conn, n_prev_conn, - source, outdegree); - DBGCUDASYNC + return 0; +} - // generate random target index in range 0 - n_neuron - CURAND_CALL(curandGenerate(gen, (uint*)d_storage, n_block_conn)); - setTarget<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, (uint*)d_storage, n_block_conn, - target, n_target); - DBGCUDASYNC +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::initConnRandomGenerator() +{ + conn_random_generator_.resize( n_hosts_ ); + for ( int i_host = 0; i_host < n_hosts_; i_host++ ) + { + conn_random_generator_[ i_host ].resize( n_hosts_ ); + for ( int j_host = 0; j_host < n_hosts_; j_host++ ) + { + CURAND_CALL( curandCreateGenerator( &conn_random_generator_[ i_host ][ j_host ], CURAND_RNG_PSEUDO_DEFAULT ) ); + } + } - setConnectionWeights(gen, d_storage, conn_subarray[ib] + i_conn0, - n_block_conn, syn_spec); + local_rnd_gen_ = conn_random_generator_[ this_host_ ][ this_host_ ]; - setConnectionDelays(gen, d_storage, key_subarray[ib] + i_conn0, - n_block_conn, syn_spec, time_resolution); + return 0; +} - setPort<<<(n_block_conn+1023)/1024, 1024>>> - (conn_subarray[ib] + i_conn0, syn_spec.port_, n_block_conn); - DBGCUDASYNC - setSynGroup<<<(n_block_conn+1023)/1024, 1024>>> - 
(conn_subarray[ib] + i_conn0, syn_spec.syn_group_, n_block_conn); - DBGCUDASYNC +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setTimeResolution( float time_resolution ) +{ + time_resolution_ = time_resolution; - n_prev_conn += n_block_conn; + return 0; +} + +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setRandomSeed( unsigned long long seed ) +{ + for ( int i_host = 0; i_host < n_hosts_; i_host++ ) + { + for ( int j_host = 0; j_host < n_hosts_; j_host++ ) + { + CURAND_CALL( curandSetPseudoRandomGeneratorSeed( + conn_random_generator_[ i_host ][ j_host ], seed + conn_seed_offset_ + i_host * n_hosts_ + j_host ) ); + } } return 0; } -template -int NESTGPU::_ConnectOneToOne -(curandGenerator_t &gen, T1 source, T2 target, int n_node, SynSpec &syn_spec) +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setNHosts( int n_hosts ) { - //printf("In new specialized connection one-to-one\n"); + // free previous instances before creating new + freeConnRandomGenerator(); + n_hosts_ = n_hosts; + initConnRandomGenerator(); - void *d_storage; - CUDAMALLOCCTRL("&d_storage",&d_storage, h_ConnBlockSize*sizeof(int)); + return 0; +} - connect_one_to_one(gen, d_storage, time_resolution_, - KeySubarray, ConnectionSubarray, NConn, - h_ConnBlockSize, source, target, n_node, syn_spec); - CUDAFREECTRL("d_storage",d_storage); +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setThisHost( int this_host ) +{ + this_host_ = this_host; return 0; } -template -int NESTGPU::_ConnectAllToAll -(curandGenerator_t &gen, T1 source, int n_source, T2 target, int n_target, - SynSpec &syn_spec) +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::organizeDirectConnections( void*& d_poiss_key_array_data_pt, + void*& d_poiss_subarray, + int64_t*& d_poiss_num, + 
int64_t*& d_poiss_sum, + void*& d_poiss_thresh ) { - //printf("In new specialized connection all-to-all\n"); + int k = conn_key_vect_.size(); + ConnKeyT** conn_key_array = conn_key_vect_.data(); + + CUDAMALLOCCTRL( "&d_poiss_key_array_data_pt", &d_poiss_key_array_data_pt, k * sizeof( ConnKeyT* ) ); + gpuErrchk( cudaMemcpy( d_poiss_key_array_data_pt, conn_key_array, k * sizeof( ConnKeyT* ), cudaMemcpyHostToDevice ) ); + + regular_block_array< ConnKeyT > h_poiss_subarray[ k ]; + for ( int i = 0; i < k; i++ ) + { + h_poiss_subarray[ i ].h_data_pt = conn_key_array; + h_poiss_subarray[ i ].data_pt = ( ConnKeyT** ) d_poiss_key_array_data_pt; + h_poiss_subarray[ i ].block_size = conn_block_size_; + h_poiss_subarray[ i ].offset = i * conn_block_size_; + h_poiss_subarray[ i ].size = i < k - 1 ? conn_block_size_ : n_conn_ - ( k - 1 ) * conn_block_size_; + } - void *d_storage; - CUDAMALLOCCTRL("&d_storage",&d_storage, h_ConnBlockSize*sizeof(int)); + CUDAMALLOCCTRL( "&d_poiss_subarray", &d_poiss_subarray, k * sizeof( regular_block_array< ConnKeyT > ) ); + gpuErrchk( cudaMemcpyAsync( + d_poiss_subarray, h_poiss_subarray, k * sizeof( regular_block_array< ConnKeyT > ), cudaMemcpyHostToDevice ) ); - connect_all_to_all(gen, d_storage, time_resolution_, - KeySubarray, ConnectionSubarray, NConn, - h_ConnBlockSize, source, n_source, - target, n_target, syn_spec); - CUDAFREECTRL("d_storage",d_storage); + CUDAMALLOCCTRL( "&d_poiss_num", &d_poiss_num, 2 * k * sizeof( int64_t ) ); + CUDAMALLOCCTRL( "&d_poiss_sum", &d_poiss_sum, 2 * sizeof( int64_t ) ); + + CUDAMALLOCCTRL( "&d_poiss_thresh", &d_poiss_thresh, 2 * sizeof( ConnKeyT ) ); return 0; } -template -int NESTGPU::_ConnectFixedTotalNumber -(curandGenerator_t &gen, T1 source, int n_source, T2 target, int n_target, - int total_num, SynSpec &syn_spec) +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::buildDirectConnections( inode_t i_node_0, + inode_t n_node, + int64_t& i_conn0, + int64_t& 
n_dir_conn, + int& max_delay, + float*& d_mu_arr, + void*& d_poiss_key_array ) { - //printf("In new specialized connection fixed-total-number\n"); + int k = conn_key_vect_.size(); + + ConnKeyT** conn_key_array = ( ConnKeyT** ) conn_key_vect_.data(); + ConnKeyT h_poiss_thresh[ 2 ]; + h_poiss_thresh[ 0 ] = 0; + setConnSource( h_poiss_thresh[ 0 ], i_node_0 ); + + h_poiss_thresh[ 1 ] = 0; + setConnSource( h_poiss_thresh[ 1 ], i_node_0 + n_node ); + + gpuErrchk( cudaMemcpy( poiss_conn::d_poiss_thresh, h_poiss_thresh, 2 * sizeof( ConnKeyT ), cudaMemcpyHostToDevice ) ); + + int64_t h_poiss_num[ 2 * k ]; + int64_t* d_num0 = &poiss_conn::d_poiss_num[ 0 ]; + int64_t* d_num1 = &poiss_conn::d_poiss_num[ k ]; + int64_t* h_num0 = &h_poiss_num[ 0 ]; + int64_t* h_num1 = &h_poiss_num[ k ]; + + search_multi_down< ConnKeyT, regular_block_array< ConnKeyT >, 1024 >( + ( regular_block_array< ConnKeyT >* ) poiss_conn::d_poiss_subarray, + k, + &( ( ( ConnKeyT* ) poiss_conn::d_poiss_thresh )[ 0 ] ), + d_num0, + &poiss_conn::d_poiss_sum[ 0 ] ); + CUDASYNC; + + search_multi_down< ConnKeyT, regular_block_array< ConnKeyT >, 1024 >( + ( regular_block_array< ConnKeyT >* ) poiss_conn::d_poiss_subarray, + k, + &( ( ( ConnKeyT* ) poiss_conn::d_poiss_thresh )[ 1 ] ), + d_num1, + &poiss_conn::d_poiss_sum[ 1 ] ); + CUDASYNC; + + gpuErrchk( cudaMemcpy( h_poiss_num, poiss_conn::d_poiss_num, 2 * k * sizeof( int64_t ), cudaMemcpyDeviceToHost ) ); + + i_conn0 = 0; + int64_t i_conn1 = 0; + int ib0 = 0; + int ib1 = 0; + for ( int i = 0; i < k; i++ ) + { + if ( h_num0[ i ] < conn_block_size_ ) + { + i_conn0 = conn_block_size_ * i + h_num0[ i ]; + ib0 = i; + break; + } + } + + for ( int i = 0; i < k; i++ ) + { + if ( h_num1[ i ] < conn_block_size_ ) + { + i_conn1 = conn_block_size_ * i + h_num1[ i ]; + ib1 = i; + break; + } + } + + n_dir_conn = i_conn1 - i_conn0; + + if ( n_dir_conn > 0 ) + { + CUDAMALLOCCTRL( "&d_poiss_key_array", &d_poiss_key_array, n_dir_conn * sizeof( ConnKeyT ) ); + + int64_t offset = 0; 
+ for ( int ib = ib0; ib <= ib1; ib++ ) + { + if ( ib == ib0 && ib == ib1 ) + { + gpuErrchk( cudaMemcpy( d_poiss_key_array, + conn_key_array[ ib ] + h_num0[ ib ], + n_dir_conn * sizeof( ConnKeyT ), + cudaMemcpyDeviceToDevice ) ); + break; + } + else if ( ib == ib0 ) + { + offset = conn_block_size_ - h_num0[ ib ]; + gpuErrchk( cudaMemcpy( d_poiss_key_array, + conn_key_array[ ib ] + h_num0[ ib ], + offset * sizeof( ConnKeyT ), + cudaMemcpyDeviceToDevice ) ); + } + else if ( ib == ib1 ) + { + gpuErrchk( cudaMemcpy( ( ConnKeyT* ) d_poiss_key_array + offset, + conn_key_array[ ib ], + h_num1[ ib ] * sizeof( ConnKeyT ), + cudaMemcpyDeviceToDevice ) ); + break; + } + else + { + gpuErrchk( cudaMemcpy( ( ConnKeyT* ) d_poiss_key_array + offset, + conn_key_array[ ib ], + conn_block_size_ * sizeof( ConnKeyT ), + cudaMemcpyDeviceToDevice ) ); + offset += conn_block_size_; + } + } + + unsigned int grid_dim_x, grid_dim_y; + + if ( n_dir_conn < 65536 * 1024 ) + { // max grid dim * max block dim + grid_dim_x = ( n_dir_conn + 1023 ) / 1024; + grid_dim_y = 1; + } + else + { + grid_dim_x = 64; // I think it's not necessary to increase it + if ( n_dir_conn > grid_dim_x * 1024 * 65535 ) + { + throw ngpu_exception( std::string( "Number of direct connections " ) + std::to_string( n_dir_conn ) + + " larger than threshold " + std::to_string( grid_dim_x * 1024 * 65535 ) ); + } + grid_dim_y = ( n_dir_conn + grid_dim_x * 1024 - 1 ) / ( grid_dim_x * 1024 ); + } + dim3 numBlocks( grid_dim_x, grid_dim_y ); + poissGenSubstractFirstNodeIndexKernel< ConnKeyT > <<< numBlocks, 1024 >>>( + n_dir_conn, ( ConnKeyT* ) d_poiss_key_array, i_node_0 ); + DBGCUDASYNC + } - void *d_storage; - CUDAMALLOCCTRL("&d_storage",&d_storage, h_ConnBlockSize*sizeof(int)); + // Find maximum delay of poisson direct connections + // int *d_max_delay; // maximum delay pointer in device memory + // CUDAMALLOCCTRL("&d_max_delay",&d_max_delay, sizeof(int)); + // pointer to connection key with maximum delay in device memory + 
ConnKeyT* d_max_delay_key; + ConnKeyT h_max_delay_key; + CUDAMALLOCCTRL( "&d_max_delay_key", &d_max_delay_key, sizeof( ConnKeyT ) ); + + MaxDelay< ConnKeyT > max_op; // comparison operator used by Reduce function + // Determine temporary device storage requirements + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + ConnKeyT init_delay_key = 0; + //// + cub::DeviceReduce::Reduce( d_temp_storage, + temp_storage_bytes, + ( ConnKeyT* ) d_poiss_key_array, + d_max_delay_key, + n_dir_conn, + max_op, + init_delay_key ); + //// + + // Allocate temporary storage + CUDAMALLOCCTRL( "&d_temp_storage", &d_temp_storage, temp_storage_bytes ); + // Run reduction + //// + cub::DeviceReduce::Reduce( d_temp_storage, + temp_storage_bytes, + ( ConnKeyT* ) d_poiss_key_array, + d_max_delay_key, + n_dir_conn, + max_op, + init_delay_key ); + //// + + // gpuErrchk(cudaMemcpy(&max_delay, d_max_delay, sizeof(int), + // cudaMemcpyDeviceToHost)); + gpuErrchk( cudaMemcpy( &h_max_delay_key, d_max_delay_key, sizeof( ConnKeyT ), cudaMemcpyDeviceToHost ) ); + // std::cout << "Conn key of direct connections having max delay: " + // << h_max_delay_key << "\n"; + + max_delay = getConnDelay( h_max_delay_key ); + printf( "Max delay of direct (poisson generator) connections: %d\n", max_delay ); + CUDAMALLOCCTRL( "&d_mu_arr", &d_mu_arr, n_node * max_delay * sizeof( float ) ); + gpuErrchk( cudaMemset( d_mu_arr, 0, n_node * max_delay * sizeof( float ) ) ); + + /* + CUDAFREECTRL("d_key_array_data_pt",d_key_array_data_pt); + CUDAFREECTRL("d_subarray",d_subarray); + CUDAFREECTRL("d_num",d_num); + CUDAFREECTRL("d_sum",d_sum); + CUDAFREECTRL("d_thresh",d_thresh); + */ - connect_fixed_total_number(gen, d_storage, time_resolution_, - KeySubarray, ConnectionSubarray, NConn, - h_ConnBlockSize, total_num, source, n_source, - target, n_target, syn_spec); - CUDAFREECTRL("d_storage",d_storage); + return 0; +} + +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT 
>::sendDirectSpikes( long long time_idx, + int64_t i_conn0, + int64_t n_dir_conn, + inode_t n_node, + int max_delay, + float* d_mu_arr, + void* d_poiss_key_array, + curandState* d_curand_state ) +{ + unsigned int grid_dim_x, grid_dim_y; + + if ( n_dir_conn < 65536 * 1024 ) + { // max grid dim * max block dim + grid_dim_x = ( n_dir_conn + 1023 ) / 1024; + grid_dim_y = 1; + } + else + { + grid_dim_x = 64; // I think it's not necessary to increase it + if ( n_dir_conn > grid_dim_x * 1024 * 65535 ) + { + throw ngpu_exception( std::string( "Number of direct connections " ) + std::to_string( n_dir_conn ) + + " larger than threshold " + std::to_string( grid_dim_x * 1024 * 65535 ) ); + } + grid_dim_y = ( n_dir_conn + grid_dim_x * 1024 - 1 ) / ( grid_dim_x * 1024 ); + } + dim3 numBlocks( grid_dim_x, grid_dim_y ); + sendDirectSpikeKernel< ConnKeyT, ConnStructT > <<< numBlocks, 1024 >>>( d_curand_state, + time_idx, + d_mu_arr, + ( ConnKeyT* ) d_poiss_key_array, + n_dir_conn, + i_conn0, + conn_block_size_, + n_node, + max_delay ); + + DBGCUDASYNC return 0; } -template -int NESTGPU::_ConnectFixedIndegree -(curandGenerator_t &gen, T1 source, int n_source, T2 target, int n_target, - int indegree, SynSpec &syn_spec) +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::revSpikeInit( uint n_spike_buffers ) { - //printf("In new specialized connection fixed-indegree\n"); + // printf("n_spike_buffers: %d\n", n_spike_buffers); + + ////////////////////////////////////////////////////////////////////// + /////// Organize reverse connections (new version) + // CHECK THE GLOBAL VARIABLES THAT MUST BE CONVERTED TO 64 bit ARRAYS + ////////////////////////////////////////////////////////////////////// + // Alloc 64 bit array of number of reverse connections per target node + // and initialize it to 0 + int64_t* d_target_rev_conn_size_64; + int64_t* d_target_rev_conn_cumul; + CUDAMALLOCCTRL( + "&d_target_rev_conn_size_64", 
&d_target_rev_conn_size_64, ( n_spike_buffers + 1 ) * sizeof( int64_t ) ); + gpuErrchk( cudaMemset( d_target_rev_conn_size_64, 0, ( n_spike_buffers + 1 ) * sizeof( int64_t ) ) ); + // Count number of reverse connections per target node + countRevConnectionsKernel< ConnKeyT, ConnStructT > <<< ( n_conn_ + 1023 ) / 1024, 1024 >>>( + n_conn_, d_target_rev_conn_size_64 ); + // Evaluate exclusive sum of reverse connections per target node + // Allocate array for cumulative sum + CUDAMALLOCCTRL( "&d_target_rev_conn_cumul", &d_target_rev_conn_cumul, ( n_spike_buffers + 1 ) * sizeof( int64_t ) ); + // Determine temporary device storage requirements + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + //// + cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, d_target_rev_conn_size_64, d_target_rev_conn_cumul, n_spike_buffers + 1 ); + //// + + // Allocate temporary storage + CUDAMALLOCCTRL( "&d_temp_storage", &d_temp_storage, temp_storage_bytes ); + // Run exclusive prefix sum + //// + cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, d_target_rev_conn_size_64, d_target_rev_conn_cumul, n_spike_buffers + 1 ); + //// + + // The last element is the total number of reverse connections + gpuErrchk( cudaMemcpy( + &n_rev_conn_, &d_target_rev_conn_cumul[ n_spike_buffers ], sizeof( int64_t ), cudaMemcpyDeviceToHost ) ); + if ( n_rev_conn_ > 0 ) + { + // Allocate array of reverse connection indexes + // CHECK THAT d_RevConnections is of type int64_t array + CUDAMALLOCCTRL( "&d_rev_conn_", &d_rev_conn_, n_rev_conn_ * sizeof( int64_t ) ); + // For each target node evaluate the pointer + // to its first reverse connection using the exclusive sum + // CHECK THAT d_target_rev_conn_ is of type int64_t* pointer + CUDAMALLOCCTRL( "&d_target_rev_conn_", &d_target_rev_conn_, n_spike_buffers * sizeof( int64_t* ) ); + setTargetRevConnectionsPtKernel<<< ( n_spike_buffers + 1023 ) / 1024, 1024 >>>( + n_spike_buffers, d_target_rev_conn_cumul, 
d_target_rev_conn_, d_rev_conn_ ); + + // alloc 32 bit array of number of reverse connections per target node + CUDAMALLOCCTRL( "&d_target_rev_conn_size_", &d_target_rev_conn_size_, n_spike_buffers * sizeof( int ) ); + // and initialize it to 0 + gpuErrchk( cudaMemset( d_target_rev_conn_size_, 0, n_spike_buffers * sizeof( int ) ) ); + // Fill array of reverse connection indexes + setRevConnectionsIndexKernel< ConnKeyT, ConnStructT > <<< ( n_conn_ + 1023 ) / 1024, 1024 >>>( + n_conn_, d_target_rev_conn_size_, d_target_rev_conn_ ); + + revConnectionInitKernel<<< 1, 1 >>>( d_rev_conn_, d_target_rev_conn_size_, d_target_rev_conn_ ); + + setConnectionSpikeTime <<< ( n_conn_ + 1023 ) / 1024, 1024 >>>( n_conn_, 0x8000 ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); + + CUDAMALLOCCTRL( "&d_rev_spike_num_", &d_rev_spike_num_, sizeof( uint ) ); + + CUDAMALLOCCTRL( "&d_rev_spike_target_", &d_rev_spike_target_, n_spike_buffers * sizeof( uint ) ); + + CUDAMALLOCCTRL( "&d_rev_spike_n_conn", &d_rev_spike_n_conn_, n_spike_buffers * sizeof( int ) ); + + deviceRevSpikeInit<<< 1, 1 >>>( d_rev_spike_num_, d_rev_spike_target_, d_rev_spike_n_conn_ ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); + } + + CUDAFREECTRL( "d_temp_storage", d_temp_storage ); + CUDAFREECTRL( "d_target_rev_conn_size_64", d_target_rev_conn_size_64 ); + CUDAFREECTRL( "d_target_rev_conn_cumul", d_target_rev_conn_cumul ); - void *d_storage; - CUDAMALLOCCTRL("&d_storage",&d_storage, h_ConnBlockSize*sizeof(int)); + return 0; +} - connect_fixed_indegree(gen, d_storage, time_resolution_, - KeySubarray, ConnectionSubarray, NConn, - h_ConnBlockSize, indegree, source, n_source, - target, n_target, syn_spec); - CUDAFREECTRL("d_storage",d_storage); +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::revSpikeFree() +{ + CUDAFREECTRL( "&d_rev_spike_num_", &d_rev_spike_num_ ); + CUDAFREECTRL( "&d_rev_spike_target_", 
&d_rev_spike_target_ ); + CUDAFREECTRL( "&d_rev_spike_n_conn_", &d_rev_spike_n_conn_ ); return 0; } -template -int NESTGPU::_ConnectFixedOutdegree -(curandGenerator_t &gen, T1 source, int n_source, T2 target, int n_target, - int outdegree, SynSpec &syn_spec) +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::resetConnectionSpikeTimeUp() { - //printf("In new specialized connection fixed-outdegree\n"); + resetConnectionSpikeTimeUpKernel <<< ( n_conn_ + 1023 ) / 1024, 1024 >>>( n_conn_ ); + gpuErrchk( cudaPeekAtLastError() ); - void *d_storage; - CUDAMALLOCCTRL("&d_storage",&d_storage, h_ConnBlockSize*sizeof(int)); + return 0; +} - connect_fixed_outdegree(gen, d_storage, time_resolution_, - KeySubarray, ConnectionSubarray, NConn, - h_ConnBlockSize, outdegree, source, n_source, - target, n_target, syn_spec); - CUDAFREECTRL("d_storage",d_storage); +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::resetConnectionSpikeTimeDown() +{ + resetConnectionSpikeTimeDownKernel <<< ( n_conn_ + 1023 ) / 1024, 1024 >>>( n_conn_ ); + gpuErrchk( cudaPeekAtLastError() ); return 0; } diff --git a/src/connect_rules.cu b/src/connect_rules.cu index 5b12c6f20..f485d8e88 100644 --- a/src/connect_rules.cu +++ b/src/connect_rules.cu @@ -20,21 +20,17 @@ * */ - - - - - -#include -#include -#include "ngpu_exception.h" -//#include "connect.h" -#include "nestgpu.h" -#include "connect_rules.h" #include "connect.h" +#include "connect_rules.h" #include "distribution.h" +#include "nestgpu.h" +#include "ngpu_exception.h" +#include "remote_connect.h" +#include +#include -int ConnSpec::Init() +int +ConnSpec::Init() { rule_ = ALL_TO_ALL; total_num_ = 0; @@ -42,82 +38,98 @@ int ConnSpec::Init() outdegree_ = 0; return 0; } - + ConnSpec::ConnSpec() { Init(); } -int ConnSpec::Init(int rule, int degree /*=0*/) +int +ConnSpec::Init( int rule, int degree /*=0*/ ) { Init(); - if (rule<0 || rule>N_CONN_RULE) { - throw 
ngpu_exception("Unknown connection rule"); + if ( rule < 0 || rule > N_CONN_RULE ) + { + throw ngpu_exception( "Unknown connection rule" ); } - if ((rule==ALL_TO_ALL || rule==ONE_TO_ONE) && (degree != 0)) { - throw ngpu_exception(std::string("Connection rule ") + conn_rule_name[rule] - + " does not have a degree"); + if ( ( rule == ALL_TO_ALL || rule == ONE_TO_ONE ) && ( degree != 0 ) ) + { + throw ngpu_exception( std::string( "Connection rule " ) + conn_rule_name[ rule ] + " does not have a degree" ); } rule_ = rule; - if (rule==FIXED_TOTAL_NUMBER) { + if ( rule == FIXED_TOTAL_NUMBER ) + { total_num_ = degree; } - else if (rule==FIXED_INDEGREE) { + else if ( rule == FIXED_INDEGREE ) + { indegree_ = degree; } - else if (rule==FIXED_OUTDEGREE) { + else if ( rule == FIXED_OUTDEGREE ) + { outdegree_ = degree; } - + return 0; } -ConnSpec::ConnSpec(int rule, int degree /*=0*/) +ConnSpec::ConnSpec( int rule, int degree /*=0*/ ) { - Init(rule, degree); + Init( rule, degree ); } -int ConnSpec::SetParam(std::string param_name, int value) +int +ConnSpec::SetParam( std::string param_name, int value ) { - if (param_name=="rule") { - if (value<0 || value>N_CONN_RULE) { - throw ngpu_exception("Unknown connection rule"); + if ( param_name == "rule" ) + { + if ( value < 0 || value > N_CONN_RULE ) + { + throw ngpu_exception( "Unknown connection rule" ); } rule_ = value; return 0; } - else if (param_name=="indegree") { - if (value<0) { - throw ngpu_exception("Indegree must be >=0"); + else if ( param_name == "indegree" ) + { + if ( value < 0 ) + { + throw ngpu_exception( "Indegree must be >=0" ); } indegree_ = value; return 0; } - else if (param_name=="outdegree") { - if (value<0) { - throw ngpu_exception("Outdegree must be >=0"); + else if ( param_name == "outdegree" ) + { + if ( value < 0 ) + { + throw ngpu_exception( "Outdegree must be >=0" ); } outdegree_ = value; return 0; } - else if (param_name=="total_num") { - if (value<0) { - throw ngpu_exception("total_num must be >=0"); 
+ else if ( param_name == "total_num" ) + { + if ( value < 0 ) + { + throw ngpu_exception( "total_num must be >=0" ); } total_num_ = value; return 0; } - throw ngpu_exception("Unknown connection int parameter"); + throw ngpu_exception( "Unknown connection int parameter" ); } -bool ConnSpec::IsParam(std::string param_name) +bool +ConnSpec::IsParam( std::string param_name ) { - if (param_name=="rule" || param_name=="indegree" || param_name=="outdegree" - || param_name=="total_num") { + if ( param_name == "rule" || param_name == "indegree" || param_name == "outdegree" || param_name == "total_num" ) + { return true; } - else { + else + { return false; } } @@ -127,8 +139,8 @@ SynSpec::SynSpec() Init(); } - -int SynSpec::Init() +int +SynSpec::Init() { syn_group_ = 0; port_ = 0; @@ -136,361 +148,442 @@ int SynSpec::Init() delay_ = 0; weight_distr_ = DISTR_TYPE_NONE; delay_distr_ = DISTR_TYPE_NONE; - weight_h_array_pt_ = NULL; - delay_h_array_pt_ = NULL; + weight_h_array_pt_ = nullptr; + delay_h_array_pt_ = nullptr; return 0; } - -SynSpec::SynSpec(float weight, float delay) +SynSpec::SynSpec( float weight, float delay ) { - Init(weight, delay); + Init( weight, delay ); } -int SynSpec::Init(float weight, float delay) +int +SynSpec::Init( float weight, float delay ) { - if (delay<0) { - throw ngpu_exception("Delay must be >=0"); + if ( delay < 0 ) + { + throw ngpu_exception( "Delay must be >=0" ); } Init(); weight_ = weight; delay_ = delay; return 0; - } +} -SynSpec::SynSpec(int syn_group, float weight, float delay, int port /*=0*/) +SynSpec::SynSpec( int syn_group, float weight, float delay, int port /*=0*/ ) { - Init(syn_group, weight, delay, port); + Init( syn_group, weight, delay, port ); } -int SynSpec::Init(int syn_group, float weight, float delay, int port /*=0*/) +int +SynSpec::Init( int syn_group, float weight, float delay, int port /*=0*/ ) { - if (syn_group<0) { // || syn_group>n_syn_group) { - throw ngpu_exception("Unknown synapse group"); + if ( syn_group < 0 ) 
+ { // || syn_group>n_syn_group) { + throw ngpu_exception( "Unknown synapse group" ); } - if (port<0) { - throw ngpu_exception("Port index must be >=0"); + if ( port < 0 ) + { + throw ngpu_exception( "Port index must be >=0" ); } - Init(weight, delay); + Init( weight, delay ); syn_group_ = syn_group; port_ = port; return 0; - } +} -int SynSpec::SetParam(std::string param_name, int value) +int +SynSpec::SetParam( std::string param_name, int value ) { - if (param_name=="synapse_group") { - if (value<0) { // || value>n_syn_group) { - throw ngpu_exception("Unknown synapse group"); + if ( param_name == "synapse_group" ) + { + if ( value < 0 ) + { // || value>n_syn_group) { + throw ngpu_exception( "Unknown synapse group" ); } syn_group_ = value; } - else if (param_name=="receptor") { - if (value<0) { - throw ngpu_exception("Port index must be >=0"); + else if ( param_name == "receptor" ) + { + if ( value < 0 ) + { + throw ngpu_exception( "Port index must be >=0" ); } port_ = value; } - else if (param_name=="weight_distribution") { + else if ( param_name == "weight_distribution" ) + { weight_distr_ = value; - //printf("weight_distribution_ idx: %d\n", value); + // printf("weight_distribution_ idx: %d\n", value); } - else if (param_name=="delay_distribution") { + else if ( param_name == "delay_distribution" ) + { delay_distr_ = value; - //printf("delay_distribution_ idx: %d\n", value); + // printf("delay_distribution_ idx: %d\n", value); } - else { - throw ngpu_exception("Unknown synapse int parameter"); + else + { + throw ngpu_exception( "Unknown synapse int parameter" ); } - + return 0; } -bool SynSpec::IsIntParam(std::string param_name) +bool +SynSpec::IsIntParam( std::string param_name ) { - if (param_name=="synapse_group" || param_name=="receptor" - || param_name=="weight_distribution" - || param_name=="delay_distribution" - ) { + if ( param_name == "synapse_group" || param_name == "receptor" || param_name == "weight_distribution" + || param_name == 
"delay_distribution" ) + { return true; } - else { + else + { return false; } } -int SynSpec::SetParam(std::string param_name, float value) +int +SynSpec::SetParam( std::string param_name, float value ) { - if (param_name=="weight") { + if ( param_name == "weight" ) + { weight_ = value; } - else if (param_name=="delay") { - if (value<0) { - throw ngpu_exception("Delay must be >=0"); + else if ( param_name == "delay" ) + { + if ( value < 0 ) + { + throw ngpu_exception( "Delay must be >=0" ); } delay_ = value; } - else if (param_name=="weight_mu") { + else if ( param_name == "weight_mu" ) + { weight_mu_ = value; - //printf("weight_mu_: %f\n", value); + // printf("weight_mu_: %f\n", value); } - else if (param_name=="weight_low") { + else if ( param_name == "weight_low" ) + { weight_low_ = value; - //printf("weight_low_: %f\n", value); + // printf("weight_low_: %f\n", value); } - else if (param_name=="weight_high") { + else if ( param_name == "weight_high" ) + { weight_high_ = value; - //printf("weight_high_: %f\n", value); + // printf("weight_high_: %f\n", value); } - else if (param_name=="weight_sigma") { + else if ( param_name == "weight_sigma" ) + { weight_sigma_ = value; - //printf("weight_sigma_: %f\n", value); + // printf("weight_sigma_: %f\n", value); } - else if (param_name=="delay_mu") { + else if ( param_name == "delay_mu" ) + { delay_mu_ = value; - //printf("delay_mu_: %f\n", value); + // printf("delay_mu_: %f\n", value); } - else if (param_name=="delay_low") { + else if ( param_name == "delay_low" ) + { delay_low_ = value; - //printf("delay_low_: %f\n", value); + // printf("delay_low_: %f\n", value); } - else if (param_name=="delay_high") { + else if ( param_name == "delay_high" ) + { delay_high_ = value; - //printf("delay_high_: %f\n", value); + // printf("delay_high_: %f\n", value); } - else if (param_name=="delay_sigma") { + else if ( param_name == "delay_sigma" ) + { delay_sigma_ = value; - //printf("delay_sigma_: %f\n", value); + // 
printf("delay_sigma_: %f\n", value); } - else { - throw ngpu_exception("Unknown synapse float parameter"); + else + { + throw ngpu_exception( "Unknown synapse float parameter" ); } return 0; } -bool SynSpec::IsFloatParam(std::string param_name) +bool +SynSpec::IsFloatParam( std::string param_name ) { - if (param_name=="weight" || param_name=="delay" - || param_name=="weight_mu" || param_name=="weight_low" - || param_name=="weight_high" || param_name=="weight_sigma" - || param_name=="delay_mu" || param_name=="delay_low" - || param_name=="delay_high" || param_name=="delay_sigma" - ) { + if ( param_name == "weight" || param_name == "delay" || param_name == "weight_mu" || param_name == "weight_low" + || param_name == "weight_high" || param_name == "weight_sigma" || param_name == "delay_mu" + || param_name == "delay_low" || param_name == "delay_high" || param_name == "delay_sigma" ) + { return true; } - else { + else + { return false; } } - -int SynSpec::SetParam(std::string param_name, float *array_pt) + +int +SynSpec::SetParam( std::string param_name, float* array_pt ) { - if (param_name=="weight_array") { + if ( param_name == "weight_array" ) + { weight_h_array_pt_ = array_pt; weight_distr_ = DISTR_TYPE_ARRAY; } - else if (param_name=="delay_array") { + else if ( param_name == "delay_array" ) + { delay_h_array_pt_ = array_pt; delay_distr_ = DISTR_TYPE_ARRAY; } - else { - throw ngpu_exception("Unknown synapse array parameter"); + else + { + throw ngpu_exception( "Unknown synapse array parameter" ); } - + return 0; } -bool SynSpec::IsFloatPtParam(std::string param_name) +bool +SynSpec::IsFloatPtParam( std::string param_name ) { - if (param_name=="weight_array" || param_name=="delay_array") { + if ( param_name == "weight_array" || param_name == "delay_array" ) + { return true; } - else { + else + { return false; } } - - -int NESTGPU::Connect(int i_source, int n_source, int i_target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::Connect( inode_t 
i_source, + inode_t n_source, + inode_t i_target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { - return _Connect(i_source, n_source, i_target, n_target, - conn_spec, syn_spec); + CheckUncalibrated( "Connections cannot be created after calibration" ); + + return conn_->connect( i_source, n_source, i_target, n_target, conn_spec, syn_spec ); } -int NESTGPU::Connect(int i_source, int n_source, int* target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::Connect( inode_t i_source, + inode_t n_source, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { - int *d_target; - CUDAMALLOCCTRL("&d_target",&d_target, n_target*sizeof(int)); - gpuErrchk(cudaMemcpy(d_target, target, n_target*sizeof(int), - cudaMemcpyHostToDevice)); - int ret = _Connect(i_source, n_source, d_target, n_target, - conn_spec, syn_spec); - CUDAFREECTRL("d_target",d_target); + CheckUncalibrated( "Connections cannot be created after calibration" ); + + inode_t* d_target; + CUDAMALLOCCTRL( "&d_target", &d_target, n_target * sizeof( inode_t ) ); + gpuErrchk( cudaMemcpy( d_target, target, n_target * sizeof( inode_t ), cudaMemcpyHostToDevice ) ); + int ret = conn_->connect( i_source, n_source, d_target, n_target, conn_spec, syn_spec ); + CUDAFREECTRL( "d_target", d_target ); return ret; } -int NESTGPU::Connect(int* source, int n_source, int i_target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) -{ - int *d_source; - CUDAMALLOCCTRL("&d_source",&d_source, n_source*sizeof(int)); - gpuErrchk(cudaMemcpy(d_source, source, n_source*sizeof(int), - cudaMemcpyHostToDevice)); - int ret = _Connect(d_source, n_source, i_target, n_target, - conn_spec, syn_spec); - CUDAFREECTRL("d_source",d_source); - + +int +NESTGPU::Connect( inode_t* source, + inode_t n_source, + inode_t i_target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) +{ + CheckUncalibrated( "Connections cannot be created after calibration" ); + + inode_t* 
d_source; + CUDAMALLOCCTRL( "&d_source", &d_source, n_source * sizeof( inode_t ) ); + gpuErrchk( cudaMemcpy( d_source, source, n_source * sizeof( inode_t ), cudaMemcpyHostToDevice ) ); + int ret = conn_->connect( d_source, n_source, i_target, n_target, conn_spec, syn_spec ); + CUDAFREECTRL( "d_source", d_source ); + return ret; } -int NESTGPU::Connect(int* source, int n_source, int* target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) -{ - int *d_source; - CUDAMALLOCCTRL("&d_source",&d_source, n_source*sizeof(int)); - gpuErrchk(cudaMemcpy(d_source, source, n_source*sizeof(int), - cudaMemcpyHostToDevice)); - int *d_target; - CUDAMALLOCCTRL("&d_target",&d_target, n_target*sizeof(int)); - gpuErrchk(cudaMemcpy(d_target, target, n_target*sizeof(int), - cudaMemcpyHostToDevice)); - int ret = _Connect(d_source, n_source, d_target, n_target, - conn_spec, syn_spec); - CUDAFREECTRL("d_source",d_source); - CUDAFREECTRL("d_target",d_target); + +int +NESTGPU::Connect( inode_t* source, + inode_t n_source, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) +{ + CheckUncalibrated( "Connections cannot be created after calibration" ); + inode_t* d_source; + CUDAMALLOCCTRL( "&d_source", &d_source, n_source * sizeof( inode_t ) ); + gpuErrchk( cudaMemcpy( d_source, source, n_source * sizeof( inode_t ), cudaMemcpyHostToDevice ) ); + inode_t* d_target; + CUDAMALLOCCTRL( "&d_target", &d_target, n_target * sizeof( inode_t ) ); + gpuErrchk( cudaMemcpy( d_target, target, n_target * sizeof( inode_t ), cudaMemcpyHostToDevice ) ); + int ret = conn_->connect( d_source, n_source, d_target, n_target, conn_spec, syn_spec ); + CUDAFREECTRL( "d_source", d_source ); + CUDAFREECTRL( "d_target", d_target ); return ret; } -int NESTGPU::Connect(NodeSeq source, NodeSeq target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::Connect( NodeSeq source, NodeSeq target, ConnSpec& conn_spec, SynSpec& syn_spec ) { - return Connect(source.i0, source.n, target.i0, 
target.n, - conn_spec, syn_spec); + return Connect( source.i0, source.n, target.i0, target.n, conn_spec, syn_spec ); } -int NESTGPU::Connect(NodeSeq source, std::vector target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::Connect( NodeSeq source, std::vector< inode_t > target, ConnSpec& conn_spec, SynSpec& syn_spec ) { - return Connect(source.i0, source.n, target.data(), - target.size(), conn_spec, syn_spec); + return Connect( source.i0, source.n, target.data(), target.size(), conn_spec, syn_spec ); } -int NESTGPU::Connect(std::vector source, NodeSeq target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::Connect( std::vector< inode_t > source, NodeSeq target, ConnSpec& conn_spec, SynSpec& syn_spec ) { - return Connect(source.data(), source.size(), target.i0, - target.n, conn_spec, syn_spec); + return Connect( source.data(), source.size(), target.i0, target.n, conn_spec, syn_spec ); } -int NESTGPU::Connect(std::vector source, std::vector target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::Connect( std::vector< inode_t > source, std::vector< inode_t > target, ConnSpec& conn_spec, SynSpec& syn_spec ) { - return Connect(source.data(), source.size(), target.data(), - target.size(), conn_spec, syn_spec); + return Connect( source.data(), source.size(), target.data(), target.size(), conn_spec, syn_spec ); } - -int NESTGPU::RemoteConnect(int i_source_host, int i_source, int n_source, - int i_target_host, int i_target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::RemoteConnect( int i_source_host, + inode_t i_source, + inode_t n_source, + int i_target_host, + inode_t i_target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { - return _RemoteConnect(i_source_host, i_source, n_source, - i_target_host, i_target, n_target, - conn_spec, syn_spec); + CheckUncalibrated( "Connections cannot be created after calibration" ); + + return conn_->remoteConnect( + i_source_host, i_source, n_source, i_target_host, 
i_target, n_target, conn_spec, syn_spec ); } -int NESTGPU::RemoteConnect(int i_source_host, int i_source, int n_source, - int i_target_host, int* target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::RemoteConnect( int i_source_host, + inode_t i_source, + inode_t n_source, + int i_target_host, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { - int *d_target; - CUDAMALLOCCTRL("&d_target",&d_target, n_target*sizeof(int)); - gpuErrchk(cudaMemcpy(d_target, target, n_target*sizeof(int), - cudaMemcpyHostToDevice)); - int ret = _RemoteConnect(i_source_host, i_source, n_source, - i_target_host, d_target, n_target, - conn_spec, syn_spec); - CUDAFREECTRL("d_target",d_target); + CheckUncalibrated( "Connections cannot be created after calibration" ); + + inode_t* d_target; + CUDAMALLOCCTRL( "&d_target", &d_target, n_target * sizeof( inode_t ) ); + gpuErrchk( cudaMemcpy( d_target, target, n_target * sizeof( inode_t ), cudaMemcpyHostToDevice ) ); + int ret = + conn_->remoteConnect( i_source_host, i_source, n_source, i_target_host, d_target, n_target, conn_spec, syn_spec ); + CUDAFREECTRL( "d_target", d_target ); return ret; } -int NESTGPU::RemoteConnect(int i_source_host, int* source, int n_source, - int i_target_host, int i_target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::RemoteConnect( int i_source_host, + inode_t* source, + inode_t n_source, + int i_target_host, + inode_t i_target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { - int *d_source; - CUDAMALLOCCTRL("&d_source",&d_source, n_source*sizeof(int)); - gpuErrchk(cudaMemcpy(d_source, source, n_source*sizeof(int), - cudaMemcpyHostToDevice)); - int ret = _RemoteConnect(i_source_host, d_source, n_source, - i_target_host, i_target, n_target, - conn_spec, syn_spec); - CUDAFREECTRL("d_source",d_source); - + CheckUncalibrated( "Connections cannot be created after calibration" ); + + inode_t* d_source; + 
CUDAMALLOCCTRL( "&d_source", &d_source, n_source * sizeof( inode_t ) ); + gpuErrchk( cudaMemcpy( d_source, source, n_source * sizeof( inode_t ), cudaMemcpyHostToDevice ) ); + int ret = + conn_->remoteConnect( i_source_host, d_source, n_source, i_target_host, i_target, n_target, conn_spec, syn_spec ); + CUDAFREECTRL( "d_source", d_source ); + return ret; } -int NESTGPU::RemoteConnect(int i_source_host, int* source, int n_source, - int i_target_host, int* target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) -{ - int *d_source; - CUDAMALLOCCTRL("&d_source",&d_source, n_source*sizeof(int)); - gpuErrchk(cudaMemcpy(d_source, source, n_source*sizeof(int), - cudaMemcpyHostToDevice)); - int *d_target; - CUDAMALLOCCTRL("&d_target",&d_target, n_target*sizeof(int)); - gpuErrchk(cudaMemcpy(d_target, target, n_target*sizeof(int), - cudaMemcpyHostToDevice)); - int ret = _RemoteConnect(i_source_host, d_source, n_source, - i_target_host, d_target, n_target, - conn_spec, syn_spec); - CUDAFREECTRL("d_source",d_source); - CUDAFREECTRL("d_target",d_target); +int +NESTGPU::RemoteConnect( int i_source_host, + inode_t* source, + inode_t n_source, + int i_target_host, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) +{ + CheckUncalibrated( "Connections cannot be created after calibration" ); + + inode_t* d_source; + CUDAMALLOCCTRL( "&d_source", &d_source, n_source * sizeof( inode_t ) ); + gpuErrchk( cudaMemcpy( d_source, source, n_source * sizeof( inode_t ), cudaMemcpyHostToDevice ) ); + inode_t* d_target; + CUDAMALLOCCTRL( "&d_target", &d_target, n_target * sizeof( inode_t ) ); + gpuErrchk( cudaMemcpy( d_target, target, n_target * sizeof( inode_t ), cudaMemcpyHostToDevice ) ); + int ret = + conn_->remoteConnect( i_source_host, d_source, n_source, i_target_host, d_target, n_target, conn_spec, syn_spec ); + CUDAFREECTRL( "d_source", d_source ); + CUDAFREECTRL( "d_target", d_target ); return ret; } -int NESTGPU::RemoteConnect(int i_source_host, 
NodeSeq source, - int i_target_host, NodeSeq target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::RemoteConnect( int i_source_host, + NodeSeq source, + int i_target_host, + NodeSeq target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { - return RemoteConnect(i_source_host, source.i0, source.n, - i_target_host, target.i0, target.n, - conn_spec, syn_spec); + return RemoteConnect( i_source_host, source.i0, source.n, i_target_host, target.i0, target.n, conn_spec, syn_spec ); } -int NESTGPU::RemoteConnect(int i_source_host, NodeSeq source, - int i_target_host, std::vector target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::RemoteConnect( int i_source_host, + NodeSeq source, + int i_target_host, + std::vector< inode_t > target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { - return RemoteConnect(i_source_host, source.i0, source.n, - i_target_host, target.data(), target.size(), - conn_spec, syn_spec); + return RemoteConnect( + i_source_host, source.i0, source.n, i_target_host, target.data(), target.size(), conn_spec, syn_spec ); } -int NESTGPU::RemoteConnect(int i_source_host, std::vector source, - int i_target_host, NodeSeq target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::RemoteConnect( int i_source_host, + std::vector< inode_t > source, + int i_target_host, + NodeSeq target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { - return RemoteConnect(i_source_host, source.data(), source.size(), - i_target_host, target.i0, target.n, - conn_spec, syn_spec); + return RemoteConnect( + i_source_host, source.data(), source.size(), i_target_host, target.i0, target.n, conn_spec, syn_spec ); } -int NESTGPU::RemoteConnect(int i_source_host, std::vector source, - int i_target_host, std::vector target, - ConnSpec &conn_spec, SynSpec &syn_spec) +int +NESTGPU::RemoteConnect( int i_source_host, + std::vector< inode_t > source, + int i_target_host, + std::vector< inode_t > target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { - return 
RemoteConnect(i_source_host, source.data(), source.size(), - i_target_host, target.data(), target.size(), - conn_spec, syn_spec); + return RemoteConnect( + i_source_host, source.data(), source.size(), i_target_host, target.data(), target.size(), conn_spec, syn_spec ); } - diff --git a/src/connect_rules.h b/src/connect_rules.h index 4dc6f0537..c41b32b33 100644 --- a/src/connect_rules.h +++ b/src/connect_rules.h @@ -20,123 +20,7 @@ * */ - - - - #ifndef CONNECTRULES_H #define CONNECTRULES_H -#include -#include -#include -#include "nestgpu.h" - -extern bool ConnectionSpikeTimeFlag; - - -template -int NESTGPU::_Connect(curandGenerator_t &gen, T1 source, int n_source, - T2 target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) -{ - CheckUncalibrated("Connections cannot be created after calibration"); - //////////////////////// - //TEMPORARY, TO BE IMPROVED - if (syn_spec.syn_group_>=1) { - ConnectionSpikeTimeFlag = true; - rev_conn_flag_ = true; - } - - switch (conn_spec.rule_) { - case ONE_TO_ONE: - if (n_source != n_target) { - throw ngpu_exception("Number of source and target nodes must be equal " - "for the one-to-one connection rule"); - } - return _ConnectOneToOne(gen, source, target, n_source, syn_spec); - break; - - case ALL_TO_ALL: - return _ConnectAllToAll(gen, source, n_source, target, n_target, - syn_spec); - break; - case FIXED_TOTAL_NUMBER: - return _ConnectFixedTotalNumber(gen, source, n_source, - target, n_target, - conn_spec.total_num_, syn_spec); - break; - case FIXED_INDEGREE: - return _ConnectFixedIndegree(gen, source, n_source, - target, n_target, - conn_spec.indegree_, syn_spec); - break; - case FIXED_OUTDEGREE: - return _ConnectFixedOutdegree(gen, source, n_source, - target, n_target, - conn_spec.outdegree_, syn_spec); - break; - default: - throw ngpu_exception("Unknown connection rule"); - } - return 0; -} - -template -int NESTGPU::_Connect(curandGenerator_t &gen, - int source, int n_source, - int target, int n_target, - ConnSpec 
&conn_spec, SynSpec &syn_spec); - -template -int NESTGPU::_Connect(curandGenerator_t &gen, - int source, int n_source, - int *target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - -template -int NESTGPU::_Connect(curandGenerator_t &gen, - int *source, int n_source, - int target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - -template -int NESTGPU::_Connect(curandGenerator_t &gen, - int *source, int n_source, - int *target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - -template -int NESTGPU::_Connect(T1 source, int n_source, - T2 target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) -{ - return _Connect(conn_random_generator_[this_host_][this_host_], - source, n_source, target, n_target, conn_spec, syn_spec); -} - -template -int NESTGPU::_Connect(int source, int n_source, - int target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - -template -int NESTGPU::_Connect(int source, int n_source, - int *target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - -template -int NESTGPU::_Connect(int *source, int n_source, - int target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - -template -int NESTGPU::_Connect(int *source, int n_source, - int *target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - - - - - #endif diff --git a/src/connect_spec.h b/src/connect_spec.h index 1b6a803a8..16d55a840 100644 --- a/src/connect_spec.h +++ b/src/connect_spec.h @@ -20,10 +20,6 @@ * */ - - - - #ifndef CONNECTSPEC_H #define CONNECTSPEC_H @@ -33,57 +29,63 @@ class NESTGPU; -template +template < class T > class RemoteNode { - public: +public: int i_host_; T i_node_; - RemoteNode(int i_host, T node): i_host_(i_host), i_node_(node) {} - int GetINode(int in); + RemoteNode( int i_host, T node ) + : i_host_( i_host ) + , i_node_( node ) + { + } + int GetINode( int in ); }; enum ConnectionRules - { - ONE_TO_ONE=0, ALL_TO_ALL, FIXED_TOTAL_NUMBER, FIXED_INDEGREE, - FIXED_OUTDEGREE, N_CONN_RULE - }; - 
-const std::string conn_rule_name[N_CONN_RULE] = - { - "one_to_one", "all_to_all", "fixed_total_number", "fixed_indegree", - "fixed_outdegree" +{ + ONE_TO_ONE = 0, + ALL_TO_ALL, + FIXED_TOTAL_NUMBER, + FIXED_INDEGREE, + FIXED_OUTDEGREE, + N_CONN_RULE }; +const std::string conn_rule_name[ N_CONN_RULE ] = { "one_to_one", + "all_to_all", + "fixed_total_number", + "fixed_indegree", + "fixed_outdegree" }; + class ConnSpec { +public: int rule_; int total_num_; int indegree_; int outdegree_; -public: + ConnSpec(); - ConnSpec(int rule, int degree=0); + ConnSpec( int rule, int degree = 0 ); int Init(); - int Init(int rule, int degree=0); - int SetParam(std::string param_name, int value); - int GetParam(std::string param_name); - static bool IsParam(std::string param_name); - - friend class NESTGPU; + int Init( int rule, int degree = 0 ); + int SetParam( std::string param_name, int value ); + int GetParam( std::string param_name ); + static bool IsParam( std::string param_name ); }; - class SynSpec { - public: - unsigned char syn_group_; +public: + int syn_group_; int port_; int weight_distr_; - float *weight_h_array_pt_; + float* weight_h_array_pt_; float weight_; int delay_distr_; - float *delay_h_array_pt_; + float* delay_h_array_pt_; float delay_; float weight_mu_; float weight_low_; @@ -93,21 +95,21 @@ class SynSpec float delay_low_; float delay_high_; float delay_sigma_; - - public: + +public: SynSpec(); - SynSpec(float weight, float delay); - SynSpec(int syn_group, float weight, float delay, int port=0); + SynSpec( float weight, float delay ); + SynSpec( int syn_group, float weight, float delay, int port = 0 ); int Init(); - int Init(float weight, float delay); - int Init(int syn_group, float weight, float delay, int port=0); - int SetParam(std::string param_name, int value); - int SetParam(std::string param_name, float value); - int SetParam(std::string param_name, float *array_pt); - float GetParam(std::string param_name); - static bool IsIntParam(std::string 
param_name); - static bool IsFloatParam(std::string param_name); - static bool IsFloatPtParam(std::string param_name); + int Init( float weight, float delay ); + int Init( int syn_group, float weight, float delay, int port = 0 ); + int SetParam( std::string param_name, int value ); + int SetParam( std::string param_name, float value ); + int SetParam( std::string param_name, float* array_pt ); + float GetParam( std::string param_name ); + static bool IsIntParam( std::string param_name ); + static bool IsFloatParam( std::string param_name ); + static bool IsFloatPtParam( std::string param_name ); friend class NESTGPU; }; diff --git a/src/copass_kernels.cu b/src/copass_kernels.cu index 17bc5477d..33d19638c 100644 --- a/src/copass_kernels.cu +++ b/src/copass_kernels.cu @@ -12,141 +12,163 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -#include #include "copass_kernels.h" +#include -//#define PRINT_VRB +// #define PRINT_VRB -unsigned int nextPowerOf2(unsigned int n) +unsigned int +nextPowerOf2( unsigned int n ) { - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - n++; - return n; + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + n++; + return n; } - /* //template void cudaReusableAlloc(void *d_storage, int64_t &st_bytes, - void **variable_pt, const int64_t &num_elems, - const size_t &elem_size) + void **variable_pt, const int64_t &num_elems, + const size_t &elem_size) { int64_t align_bytes = elem_size; int64_t align_mask = ~(align_bytes - 1); int64_t allocation_offset = (st_bytes + align_bytes - 1) & align_mask; st_bytes = allocation_offset + num_elems*elem_size; - if (d_storage != NULL) { + if (d_storage != nullptr) { *variable_pt = (void*)((char*)d_storage + allocation_offset); } } */ -// atomically set old_index = *arg_max_pt, +// atomically set old_index = *arg_max_pt, // check whether array[index]>array[old_index]. 
// If it is true, set *arg_max_pt=index -__device__ int atomicArgMax(position_t *array, int *arg_max_pt, int index) +__device__ int +atomicArgMax( position_t* array, int* arg_max_pt, int index ) { int old_index = *arg_max_pt; int assumed_index; - do { - if (array[old_index]>=array[index]) { + do + { + if ( array[ old_index ] >= array[ index ] ) + { break; } assumed_index = old_index; - old_index = atomicCAS(arg_max_pt, assumed_index, index); - } while (assumed_index != old_index); - + old_index = atomicCAS( arg_max_pt, assumed_index, index ); + } while ( assumed_index != old_index ); + return old_index; } - -__global__ void copass_last_step_kernel(position_t *part_size, position_t *m_d, - uint k, position_t tot_diff, - position_t *diff, - position_t *diff_cumul, - position_t *num_down) +__global__ void +copass_last_step_kernel( position_t* part_size, + position_t* m_d, + uint k, + position_t tot_diff, + position_t* diff, + position_t* diff_cumul, + position_t* num_down ) { - int i=threadIdx.x; - if (i >= k) return; + int i = threadIdx.x; + if ( i >= k ) + { + return; + } position_t nd = *num_down; - - if (i < nd) { - part_size[i] = m_d[i] + diff[i]; + + if ( i < nd ) + { + part_size[ i ] = m_d[ i ] + diff[ i ]; } - else if (i == nd) { - part_size[i] = m_d[i] + tot_diff - diff_cumul[i]; + else if ( i == nd ) + { + part_size[ i ] = m_d[ i ] + tot_diff - diff_cumul[ i ]; } - else { - part_size[i] = m_d[i]; + else + { + part_size[ i ] = m_d[ i ]; } #ifdef PRINT_VRB - printf("kernel i: %d\tm_d: %ld\tpart_size: %ld\n", i, m_d[i], part_size[i]); + printf( "kernel i: %d\tm_d: %ld\tpart_size: %ld\n", i, m_d[ i ], part_size[ i ] ); #endif } - -__global__ void case2_inc_partitions_kernel(position_t *part_size, - int *sorted_extra_elem_idx, - position_t tot_diff) +__global__ void +case2_inc_partitions_kernel( position_t* part_size, int* sorted_extra_elem_idx, position_t tot_diff ) { int i_elem = threadIdx.x; - if (i_elem >= tot_diff) return; - int i = 
sorted_extra_elem_idx[i_elem]; - part_size[i]++; + if ( i_elem >= tot_diff ) + { + return; + } + int i = sorted_extra_elem_idx[ i_elem ]; + part_size[ i ]++; } -void GPUMemCpyOverlap(char *t_addr, char *s_addr, position_t size) +void +GPUMemCpyOverlap( char* t_addr, char* s_addr, position_t size ) { - position_t diff = (position_t)(t_addr - s_addr); - if (diff==0) return; - if (diff<0) { - printf("GPUMemCpyOvelap error: translation cannot be <0\n"); - exit(0); + position_t diff = ( position_t ) ( t_addr - s_addr ); + if ( diff == 0 ) + { + return; } - if (diff>=size) { - gpuErrchk(cudaMemcpyAsync(t_addr, s_addr, size, cudaMemcpyDeviceToDevice)); + if ( diff < 0 ) + { + printf( "GPUMemCpyOvelap error: translation cannot be <0\n" ); + exit( 0 ); } - int nb = (int)((size + diff - 1)/diff); - for (int ib=nb-1; ib>=0; ib--) { - position_t b_size = ib= size ) + { + gpuErrchk( cudaMemcpyAsync( t_addr, s_addr, size, cudaMemcpyDeviceToDevice ) ); + } + int nb = ( int ) ( ( size + diff - 1 ) / diff ); + for ( int ib = nb - 1; ib >= 0; ib-- ) + { + position_t b_size = ib < nb - 1 ? 
diff : size - diff * ( nb - 1 ); + char* s_b_addr = s_addr + diff * ib; + char* t_b_addr = s_b_addr + diff; + gpuErrchk( cudaMemcpyAsync( t_b_addr, s_b_addr, b_size, cudaMemcpyDeviceToDevice ) ); } } -void GPUMemCpyBuffered(char *t_addr, char *s_addr, position_t size, - char *d_buffer, position_t buffer_size) +void +GPUMemCpyBuffered( char* t_addr, char* s_addr, position_t size, char* d_buffer, position_t buffer_size ) { - position_t diff = (position_t)(t_addr - s_addr); - if (diff==0) return; - if (diff<0) { - printf("GPUMemCpyBuffer error: translation cannot be <0\n"); - exit(0); + position_t diff = ( position_t ) ( t_addr - s_addr ); + if ( diff == 0 ) + { + return; } - if (diff>=size) { - gpuErrchk(cudaMemcpyAsync(t_addr, s_addr, size, cudaMemcpyDeviceToDevice)); + if ( diff < 0 ) + { + printf( "GPUMemCpyBuffer error: translation cannot be <0\n" ); + exit( 0 ); + } + if ( diff >= size ) + { + gpuErrchk( cudaMemcpyAsync( t_addr, s_addr, size, cudaMemcpyDeviceToDevice ) ); return; } - if (diff>buffer_size/2) { - GPUMemCpyOverlap(t_addr, s_addr, size); + if ( diff > buffer_size / 2 ) + { + GPUMemCpyOverlap( t_addr, s_addr, size ); return; } - int nb = (int)((size + buffer_size - 1)/buffer_size); - for (int ib=nb-1; ib>=0; ib--) { - position_t b_size = ib= 0; ib-- ) + { + position_t b_size = ib < nb - 1 ? buffer_size : size - buffer_size * ( nb - 1 ); + char* s_b_addr = s_addr + buffer_size * ib; + char* t_b_addr = s_b_addr + diff; + gpuErrchk( cudaMemcpyAsync( d_buffer, s_b_addr, b_size, cudaMemcpyDeviceToDevice ) ); + gpuErrchk( cudaMemcpyAsync( t_b_addr, d_buffer, b_size, cudaMemcpyDeviceToDevice ) ); } } - diff --git a/src/copass_kernels.h b/src/copass_kernels.h index 49ea30fb5..f47afa613 100644 --- a/src/copass_kernels.h +++ b/src/copass_kernels.h @@ -14,50 +14,57 @@ along with this program. If not, see . 
#ifndef COPASS_KERNEL_H #define COPASS_KERNEL_H -#include -#include #include +// #include +#include +#include + +//// #include +//// #include "cuda_error.h" -//#define PRINT_VRB +// #define PRINT_VRB typedef int64_t position_t; typedef unsigned long long int uposition_t; -//typedef int32_t position_t; -//typedef unsigned int uposition_t; +// typedef int32_t position_t; +// typedef unsigned int uposition_t; -void GPUMemCpyOverlap(char *t_addr, char *s_addr, position_t size); -void GPUMemCpyBuffered(char *t_addr, char *s_addr, position_t size, - char *d_buffer, position_t buffer_size); +void GPUMemCpyOverlap( char* t_addr, char* s_addr, position_t size ); +void GPUMemCpyBuffered( char* t_addr, char* s_addr, position_t size, char* d_buffer, position_t buffer_size ); -template -void cudaReusableAlloc(void *d_storage, int64_t &st_bytes, - T** variable_pt, const int64_t &num_elems, - const size_t &elem_size) +template < class T > +void +cudaReusableAlloc( void* d_storage, + int64_t& st_bytes, + T** variable_pt, + const int64_t& num_elems, + const size_t& elem_size ) { int64_t align_bytes = elem_size; - int64_t align_mask = ~(align_bytes - 1); - int64_t allocation_offset = (st_bytes + align_bytes - 1) & align_mask; - st_bytes = allocation_offset + num_elems*elem_size; - if (d_storage != NULL) { - *variable_pt = (T*)((char*)d_storage + allocation_offset); + int64_t align_mask = ~( align_bytes - 1 ); + int64_t allocation_offset = ( st_bytes + align_bytes - 1 ) & align_mask; + st_bytes = allocation_offset + num_elems * elem_size; + if ( d_storage != NULL ) + { + *variable_pt = ( T* ) ( ( char* ) d_storage + allocation_offset ); } - else { + else + { *variable_pt = NULL; } } - -template +template < class KeyT, class ValueT > struct key_value { KeyT key; ValueT value; }; -template +template < class KeyT, class ValueT > struct contiguous_key_value { KeyT* key_pt; @@ -66,7 +73,7 @@ struct contiguous_key_value position_t size; }; -template +template < class KeyT, class ValueT > 
struct regular_block_key_value { KeyT** h_key_pt; @@ -78,76 +85,81 @@ struct regular_block_key_value position_t size; }; -//namespace array +// namespace array //{ -template -__device__ key_value -getElem(contiguous_key_value &arr, position_t i) +template < class KeyT, class ValueT > +__device__ key_value< KeyT, ValueT > +getElem( contiguous_key_value< KeyT, ValueT >& arr, position_t i ) { - key_value kv; - kv.key = *(arr.key_pt + arr.offset + i); - kv.value = *(arr.value_pt + arr.offset + i); + key_value< KeyT, ValueT > kv; + kv.key = *( arr.key_pt + arr.offset + i ); + kv.value = *( arr.value_pt + arr.offset + i ); return kv; } -template -__device__ KeyT getKey(contiguous_key_value &arr, position_t i) +template < class KeyT, class ValueT > +__device__ KeyT +getKey( contiguous_key_value< KeyT, ValueT >& arr, position_t i ) { - return *(arr.key_pt + arr.offset + i); + return *( arr.key_pt + arr.offset + i ); } -template -KeyT *getKeyPt(contiguous_key_value &arr) +template < class KeyT, class ValueT > +KeyT* +getKeyPt( contiguous_key_value< KeyT, ValueT >& arr ) { return arr.key_pt + arr.offset; } -template -ValueT *getValuePt(contiguous_key_value &arr) +template < class KeyT, class ValueT > +ValueT* +getValuePt( contiguous_key_value< KeyT, ValueT >& arr ) { return arr.value_pt + arr.offset; } -template -__device__ void setElem(contiguous_key_value &arr, position_t i, - const key_value &kv) +template < class KeyT, class ValueT > +__device__ void +setElem( contiguous_key_value< KeyT, ValueT >& arr, position_t i, const key_value< KeyT, ValueT >& kv ) { - *(arr.key_pt + arr.offset + i) = kv.key; - *(arr.value_pt + arr.offset + i) = kv.value; + *( arr.key_pt + arr.offset + i ) = kv.key; + *( arr.value_pt + arr.offset + i ) = kv.value; } - -template -void array_GPUMalloc(void *d_storage, int64_t &st_bytes, - contiguous_key_value &arr, position_t size) +template < class KeyT, class ValueT > +void +array_GPUMalloc( void* d_storage, int64_t& st_bytes, contiguous_key_value< 
KeyT, ValueT >& arr, position_t size ) { - cudaReusableAlloc(d_storage, st_bytes, &(arr.key_pt), size, sizeof(KeyT)); - cudaReusableAlloc(d_storage, st_bytes, &(arr.value_pt), size, sizeof(ValueT)); + cudaReusableAlloc( d_storage, st_bytes, &( arr.key_pt ), size, sizeof( KeyT ) ); + cudaReusableAlloc( d_storage, st_bytes, &( arr.value_pt ), size, sizeof( ValueT ) ); arr.offset = 0; arr.size = size; } -template -void array_GPUFree(contiguous_key_value &arr, position_t size) +template < class KeyT, class ValueT > +void +array_GPUFree( contiguous_key_value< KeyT, ValueT >& arr, position_t size ) { - CUDAFREECTRL("arr.key_pt",arr.key_pt); - CUDAFREECTRL("arr.value_pt",arr.value_pt); + CUDAFREECTRL( "arr.key_pt", arr.key_pt ); + CUDAFREECTRL( "arr.value_pt", arr.value_pt ); arr.offset = 0; arr.size = 0; } -template -void array_Malloc(contiguous_key_value &arr, position_t size) +template < class KeyT, class ValueT > +void +array_Malloc( contiguous_key_value< KeyT, ValueT >& arr, position_t size ) { - arr.key_pt = new KeyT[size]; - arr.value_pt = new ValueT[size]; + arr.key_pt = new KeyT[ size ]; + arr.value_pt = new ValueT[ size ]; arr.offset = 0; arr.size = size; } -template -void array_Free(contiguous_key_value &arr, position_t size) +template < class KeyT, class ValueT > +void +array_Free( contiguous_key_value< KeyT, ValueT >& arr, position_t size ) { delete[] arr.key_pt; delete[] arr.value_pt; @@ -156,167 +168,178 @@ void array_Free(contiguous_key_value &arr, position_t size) } // note: this does not allocate memory fo target array, -// use array_Malloc for that -template -void array_GPUtoCPUCopyContent(contiguous_key_value &target_arr, - contiguous_key_value &source_arr) -{ - gpuErrchk(cudaMemcpy(target_arr.key_pt + target_arr.offset, - source_arr.key_pt + source_arr.offset, - source_arr.size*sizeof(KeyT), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(target_arr.value_pt + target_arr.offset, - source_arr.value_pt + source_arr.offset, - 
source_arr.size*sizeof(ValueT), - cudaMemcpyDeviceToHost)); +// use array_Malloc for that +template < class KeyT, class ValueT > +void +array_GPUtoCPUCopyContent( contiguous_key_value< KeyT, ValueT >& target_arr, + contiguous_key_value< KeyT, ValueT >& source_arr ) +{ + gpuErrchk( cudaMemcpy( target_arr.key_pt + target_arr.offset, + source_arr.key_pt + source_arr.offset, + source_arr.size * sizeof( KeyT ), + cudaMemcpyDeviceToHost ) ); + gpuErrchk( cudaMemcpy( target_arr.value_pt + target_arr.offset, + source_arr.value_pt + source_arr.offset, + source_arr.size * sizeof( ValueT ), + cudaMemcpyDeviceToHost ) ); target_arr.size = source_arr.size; } // note: this does not allocate memory fo target array, -// use array_GPUMalloc for that -template -void array_CPUtoGPUCopyContent(contiguous_key_value &target_arr, - contiguous_key_value &source_arr) -{ - gpuErrchk(cudaMemcpy(target_arr.key_pt + target_arr.offset, - source_arr.key_pt + source_arr.offset, - source_arr.size*sizeof(KeyT), - cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(target_arr.value_pt + target_arr.offset, - source_arr.value_pt + source_arr.offset, - source_arr.size*sizeof(ValueT), - cudaMemcpyHostToDevice)); +// use array_GPUMalloc for that +template < class KeyT, class ValueT > +void +array_CPUtoGPUCopyContent( contiguous_key_value< KeyT, ValueT >& target_arr, + contiguous_key_value< KeyT, ValueT >& source_arr ) +{ + gpuErrchk( cudaMemcpy( target_arr.key_pt + target_arr.offset, + source_arr.key_pt + source_arr.offset, + source_arr.size * sizeof( KeyT ), + cudaMemcpyHostToDevice ) ); + gpuErrchk( cudaMemcpy( target_arr.value_pt + target_arr.offset, + source_arr.value_pt + source_arr.offset, + source_arr.size * sizeof( ValueT ), + cudaMemcpyHostToDevice ) ); target_arr.size = source_arr.size; } -template -void array_Sort(contiguous_key_value &arr) +template < class KeyT, class ValueT > +void +array_Sort( contiguous_key_value< KeyT, ValueT >& arr ) { // build pair vector - std::vector> kv; + std::vector< 
std::pair< KeyT, ValueT > > kv; position_t i0 = arr.offset; - for (position_t i=0; i p(arr.key_pt[i0+i], arr.value_pt[i0+i]); - kv.push_back(p); + for ( position_t i = 0; i < arr.size; i++ ) + { + std::pair< KeyT, ValueT > p( arr.key_pt[ i0 + i ], arr.value_pt[ i0 + i ] ); + kv.push_back( p ); } // sort pair - std::sort(kv.begin(), kv.end(), - [](auto &left, auto &right) { - return left.first < right.first;}); + std::sort( kv.begin(), kv.end(), []( auto& left, auto& right ) { return left.first < right.first; } ); // extract elements from sorted vector - for (position_t i=0; i -void array_GPUSort(contiguous_key_value &arr_in, - void *d_storage, int64_t &ext_st_bytes) +template < class KeyT, class ValueT > +void +array_GPUSort( contiguous_key_value< KeyT, ValueT >& arr_in, void* d_storage, int64_t& ext_st_bytes ) { ext_st_bytes = 0; int num_elems = arr_in.size; - contiguous_key_value arr_out; + contiguous_key_value< KeyT, ValueT > arr_out; arr_out.offset = 0; arr_out.size = num_elems; - cudaReusableAlloc(d_storage, ext_st_bytes, &arr_out.key_pt, num_elems, - sizeof(KeyT)); - cudaReusableAlloc(d_storage, ext_st_bytes, &arr_out.value_pt, num_elems, - sizeof(ValueT)); + cudaReusableAlloc( d_storage, ext_st_bytes, &arr_out.key_pt, num_elems, sizeof( KeyT ) ); + cudaReusableAlloc( d_storage, ext_st_bytes, &arr_out.value_pt, num_elems, sizeof( ValueT ) ); // the following is just for memory alignement - void *dummy_pt; - cudaReusableAlloc(d_storage, ext_st_bytes, &dummy_pt, 1, 256); - - size_t sort_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(NULL, sort_storage_bytes, - arr_in.key_pt + arr_in.offset, - arr_out.key_pt, - arr_in.value_pt + arr_in.offset, - arr_out.value_pt, num_elems); - - if (d_storage != NULL) { - void *d_sort_storage = (void*)((char*)d_storage + ext_st_bytes); - cub::DeviceRadixSort::SortPairs(d_sort_storage, sort_storage_bytes, - arr_in.key_pt + arr_in.offset, - arr_out.key_pt, - arr_in.value_pt + arr_in.offset, - arr_out.value_pt, num_elems); - 
- gpuErrchk(cudaMemcpyAsync(arr_in.key_pt + arr_in.offset, arr_out.key_pt, - num_elems*sizeof(KeyT), - cudaMemcpyDeviceToDevice)); - gpuErrchk(cudaMemcpy(arr_in.value_pt + arr_in.offset, arr_out.value_pt, - num_elems*sizeof(ValueT), - cudaMemcpyDeviceToDevice)); + void* dummy_pt; + cudaReusableAlloc( d_storage, ext_st_bytes, &dummy_pt, 1, 256 ); + + size_t sort_storage_bytes = 0; + //// + cub::DeviceRadixSort::SortPairs( NULL, + sort_storage_bytes, + arr_in.key_pt + arr_in.offset, + arr_out.key_pt, + arr_in.value_pt + arr_in.offset, + arr_out.value_pt, + num_elems ); + //// + + if ( d_storage != NULL ) + { + void* d_sort_storage = ( void* ) ( ( char* ) d_storage + ext_st_bytes ); + //// + cub::DeviceRadixSort::SortPairs( d_sort_storage, + sort_storage_bytes, + arr_in.key_pt + arr_in.offset, + arr_out.key_pt, + arr_in.value_pt + arr_in.offset, + arr_out.value_pt, + num_elems ); + //// + + gpuErrchk( cudaMemcpyAsync( + arr_in.key_pt + arr_in.offset, arr_out.key_pt, num_elems * sizeof( KeyT ), cudaMemcpyDeviceToDevice ) ); + gpuErrchk( cudaMemcpy( + arr_in.value_pt + arr_in.offset, arr_out.value_pt, num_elems * sizeof( ValueT ), cudaMemcpyDeviceToDevice ) ); } - + ext_st_bytes += sort_storage_bytes; } -template -__device__ key_value -getElem(regular_block_key_value &arr, position_t i) +template < class KeyT, class ValueT > +__device__ key_value< KeyT, ValueT > +getElem( regular_block_key_value< KeyT, ValueT >& arr, position_t i ) { - key_value kv; + key_value< KeyT, ValueT > kv; position_t position = arr.offset + i; - kv.key = arr.key_pt[position / arr.block_size][position % arr.block_size]; - kv.value = arr.value_pt[position / arr.block_size][position % arr.block_size]; + kv.key = arr.key_pt[ position / arr.block_size ][ position % arr.block_size ]; + kv.value = arr.value_pt[ position / arr.block_size ][ position % arr.block_size ]; return kv; } -template -__device__ KeyT getKey(regular_block_key_value &arr, position_t i) +template < class KeyT, class ValueT > 
+__device__ KeyT +getKey( regular_block_key_value< KeyT, ValueT >& arr, position_t i ) { position_t position = arr.offset + i; - return arr.key_pt[position / arr.block_size][position % arr.block_size]; + return arr.key_pt[ position / arr.block_size ][ position % arr.block_size ]; } -template -KeyT *getKeyPt(regular_block_key_value &arr) +template < class KeyT, class ValueT > +KeyT* +getKeyPt( regular_block_key_value< KeyT, ValueT >& arr ) { position_t position = arr.offset; - return &arr.key_pt[position / arr.block_size][position % arr.block_size]; + return &arr.key_pt[ position / arr.block_size ][ position % arr.block_size ]; } -template -ValueT *getValuePt(regular_block_key_value &arr) +template < class KeyT, class ValueT > +ValueT* +getValuePt( regular_block_key_value< KeyT, ValueT >& arr ) { position_t position = arr.offset; - return &arr.value_pt[position / arr.block_size][position % arr.block_size]; + return &arr.value_pt[ position / arr.block_size ][ position % arr.block_size ]; } -template -__device__ void setElem(regular_block_key_value &arr, - position_t i, - const key_value &kv) +template < class KeyT, class ValueT > +__device__ void +setElem( regular_block_key_value< KeyT, ValueT >& arr, position_t i, const key_value< KeyT, ValueT >& kv ) { position_t position = arr.offset + i; - arr.key_pt[position / arr.block_size][position % arr.block_size] = kv.key; - arr.value_pt[position / arr.block_size][position % arr.block_size] = kv.value; + arr.key_pt[ position / arr.block_size ][ position % arr.block_size ] = kv.key; + arr.value_pt[ position / arr.block_size ][ position % arr.block_size ] = kv.value; } -template -contiguous_key_value getBlock -(regular_block_key_value &arr, int i_block) +template < class KeyT, class ValueT > +contiguous_key_value< KeyT, ValueT > +getBlock( regular_block_key_value< KeyT, ValueT >& arr, int i_block ) { - contiguous_key_value c_arr; - c_arr.key_pt = arr.key_pt[i_block]; - c_arr.value_pt = arr.value_pt[i_block]; + 
contiguous_key_value< KeyT, ValueT > c_arr; + c_arr.key_pt = arr.key_pt[ i_block ]; + c_arr.value_pt = arr.value_pt[ i_block ]; c_arr.offset = 0; - position_t diff = arr.size - i_block*arr.block_size; - if (diff <= 0) { - printf("i_block out of range in getBlock\n"); - exit(0); + position_t diff = arr.size - i_block * arr.block_size; + if ( diff <= 0 ) + { + printf( "i_block out of range in getBlock\n" ); + exit( 0 ); } - c_arr.size = min(diff, arr.block_size); - + c_arr.size = std::min( diff, arr.block_size ); + return c_arr; } ///////////////////////////////////////////////////////////////// -template +template < class ElementT > struct contiguous_array { ElementT* data_pt; @@ -324,7 +347,7 @@ struct contiguous_array position_t size; }; -template +template < class ElementT > struct regular_block_array { ElementT** h_data_pt; @@ -334,66 +357,72 @@ struct regular_block_array position_t size; }; -template -__device__ ElementT getElem(contiguous_array &arr, position_t i) +template < class ElementT > +__device__ ElementT +getElem( contiguous_array< ElementT >& arr, position_t i ) { - return *(arr.data_pt + arr.offset + i); + return *( arr.data_pt + arr.offset + i ); } -template -__device__ ElementT getKey(contiguous_array &arr, position_t i) +template < class ElementT > +__device__ ElementT +getKey( contiguous_array< ElementT >& arr, position_t i ) { - return *(arr.data_pt + arr.offset + i); + return *( arr.data_pt + arr.offset + i ); } -template -ElementT *getKeyPt(contiguous_array &arr) +template < class ElementT > +ElementT* +getKeyPt( contiguous_array< ElementT >& arr ) { return arr.data_pt + arr.offset; } -template -ElementT *getValuePt(contiguous_array &arr) +template < class ElementT > +ElementT* +getValuePt( contiguous_array< ElementT >& arr ) { return NULL; } -template -__device__ void setElem(contiguous_array &arr, position_t i, - const ElementT &val) +template < class ElementT > +__device__ void +setElem( contiguous_array< ElementT >& arr, position_t i, 
const ElementT& val ) { - *(arr.data_pt + arr.offset + i) = val; + *( arr.data_pt + arr.offset + i ) = val; } //////////////////////////////////////////////////////////// -template -void array_GPUMalloc(void *d_storage, int64_t &st_bytes, - contiguous_array &arr, position_t size) +template < class ElementT > +void +array_GPUMalloc( void* d_storage, int64_t& st_bytes, contiguous_array< ElementT >& arr, position_t size ) { - cudaReusableAlloc(d_storage, st_bytes, &(arr.data_pt), size, - sizeof(ElementT)); + cudaReusableAlloc( d_storage, st_bytes, &( arr.data_pt ), size, sizeof( ElementT ) ); arr.offset = 0; arr.size = size; } -template -void array_GPUFree(contiguous_array &arr, position_t size) +template < class ElementT > +void +array_GPUFree( contiguous_array< ElementT >& arr, position_t size ) { - CUDAFREECTRL("arr.data_pt",arr.data_pt); + CUDAFREECTRL( "arr.data_pt", arr.data_pt ); arr.offset = 0; arr.size = 0; } -template +template < class ElementT > -void array_Malloc(contiguous_array &arr, position_t size) +void +array_Malloc( contiguous_array< ElementT >& arr, position_t size ) { - arr.data_pt = new ElementT[size]; + arr.data_pt = new ElementT[ size ]; arr.offset = 0; arr.size = size; } -template -void array_Free(contiguous_array &arr, position_t size) +template < class ElementT > +void +array_Free( contiguous_array< ElementT >& arr, position_t size ) { delete[] arr.data_pt; arr.offset = 0; @@ -401,606 +430,655 @@ void array_Free(contiguous_array &arr, position_t size) } // note: this does not allocate memory fo target array, -// use array_Malloc for that -template -void array_GPUtoCPUCopyContent(contiguous_array &target_arr, - contiguous_array &source_arr) -{ - gpuErrchk(cudaMemcpy(target_arr.data_pt + target_arr.offset, - source_arr.data_pt + source_arr.offset, - source_arr.size*sizeof(ElementT), - cudaMemcpyDeviceToHost)); +// use array_Malloc for that +template < class ElementT > +void +array_GPUtoCPUCopyContent( contiguous_array< ElementT >& target_arr, 
contiguous_array< ElementT >& source_arr ) +{ + gpuErrchk( cudaMemcpy( target_arr.data_pt + target_arr.offset, + source_arr.data_pt + source_arr.offset, + source_arr.size * sizeof( ElementT ), + cudaMemcpyDeviceToHost ) ); target_arr.size = source_arr.size; } // note: this does not allocate memory fo target array, -// use array_GPUMalloc for that -template -void array_CPUtoGPUCopyContent(contiguous_array &target_arr, - contiguous_array &source_arr) -{ - gpuErrchk(cudaMemcpy(target_arr.data_pt + target_arr.offset, - source_arr.data_pt + source_arr.offset, - source_arr.size*sizeof(ElementT), - cudaMemcpyHostToDevice)); +// use array_GPUMalloc for that +template < class ElementT > +void +array_CPUtoGPUCopyContent( contiguous_array< ElementT >& target_arr, contiguous_array< ElementT >& source_arr ) +{ + gpuErrchk( cudaMemcpy( target_arr.data_pt + target_arr.offset, + source_arr.data_pt + source_arr.offset, + source_arr.size * sizeof( ElementT ), + cudaMemcpyHostToDevice ) ); target_arr.size = source_arr.size; } -template -void array_Sort(contiguous_array &arr) +template < class ElementT > +void +array_Sort( contiguous_array< ElementT >& arr ) { // sort array - std::sort(arr.data_pt+arr.offset, arr.data_pt+arr.offset+arr.size); + std::sort( arr.data_pt + arr.offset, arr.data_pt + arr.offset + arr.size ); // extract elements from sorted vector } -template -void array_GPUSort(contiguous_array &arr_in, - void *d_storage, int64_t &ext_st_bytes) +template < class ElementT > +void +array_GPUSort( contiguous_array< ElementT >& arr_in, void* d_storage, int64_t& ext_st_bytes ) { ext_st_bytes = 0; int num_elems = arr_in.size; - contiguous_array arr_out; + contiguous_array< ElementT > arr_out; arr_out.offset = 0; arr_out.size = num_elems; - cudaReusableAlloc(d_storage, ext_st_bytes, &arr_out.data_pt, num_elems, - sizeof(ElementT)); + cudaReusableAlloc( d_storage, ext_st_bytes, &arr_out.data_pt, num_elems, sizeof( ElementT ) ); // the following is just for memory alignement - void 
*dummy_pt; - cudaReusableAlloc(d_storage, ext_st_bytes, &dummy_pt, 1, 256); - - size_t sort_storage_bytes = 0; - cub::DeviceRadixSort::SortKeys(NULL, sort_storage_bytes, - arr_in.data_pt + arr_in.offset, - arr_out.data_pt, num_elems); - - if (d_storage != NULL) { - void *d_sort_storage = (void*)((char*)d_storage + ext_st_bytes); - cub::DeviceRadixSort::SortKeys(d_sort_storage, sort_storage_bytes, - arr_in.data_pt + arr_in.offset, - arr_out.data_pt, num_elems); - - gpuErrchk(cudaMemcpy(arr_in.data_pt + arr_in.offset, arr_out.data_pt, - num_elems*sizeof(ElementT), - cudaMemcpyDeviceToDevice)); + void* dummy_pt; + cudaReusableAlloc( d_storage, ext_st_bytes, &dummy_pt, 1, 256 ); + + size_t sort_storage_bytes = 0; + //// + cub::DeviceRadixSort::SortKeys( + NULL, sort_storage_bytes, arr_in.data_pt + arr_in.offset, arr_out.data_pt, num_elems ); + //// + + if ( d_storage != NULL ) + { + void* d_sort_storage = ( void* ) ( ( char* ) d_storage + ext_st_bytes ); + //// + cub::DeviceRadixSort::SortKeys( + d_sort_storage, sort_storage_bytes, arr_in.data_pt + arr_in.offset, arr_out.data_pt, num_elems ); + //// + + gpuErrchk( cudaMemcpy( + arr_in.data_pt + arr_in.offset, arr_out.data_pt, num_elems * sizeof( ElementT ), cudaMemcpyDeviceToDevice ) ); } - + ext_st_bytes += sort_storage_bytes; } //////////////////////////////////////////////////////// -template -contiguous_array getBlock(regular_block_array &arr, - int i_block) +template < class ElementT > +contiguous_array< ElementT > +getBlock( regular_block_array< ElementT >& arr, int i_block ) { - contiguous_array c_arr; - c_arr.data_pt = arr.data_pt[i_block]; + contiguous_array< ElementT > c_arr; + c_arr.data_pt = arr.data_pt[ i_block ]; c_arr.offset = 0; - position_t diff = arr.size - i_block*arr.block_size; - if (diff <= 0) { - printf("i_block out of range in getBlock\n"); - exit(0); + position_t diff = arr.size - i_block * arr.block_size; + if ( diff <= 0 ) + { + printf( "i_block out of range in getBlock\n" ); + exit( 0 ); } - 
c_arr.size = min(diff, arr.block_size); - + c_arr.size = std::min( diff, arr.block_size ); + return c_arr; } - -template -__device__ ElementT getElem(regular_block_array &arr, position_t i) +template < class ElementT > +__device__ ElementT +getElem( regular_block_array< ElementT >& arr, position_t i ) { position_t position = arr.offset + i; - return arr.data_pt[position / arr.block_size][position % arr.block_size]; + return arr.data_pt[ position / arr.block_size ][ position % arr.block_size ]; } -template -__device__ ElementT getKey(regular_block_array &arr, position_t i) +template < class ElementT > +__device__ ElementT +getKey( regular_block_array< ElementT >& arr, position_t i ) { position_t position = arr.offset + i; - return arr.data_pt[position / arr.block_size][position % arr.block_size]; + return arr.data_pt[ position / arr.block_size ][ position % arr.block_size ]; } -template -ElementT *getKeyPt(regular_block_array &arr) +template < class ElementT > +ElementT* +getKeyPt( regular_block_array< ElementT >& arr ) { position_t position = arr.offset; - return &arr.data_pt[position / arr.block_size][position % arr.block_size]; + return &arr.data_pt[ position / arr.block_size ][ position % arr.block_size ]; } -template -ElementT *getValuePt(regular_block_array &arr) +template < class ElementT > +ElementT* +getValuePt( regular_block_array< ElementT >& arr ) { return NULL; } -template -__device__ void setElem(regular_block_array &arr, position_t i, - const ElementT &val) +template < class ElementT > +__device__ void +setElem( regular_block_array< ElementT >& arr, position_t i, const ElementT& val ) { position_t position = arr.offset + i; - arr.data_pt[position / arr.block_size][position % arr.block_size] = val; + arr.data_pt[ position / arr.block_size ][ position % arr.block_size ] = val; } ////////////////////////////////////////////////////////////////////// -unsigned int nextPowerOf2(unsigned int n); +unsigned int nextPowerOf2( unsigned int n ); -// atomically 
set old_index = *arg_max_pt, +// atomically set old_index = *arg_max_pt, // check whether array[index]>array[old_index]. // If it is true, set *arg_max_pt=index -__device__ int atomicArgMax(position_t *array, int *arg_max_pt, int index); - +__device__ int atomicArgMax( position_t* array, int* arg_max_pt, int index ); // find difference between two arrays of type T and specified size -template -__global__ void diffKernel(T* c, const T* a, const T* b, position_t size) +template < class T > +__global__ void +diffKernel( T* c, const T* a, const T* b, position_t size ) { position_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) { - c[i] = a[i] - b[i]; + if ( i < size ) + { + c[ i ] = a[ i ] - b[ i ]; } } - -__global__ void copass_last_step_kernel(position_t *part_size, position_t *m_d, - uint k, position_t tot_diff, - position_t *diff, - position_t *diff_cumul, - position_t *num_down); +__global__ void copass_last_step_kernel( position_t* part_size, + position_t* m_d, + uint k, + position_t tot_diff, + position_t* diff, + position_t* diff_cumul, + position_t* num_down ); ////////////////////////////////////////////////// -__global__ void case2_inc_partitions_kernel(position_t *part_size, - int *sorted_extra_elem_idx, - position_t tot_diff); - +__global__ void case2_inc_partitions_kernel( position_t* part_size, int* sorted_extra_elem_idx, position_t tot_diff ); // find the number of elements <= val // in a sorted array array[i+1]>=array[i] -template -__device__ void search_block_up(ArrayT array, position_t size, KeyT val, - position_t *num_up) +template < class KeyT, class ArrayT, uint bsize > +__device__ void +search_block_up( ArrayT array, position_t size, KeyT val, position_t* num_up ) { - __shared__ KeyT shared_array[bsize+1]; + __shared__ KeyT shared_array[ bsize + 1 ]; __shared__ position_t left; __shared__ position_t right; int tid = threadIdx.x; - if (size==0 || getKey(array, 0) > val) { - if (tid == 0) { + if ( size == 0 || getKey( array, 0 ) > val ) 
+ { + if ( tid == 0 ) + { *num_up = 0; } return; } - else if (getKey(array, size-1) <= val) { - if (tid == 0) { + else if ( getKey( array, size - 1 ) <= val ) + { + if ( tid == 0 ) + { *num_up = size; } return; } - if (tid == 0) { + if ( tid == 0 ) + { left = 0; right = size - 1; } - + position_t step = size - 1; - //if (tid == 0) { - // printf("bid:%d tid:0 step:%ld size:%ld\n", blockIdx.x, step, size); - // printf("arr[n-1]: %d arr[n-2] %d val %d\n", getKey(array, size-1), - // getKey(array, size-2), val); - //} + // if (tid == 0) { + // printf("bid:%d tid:0 step:%ld size:%ld\n", blockIdx.x, step, size); + // printf("arr[n-1]: %d arr[n-2] %d val %d\n", getKey(array, size-1), + // getKey(array, size-2), val); + // } __syncthreads(); - while (step>1 && (right-left)>1) { + while ( step > 1 && ( right - left ) > 1 ) + { position_t pos; - position_t new_step = (step + blockDim.x - 1) / blockDim.x; - int n_steps = (int)((step + new_step - 1) / new_step); + position_t new_step = ( step + blockDim.x - 1 ) / blockDim.x; + int n_steps = ( int ) ( ( step + new_step - 1 ) / new_step ); step = new_step; - if (tid == 0) { + if ( tid == 0 ) + { pos = left; - shared_array[0] = getKey(array, left); - shared_array[n_steps] = getKey(array, right); - //printf("bid:%d tid:0 n_steps:%d sa:%d right:%ld arr:%d step: %ld\n", - // blockIdx.x, n_steps, (int)shared_array[n_steps], right, - // (int)getKey(array, right), step); + shared_array[ 0 ] = getKey( array, left ); + shared_array[ n_steps ] = getKey( array, right ); + // printf("bid:%d tid:0 n_steps:%d sa:%d right:%ld arr:%d step: %ld\n", + // blockIdx.x, n_steps, (int)shared_array[n_steps], right, + // (int)getKey(array, right), step); } - else if (tid < n_steps) { - pos = left + step*tid; - if ((right-pos) >= 1) { - shared_array[tid] = getKey(array, pos); - //printf("bid:%d tid:%ld sa:%ld pos:%ld arr:%ld\n", blockIdx.x, tid, - // shared_array[tid], pos, array[pos]); + else if ( tid < n_steps ) + { + pos = left + step * tid; + if ( ( 
right - pos ) >= 1 ) + { + shared_array[ tid ] = getKey( array, pos ); + // printf("bid:%d tid:%ld sa:%ld pos:%ld arr:%ld\n", blockIdx.x, tid, + // shared_array[tid], pos, array[pos]); } } __syncthreads(); - if ((tid < n_steps) && ((right-pos) >= 1) - && (shared_array[tid] <= val) - && (shared_array[tid+1] > val)) { + if ( ( tid < n_steps ) && ( ( right - pos ) >= 1 ) && ( shared_array[ tid ] <= val ) + && ( shared_array[ tid + 1 ] > val ) ) + { left = pos; - right = min(pos + step, right); - //printf("bid:%d good tid:%d sa0:%d sa1:%d l:%ld r:%ld\n", blockIdx.x, - // tid, (int)shared_array[tid], (int)shared_array[tid+1], - // left, right); + right = min( pos + step, right ); + // printf("bid:%d good tid:%d sa0:%d sa1:%d l:%ld r:%ld\n", blockIdx.x, + // tid, (int)shared_array[tid], (int)shared_array[tid+1], + // left, right); } __syncthreads(); } - if (threadIdx.x==0) { + if ( threadIdx.x == 0 ) + { *num_up = right; - //printf("Kernel block: %ld\tnum_up: %ld\n", blockIdx.x, right); - //printf("bid: %ld\tleft: %ld\tright: %ld\n", blockIdx.x, left, right); + // printf("Kernel block: %ld\tnum_up: %ld\n", blockIdx.x, right); + // printf("bid: %ld\tleft: %ld\tright: %ld\n", blockIdx.x, left, right); } } -template -__global__ void search_multi_up_kernel(ArrayT *subarray, - KeyT *val_pt, position_t *num_up, - position_t *sum_num_up) +template < class KeyT, class ArrayT, uint bsize > +__global__ void +search_multi_up_kernel( ArrayT* subarray, KeyT* val_pt, position_t* num_up, position_t* sum_num_up ) { int bid = blockIdx.x; KeyT val = *val_pt; - search_block_up - (subarray[bid], subarray[bid].size, val, &num_up[bid]); - if (threadIdx.x==0) { - atomicAdd((uposition_t*)sum_num_up, num_up[bid]); - //printf("bid: %ld\tm_d: %ld\n", blockIdx.x, num_up[bid]); + search_block_up< KeyT, ArrayT, bsize >( subarray[ bid ], subarray[ bid ].size, val, &num_up[ bid ] ); + if ( threadIdx.x == 0 ) + { + atomicAdd( ( uposition_t* ) sum_num_up, num_up[ bid ] ); + // printf("bid: %ld\tm_d: 
%ld\n", blockIdx.x, num_up[bid]); } } // find the number of elements < val // in a sorted array array[i+1]>=array[i] -template -__device__ void search_block_down(ArrayT array, position_t size, KeyT val, - position_t *num_down) +template < class KeyT, class ArrayT, uint bsize > +__device__ void +search_block_down( ArrayT array, position_t size, KeyT val, position_t* num_down ) { - __shared__ KeyT shared_array[bsize+1]; + __shared__ KeyT shared_array[ bsize + 1 ]; __shared__ position_t left; __shared__ position_t right; int tid = threadIdx.x; - if (size==0 || getKey(array, 0) >= val) { - if (tid == 0) { + if ( size == 0 || getKey( array, 0 ) >= val ) + { + if ( tid == 0 ) + { *num_down = 0; } return; } - else if (getKey(array, size-1) < val) { - if (tid == 0) { + else if ( getKey( array, size - 1 ) < val ) + { + if ( tid == 0 ) + { *num_down = size; } return; } - if (tid == 0) { + if ( tid == 0 ) + { left = 0; right = size - 1; } - + position_t step = size - 1; - //if (tid == 0) { - // printf("bid:%d tid:0 step:%ld size:%ld\n", blockIdx.x, step, size); - // printf("arr[n-1]: %d arr[n-2] %d val %d\n", getKey(array, size-1), + // if (tid == 0) { + // printf("bid:%d tid:0 step:%ld size:%ld\n", blockIdx.x, step, size); + // printf("arr[n-1]: %d arr[n-2] %d val %d\n", getKey(array, size-1), // getKey(array, size-2), val); - //} + // } __syncthreads(); - while(step>1 && (right-left)>1) { + while ( step > 1 && ( right - left ) > 1 ) + { position_t pos; - position_t new_step = (step + blockDim.x - 1) / blockDim.x; - int n_steps = (int)((step + new_step - 1) / new_step); + position_t new_step = ( step + blockDim.x - 1 ) / blockDim.x; + int n_steps = ( int ) ( ( step + new_step - 1 ) / new_step ); step = new_step; - if (tid == 0) { + if ( tid == 0 ) + { pos = left; - shared_array[0] = getKey(array, left); - shared_array[n_steps] = getKey(array, right); - //printf("bid:%d tid:0 n_steps:%d sa:%d right:%ld arr:%d step: %ld\n", - // blockIdx.x, n_steps, (int)shared_array[n_steps], 
right, - // (int)getKey(array, right), step); + shared_array[ 0 ] = getKey( array, left ); + shared_array[ n_steps ] = getKey( array, right ); + // printf("bid:%d tid:0 n_steps:%d sa:%d right:%ld arr:%d step: %ld\n", + // blockIdx.x, n_steps, (int)shared_array[n_steps], right, + // (int)getKey(array, right), step); } - else if (tid < n_steps) { - pos = left + step*tid; - if ((right-pos) >= 1) { - shared_array[tid] = getKey(array, pos); - //printf("bid:%d tid:%ld sa:%ld pos:%ld arr:%ld\n", blockIdx.x, tid, - // shared_array[tid], pos, array[pos]); + else if ( tid < n_steps ) + { + pos = left + step * tid; + if ( ( right - pos ) >= 1 ) + { + shared_array[ tid ] = getKey( array, pos ); + // printf("bid:%d tid:%ld sa:%ld pos:%ld arr:%ld\n", blockIdx.x, tid, + // shared_array[tid], pos, array[pos]); } } __syncthreads(); - if ((tid < n_steps) && ((right-pos) >= 1) - && (shared_array[tid] < val) - && (shared_array[tid+1] >= val)) { + if ( ( tid < n_steps ) && ( ( right - pos ) >= 1 ) && ( shared_array[ tid ] < val ) + && ( shared_array[ tid + 1 ] >= val ) ) + { left = pos; - right = min(pos + step, right); - //printf("bid:%d good tid:%d sa0:%d sa1:%d l:%ld r:%ld\n", blockIdx.x, - // tid, (int)shared_array[tid], (int)shared_array[tid+1], - // left, right); + right = min( pos + step, right ); + // printf("bid:%d good tid:%d sa0:%d sa1:%d l:%ld r:%ld\n", blockIdx.x, + // tid, (int)shared_array[tid], (int)shared_array[tid+1], + // left, right); } __syncthreads(); } - if (threadIdx.x==0) { + if ( threadIdx.x == 0 ) + { *num_down = right; - //printf("Kernel block: %ld\tnum_up: %ld\n", blockIdx.x, right); - //printf("bid: %ld\tleft: %ld\tright: %ld\n", blockIdx.x, left, right); + // printf("Kernel block: %ld\tnum_up: %ld\n", blockIdx.x, right); + // printf("bid: %ld\tleft: %ld\tright: %ld\n", blockIdx.x, left, right); } } -template -__global__ void search_multi_down_kernel(ArrayT *subarray, - KeyT *val_pt, position_t *num_down, - position_t *sum_num_down) +template < class KeyT, 
class ArrayT, uint bsize > +__global__ void +search_multi_down_kernel( ArrayT* subarray, KeyT* val_pt, position_t* num_down, position_t* sum_num_down ) { int bid = blockIdx.x; KeyT val = *val_pt; - search_block_down - (subarray[bid], subarray[bid].size, val, &num_down[bid]); - if (threadIdx.x==0) { - atomicAdd((uposition_t*)sum_num_down, num_down[bid]); - //printf("bid: %ld\tm_u: %ld\n", blockIdx.x, num_down[bid]); + search_block_down< KeyT, ArrayT, bsize >( subarray[ bid ], subarray[ bid ].size, val, &num_down[ bid ] ); + if ( threadIdx.x == 0 ) + { + atomicAdd( ( uposition_t* ) sum_num_down, num_down[ bid ] ); + // printf("bid: %ld\tm_u: %ld\n", blockIdx.x, num_down[bid]); } } //////////////////////////////////////////////////////////// // find the maximum of m_u[i] - m_d[i], i=0,...,size-1 -template -__global__ void max_diff_kernel(position_t *m_u, position_t *m_d, - position_t size, ArrayT* subarray, - position_t *max_diff, - int *arg_max) -{ - __shared__ position_t diff_array[bsize]; +template < class ArrayT, uint bsize > +__global__ void +max_diff_kernel( position_t* m_u, + position_t* m_d, + position_t size, + ArrayT* subarray, + position_t* max_diff, + int* arg_max ) +{ + __shared__ position_t diff_array[ bsize ]; __shared__ int shared_arg_max; int i = threadIdx.x; - if (i >= size) return; - position_t sub_size = subarray[i].size; - if (i == 0) { + if ( i >= size ) + { + return; + } + position_t sub_size = subarray[ i ].size; + if ( i == 0 ) + { shared_arg_max = 0; // index of maximum difference - if (sub_size <= 0) { - diff_array[0] = -1; + if ( sub_size <= 0 ) + { + diff_array[ 0 ] = -1; } } - if (sub_size > 0) { - diff_array[i] = m_u[i] - m_d[i]; + if ( sub_size > 0 ) + { + diff_array[ i ] = m_u[ i ] - m_d[ i ]; } __syncthreads(); - if (sub_size > 0) { + if ( sub_size > 0 ) + { #ifdef PRINT_VRB - printf("diff i: %d m_u:%ld m_d:%ld diff_array:%ld\n", i, m_u[i], m_d[i], - diff_array[i]); + printf( "diff i: %d m_u:%ld m_d:%ld diff_array:%ld\n", i, m_u[ i ], 
m_d[ i ], diff_array[ i ] ); #endif - atomicArgMax(diff_array, &shared_arg_max, i); + atomicArgMax( diff_array, &shared_arg_max, i ); } __syncthreads(); - - if (i == 0) { - *max_diff = diff_array[shared_arg_max]; + + if ( i == 0 ) + { + *max_diff = diff_array[ shared_arg_max ]; *arg_max = shared_arg_max; #ifdef PRINT_VRB - printf("Kernel max_diff: %ld\targ_max: %d\n", *max_diff, *arg_max); + printf( "Kernel max_diff: %ld\targ_max: %d\n", *max_diff, *arg_max ); #endif } -} - +} // check array element type, maybe replace with position_t -template -__global__ void prefix_scan(ElementT *array_in, ElementT *array_out, - uint k, uint n) +template < class ElementT, int bsize > +__global__ void +prefix_scan( ElementT* array_in, ElementT* array_out, uint k, uint n ) { - __shared__ ElementT shared_arr[bsize]; + __shared__ ElementT shared_arr[ bsize ]; int tid = threadIdx.x; - - if (2*tid+1 >= 2*n) return; - - int offset = 1; - - // copy input array to shared memory - if (2*tid < k) { - shared_arr[2*tid] = array_in[2*tid]; + + if ( 2 * tid + 1 >= 2 * n ) + { + return; + } + + int offset = 1; + + // copy input array to shared memory + if ( 2 * tid < k ) + { + shared_arr[ 2 * tid ] = array_in[ 2 * tid ]; } - else { - shared_arr[2*tid] = 0; + else + { + shared_arr[ 2 * tid ] = 0; } - if ((2*tid+1) < k) { - shared_arr[2*tid+1] = array_in[2*tid+1]; + if ( ( 2 * tid + 1 ) < k ) + { + shared_arr[ 2 * tid + 1 ] = array_in[ 2 * tid + 1 ]; } - else { - shared_arr[2*tid+1] = 0; + else + { + shared_arr[ 2 * tid + 1 ] = 0; } - - for (int d=n>>1; d>0; d>>=1) { + + for ( int d = n >> 1; d > 0; d >>= 1 ) + { __syncthreads(); - if (tid < d) { - int a = offset*(2*tid+1)-1; - int b = offset*(2*tid+2)-1; - shared_arr[b] += shared_arr[a]; + if ( tid < d ) + { + int a = offset * ( 2 * tid + 1 ) - 1; + int b = offset * ( 2 * tid + 2 ) - 1; + shared_arr[ b ] += shared_arr[ a ]; } offset *= 2; - } - if (tid == 0) { - shared_arr[n - 1] = 0; } - - for (int d=1; d>= 1; __syncthreads(); - if (tid < d) { 
- int a = offset*(2*tid+1)-1; - int b = offset*(2*tid+2)-1; - ElementT t = shared_arr[a]; - shared_arr[a] = shared_arr[b]; - shared_arr[b] += t; + if ( tid < d ) + { + int a = offset * ( 2 * tid + 1 ) - 1; + int b = offset * ( 2 * tid + 2 ) - 1; + ElementT t = shared_arr[ a ]; + shared_arr[ a ] = shared_arr[ b ]; + shared_arr[ b ] += t; } } - __syncthreads(); - if (2*tid < k+1) { - array_out[2*tid] = shared_arr[2*tid]; + __syncthreads(); + if ( 2 * tid < k + 1 ) + { + array_out[ 2 * tid ] = shared_arr[ 2 * tid ]; } - if (2*tid < k) { - array_out[2*tid+1] = shared_arr[2*tid+1]; + if ( 2 * tid < k ) + { + array_out[ 2 * tid + 1 ] = shared_arr[ 2 * tid + 1 ]; } -} - - +} // trova num. di elementi dell'array < val // in un array ordinato array[i+1]>=array[i] -template -__global__ void search_down(ElementT *array, position_t size, - ElementT val, position_t *num_down) +template < class ElementT, uint bsize > +__global__ void +search_down( ElementT* array, position_t size, ElementT val, position_t* num_down ) { - contiguous_array arr; + contiguous_array< ElementT > arr; arr.data_pt = array; arr.offset = 0; - search_block_down, bsize> - (arr, size, val, num_down); + search_block_down< ElementT, contiguous_array< ElementT >, bsize >( arr, size, val, num_down ); } // trova num. 
di elementi dell'array <= val // in un array ordinato array[i+1]>=array[i] -template -__global__ void search_up(ElementT *array, position_t size, - ElementT val, position_t *num_up) +template < class ElementT, uint bsize > +__global__ void +search_up( ElementT* array, position_t size, ElementT val, position_t* num_up ) { - contiguous_array arr; + contiguous_array< ElementT > arr; arr.data_pt = array; arr.offset = 0; - search_block_up, bsize> - (arr, size, val, num_up); + search_block_up< ElementT, contiguous_array< ElementT >, bsize >( arr, size, val, num_up ); } - -template -int search_multi_up(ArrayT *d_subarray, uint k, - KeyT *d_val_pt, position_t *d_num_up, - position_t *d_sum_num_up) +template < class KeyT, class ArrayT, uint bsize > +int +search_multi_up( ArrayT* d_subarray, uint k, KeyT* d_val_pt, position_t* d_num_up, position_t* d_sum_num_up ) { - gpuErrchk(cudaMemsetAsync(d_sum_num_up, 0, sizeof(position_t))); - search_multi_up_kernel <<>> - (d_subarray, d_val_pt, d_num_up, d_sum_num_up); + gpuErrchk( cudaMemsetAsync( d_sum_num_up, 0, sizeof( position_t ) ) ); + search_multi_up_kernel< KeyT, ArrayT, bsize > <<< k, bsize>>>( d_subarray, d_val_pt, d_num_up, d_sum_num_up ); DBGCUDASYNC - + return 0; } -template -int search_multi_down(ArrayT *d_subarray, - uint k, KeyT *d_val_pt, position_t *d_num_down, - position_t *d_sum_num_down) +template < class KeyT, class ArrayT, uint bsize > +int +search_multi_down( ArrayT* d_subarray, uint k, KeyT* d_val_pt, position_t* d_num_down, position_t* d_sum_num_down ) { - gpuErrchk(cudaMemsetAsync(d_sum_num_down, 0, sizeof(position_t))); - search_multi_down_kernel <<>> - (d_subarray, d_val_pt, d_num_down, d_sum_num_down); + gpuErrchk( cudaMemsetAsync( d_sum_num_down, 0, sizeof( position_t ) ) ); + search_multi_down_kernel< KeyT, ArrayT, bsize > <<< k, bsize>>>( d_subarray, d_val_pt, d_num_down, d_sum_num_down ); DBGCUDASYNC - + return 0; } -// atomically set old_index = *arg_max_pt, +// atomically set old_index = 
*arg_max_pt, // check whether array[index]>array[old_index]. // If it is true, set *arg_max_pt=index -template -__device__ int atomicKeyArgMax(KeyT *array, int *arg_max_pt, int index) +template < class KeyT > +__device__ int +atomicKeyArgMax( KeyT* array, int* arg_max_pt, int index ) { int old_index = *arg_max_pt; int assumed_index; - do { - if (old_index>=0 && array[old_index]>=array[index]) { + do + { + if ( old_index >= 0 && array[ old_index ] >= array[ index ] ) + { break; } assumed_index = old_index; - old_index = atomicCAS(arg_max_pt, assumed_index, index); - } while (assumed_index != old_index); - + old_index = atomicCAS( arg_max_pt, assumed_index, index ); + } while ( assumed_index != old_index ); + return old_index; } -// atomically set old_index = *arg_min_pt, +// atomically set old_index = *arg_min_pt, // check whether array[index] -__device__ int atomicKeyArgMin(KeyT *array, int *arg_min_pt, int index) +template < class KeyT > +__device__ int +atomicKeyArgMin( KeyT* array, int* arg_min_pt, int index ) { int old_index = *arg_min_pt; int assumed_index; - do { - if (old_index>=0 && array[old_index]<=array[index]) { + do + { + if ( old_index >= 0 && array[ old_index ] <= array[ index ] ) + { break; } assumed_index = old_index; - old_index = atomicCAS(arg_min_pt, assumed_index, index); - } while (assumed_index != old_index); - + old_index = atomicCAS( arg_min_pt, assumed_index, index ); + } while ( assumed_index != old_index ); + return old_index; } - -template -__global__ void threshold_range_kernel(ArrayT* subarray, - position_t tot_part_size, - uint k, KeyT *t_u, KeyT *t_d) +template < class KeyT, class ArrayT, uint bsize > +__global__ void +threshold_range_kernel( ArrayT* subarray, position_t tot_part_size, uint k, KeyT* t_u, KeyT* t_d ) { - __shared__ KeyT shared_t_u[bsize]; - __shared__ KeyT shared_t_d[bsize]; + __shared__ KeyT shared_t_u[ bsize ]; + __shared__ KeyT shared_t_d[ bsize ]; __shared__ int shared_arg_max; __shared__ int shared_arg_min; - if 
(threadIdx.x==0) { + if ( threadIdx.x == 0 ) + { shared_arg_max = -1; shared_arg_min = -1; } __syncthreads(); #ifdef PRINT_VRB - bool print_vrb = (threadIdx.x==0); + bool print_vrb = ( threadIdx.x == 0 ); #endif - int i=threadIdx.x; + int i = threadIdx.x; position_t sub_size; - if (i < k) { - //printf("i: %d\t sa pt: %lld\n", i, (long long int)subarray[i].data_pt); - sub_size = subarray[i].size; - if (sub_size > 0) { + if ( i < k ) + { + // printf("i: %d\t sa pt: %lld\n", i, (long long int)subarray[i].data_pt); + sub_size = subarray[ i ].size; + if ( sub_size > 0 ) + { position_t m0_u = tot_part_size; // (tot_part_size + k -2) / (k-1); // ceil (tot_part_size / k) position_t m0_d = tot_part_size / k; // floor (tot_part_size / k) #ifdef PRINT_VRB - if (print_vrb) printf("tot_part_size: %ld\n", tot_part_size); - if (print_vrb) printf("m0_u: %ld\tm0_d: %ld\n", m0_u, m0_d); + if ( print_vrb ) + { + printf( "tot_part_size: %ld\n", tot_part_size ); + } + if ( print_vrb ) + { + printf( "m0_u: %ld\tm0_d: %ld\n", m0_u, m0_d ); + } #endif // find the maximum of subarray[i][m_u] // and the minimum of subarray[i][m_d] - + // if the indexes are out of range put them in range - position_t m1_u = min(m0_u, sub_size); - position_t m1_d = min(m0_d, sub_size); - m1_u = max(m1_u - 1, (position_t)0); - m1_d = max(m1_d - 1, (position_t)0); + position_t m1_u = min( m0_u, sub_size ); + position_t m1_d = min( m0_d, sub_size ); + m1_u = max( m1_u - 1, ( position_t ) 0 ); + m1_d = max( m1_d - 1, ( position_t ) 0 ); #ifdef PRINT_VRB - printf("i: %d\tm1_u: %ld\tm1_d: %ld\tsubarray_size: %ld\n", i, - m1_u, m1_d, sub_size); + printf( "i: %d\tm1_u: %ld\tm1_d: %ld\tsubarray_size: %ld\n", i, m1_u, m1_d, sub_size ); #endif - // update upper and lower limit of threshold range - shared_t_u[i] = getKey(subarray[i], m1_u); - shared_t_d[i] = getKey(subarray[i], m1_d); + // update upper and lower limit of threshold range + shared_t_u[ i ] = getKey( subarray[ i ], m1_u ); + shared_t_d[ i ] = getKey( 
subarray[ i ], m1_d ); #ifdef PRINT_VRB - printf("i: %d\tshared_t_u: %d\tshared_t_d: %d\n", i, shared_t_u[i], - shared_t_d[i]); + printf( "i: %d\tshared_t_u: %d\tshared_t_d: %d\n", i, shared_t_u[ i ], shared_t_d[ i ] ); #endif } } #ifdef PRINT_VRB __syncthreads(); - if (i==0) { - for (int j=0; j 0) { - atomicKeyArgMax(shared_t_u, &shared_arg_max, i); - atomicKeyArgMin(shared_t_d, &shared_arg_min, i); + if ( i < k && sub_size > 0 ) + { + atomicKeyArgMax( shared_t_u, &shared_arg_max, i ); + atomicKeyArgMin( shared_t_d, &shared_arg_min, i ); #ifdef PRINT_VRB - printf("i: %d\tshared_t_u: %d\tshared_arg_max: %d\n", i, shared_t_u[i], - shared_arg_max); - printf("i: %d\tshared_t_d: %d\tshared_arg_min: %d\n", i, shared_t_u[i], - shared_arg_max); + printf( "i: %d\tshared_t_u: %d\tshared_arg_max: %d\n", i, shared_t_u[ i ], shared_arg_max ); + printf( "i: %d\tshared_t_d: %d\tshared_arg_min: %d\n", i, shared_t_u[ i ], shared_arg_max ); #endif } __syncthreads(); - - if (threadIdx.x==0) { - *t_u = shared_t_u[shared_arg_max]; - *t_d = shared_t_d[shared_arg_min]; + + if ( threadIdx.x == 0 ) + { + *t_u = shared_t_u[ shared_arg_max ]; + *t_d = shared_t_d[ shared_arg_min ]; #ifdef PRINT_VRB - printf("Kernel t_u: %d\tt_d: %d\n", *t_u, *t_d); + printf( "Kernel t_u: %d\tt_d: %d\n", *t_u, *t_d ); #endif } -} +} -template -__global__ void eval_t_tilde_kernel(ArrayT *subarray, - position_t *m_u, position_t *m_d, - int *arg_max, KeyT *t_tilde) +template < class KeyT, class ArrayT > +__global__ void +eval_t_tilde_kernel( ArrayT* subarray, position_t* m_u, position_t* m_d, int* arg_max, KeyT* t_tilde ) { int i = *arg_max; - int m_tilde = (m_u[i] + m_d[i])/2; - m_tilde = max(m_tilde - 1, 0); - *t_tilde = getKey(subarray[i], m_tilde); - //printf("m_tilde: %d\t *t_tilde: %d\n", m_tilde, *t_tilde); -} - -template -__global__ void case2_extra_elems_kernel(ArrayT *subarray, - uint k, position_t *m_d, - position_t *m_u, - KeyT *extra_elem, - int *extra_elem_idx, - int *n_extra_elems) + int m_tilde = 
( m_u[ i ] + m_d[ i ] ) / 2; + m_tilde = max( m_tilde - 1, 0 ); + *t_tilde = getKey( subarray[ i ], m_tilde ); + // printf("m_tilde: %d\t *t_tilde: %d\n", m_tilde, *t_tilde); +} + +template < class KeyT, class ArrayT > +__global__ void +case2_extra_elems_kernel( ArrayT* subarray, + uint k, + position_t* m_d, + position_t* m_u, + KeyT* extra_elem, + int* extra_elem_idx, + int* n_extra_elems ) { int i = threadIdx.x; - if (i == 0) { + if ( i == 0 ) + { *n_extra_elems = 0; } __syncthreads(); - - if (i >= k) return; - int sub_size = (int)subarray[i].size; - if (sub_size <= 0) return; - - if (m_u[i] > m_d[i]) { - int i_elem = atomicAdd(n_extra_elems, 1); - extra_elem[i_elem] = getKey(subarray[i], m_d[i]); - extra_elem_idx[i_elem] = i; + + if ( i >= k ) + { + return; + } + int sub_size = ( int ) subarray[ i ].size; + if ( sub_size <= 0 ) + { + return; } -} + if ( m_u[ i ] > m_d[ i ] ) + { + int i_elem = atomicAdd( n_extra_elems, 1 ); + extra_elem[ i_elem ] = getKey( subarray[ i ], m_d[ i ] ); + extra_elem_idx[ i_elem ] = i; + } +} -template -__global__ void extract_partitions_kernel(ArrayT *subarray, - uint k, position_t *part_size, - position_t *part_size_cumul, - AuxArrayT aux_array) +template < class ElementT, class ArrayT, class AuxArrayT > +__global__ void +extract_partitions_kernel( ArrayT* subarray, + uint k, + position_t* part_size, + position_t* part_size_cumul, + AuxArrayT aux_array ) { const int i_arr = blockIdx.x; - position_t size_i_arr = part_size[i_arr]; - position_t i_aux_offset = part_size_cumul[i_arr]; - - for (position_t i_elem = threadIdx.x; i_elem < size_i_arr; - i_elem += blockDim.x) { + position_t size_i_arr = part_size[ i_arr ]; + position_t i_aux_offset = part_size_cumul[ i_arr ]; + + for ( position_t i_elem = threadIdx.x; i_elem < size_i_arr; i_elem += blockDim.x ) + { position_t i_aux = i_aux_offset + i_elem; - ElementT elem = getElem(subarray[i_arr], i_elem); - setElem(aux_array, i_aux, elem); + ElementT elem = getElem( subarray[ i_arr ], 
i_elem ); + setElem( aux_array, i_aux, elem ); } } -template -void __global__ CopyArray(TargetArray target_arr, SourceArray source_arr) +template < class ElementT, class TargetArray, class SourceArray > +void __global__ +CopyArray( TargetArray target_arr, SourceArray source_arr ) { position_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= source_arr.size) return; - - ElementT elem = getElem(source_arr, i); - setElem(target_arr, i, elem); + if ( i >= source_arr.size ) + { + return; + } + + ElementT elem = getElem( source_arr, i ); + setElem( target_arr, i, elem ); } -template -void contiguousTranslate(contiguous_array &arr, - position_t transl, char *d_buffer, - position_t buffer_size) +template < class KeyT > +void +contiguousTranslate( contiguous_array< KeyT >& arr, position_t transl, char* d_buffer, position_t buffer_size ) { position_t elem_num = arr.size; position_t s_pos0 = arr.offset; position_t t_pos0 = arr.offset + transl; - - if (transl>=elem_num) { - gpuErrchk(cudaMemcpyAsync(&arr.data_pt[t_pos0], - &arr.data_pt[s_pos0], - elem_num*sizeof(KeyT), cudaMemcpyDeviceToDevice)); + + if ( transl >= elem_num ) + { + gpuErrchk( cudaMemcpyAsync( + &arr.data_pt[ t_pos0 ], &arr.data_pt[ s_pos0 ], elem_num * sizeof( KeyT ), cudaMemcpyDeviceToDevice ) ); } - else { - GPUMemCpyBuffered((char*)&arr.data_pt[t_pos0], - (char*)&arr.data_pt[s_pos0], - elem_num*sizeof(KeyT), d_buffer, buffer_size); + else + { + GPUMemCpyBuffered( ( char* ) &arr.data_pt[ t_pos0 ], + ( char* ) &arr.data_pt[ s_pos0 ], + elem_num * sizeof( KeyT ), + d_buffer, + buffer_size ); } arr.offset += transl; } -template -void contiguousTranslate(contiguous_key_value &arr, - position_t transl, char *d_buffer, - position_t buffer_size) +template < class KeyT, class ValueT > +void +contiguousTranslate( contiguous_key_value< KeyT, ValueT >& arr, + position_t transl, + char* d_buffer, + position_t buffer_size ) { position_t elem_num = arr.size; position_t s_pos0 = arr.offset; position_t t_pos0 = 
arr.offset + transl; - - if (transl>=elem_num) { - gpuErrchk(cudaMemcpyAsync(&arr.key_pt[t_pos0], - &arr.key_pt[s_pos0], - elem_num*sizeof(KeyT), cudaMemcpyDeviceToDevice)); - gpuErrchk(cudaMemcpyAsync(&arr.value_pt[t_pos0], - &arr.value_pt[s_pos0], - elem_num*sizeof(ValueT), - cudaMemcpyDeviceToDevice)); + + if ( transl >= elem_num ) + { + gpuErrchk( cudaMemcpyAsync( + &arr.key_pt[ t_pos0 ], &arr.key_pt[ s_pos0 ], elem_num * sizeof( KeyT ), cudaMemcpyDeviceToDevice ) ); + gpuErrchk( cudaMemcpyAsync( + &arr.value_pt[ t_pos0 ], &arr.value_pt[ s_pos0 ], elem_num * sizeof( ValueT ), cudaMemcpyDeviceToDevice ) ); } - else { - GPUMemCpyBuffered((char*)&arr.key_pt[t_pos0], - (char*)&arr.key_pt[s_pos0], - elem_num*sizeof(KeyT), d_buffer, buffer_size); - GPUMemCpyBuffered((char*)&arr.value_pt[t_pos0], - (char*)&arr.value_pt[s_pos0], - elem_num*sizeof(ValueT), d_buffer, buffer_size); + else + { + GPUMemCpyBuffered( ( char* ) &arr.key_pt[ t_pos0 ], + ( char* ) &arr.key_pt[ s_pos0 ], + elem_num * sizeof( KeyT ), + d_buffer, + buffer_size ); + GPUMemCpyBuffered( ( char* ) &arr.value_pt[ t_pos0 ], + ( char* ) &arr.value_pt[ s_pos0 ], + elem_num * sizeof( ValueT ), + d_buffer, + buffer_size ); } arr.offset += transl; } -template -void CopyRegion(regular_block_key_value &arr, - int t_ib, position_t t_j0, int s_ib, position_t s_j0, - position_t elem_num, char *d_buffer, - position_t buffer_size) { +template < class KeyT, class ValueT > +void +CopyRegion( regular_block_key_value< KeyT, ValueT >& arr, + int t_ib, + position_t t_j0, + int s_ib, + position_t s_j0, + position_t elem_num, + char* d_buffer, + position_t buffer_size ) +{ position_t transl = t_j0 - s_j0; - if (t_ib != s_ib || transl>=elem_num) { - gpuErrchk(cudaMemcpyAsync(&arr.h_key_pt[t_ib][t_j0], - &arr.h_key_pt[s_ib][s_j0], - elem_num*sizeof(KeyT), cudaMemcpyDeviceToDevice)); - gpuErrchk(cudaMemcpyAsync(&arr.h_value_pt[t_ib][t_j0], - &arr.h_value_pt[s_ib][s_j0], - elem_num*sizeof(ValueT), - cudaMemcpyDeviceToDevice)); 
+ if ( t_ib != s_ib || transl >= elem_num ) + { + gpuErrchk( cudaMemcpyAsync( &arr.h_key_pt[ t_ib ][ t_j0 ], + &arr.h_key_pt[ s_ib ][ s_j0 ], + elem_num * sizeof( KeyT ), + cudaMemcpyDeviceToDevice ) ); + gpuErrchk( cudaMemcpyAsync( &arr.h_value_pt[ t_ib ][ t_j0 ], + &arr.h_value_pt[ s_ib ][ s_j0 ], + elem_num * sizeof( ValueT ), + cudaMemcpyDeviceToDevice ) ); } - else { - GPUMemCpyBuffered((char*)&arr.h_key_pt[t_ib][t_j0], - (char*)&arr.h_key_pt[s_ib][s_j0], - elem_num*sizeof(KeyT), d_buffer, buffer_size); - GPUMemCpyBuffered((char*)&arr.h_value_pt[t_ib][t_j0], - (char*)&arr.h_value_pt[s_ib][s_j0], - elem_num*sizeof(ValueT), d_buffer, buffer_size); + else + { + GPUMemCpyBuffered( ( char* ) &arr.h_key_pt[ t_ib ][ t_j0 ], + ( char* ) &arr.h_key_pt[ s_ib ][ s_j0 ], + elem_num * sizeof( KeyT ), + d_buffer, + buffer_size ); + GPUMemCpyBuffered( ( char* ) &arr.h_value_pt[ t_ib ][ t_j0 ], + ( char* ) &arr.h_value_pt[ s_ib ][ s_j0 ], + elem_num * sizeof( ValueT ), + d_buffer, + buffer_size ); } } -template -void CopyRegion(regular_block_array &arr, - int t_ib, position_t t_j0, int s_ib, position_t s_j0, - position_t elem_num, char *d_buffer, - position_t buffer_size) { +template < class KeyT > +void +CopyRegion( regular_block_array< KeyT >& arr, + int t_ib, + position_t t_j0, + int s_ib, + position_t s_j0, + position_t elem_num, + char* d_buffer, + position_t buffer_size ) +{ position_t transl = t_j0 - s_j0; - if (t_ib != s_ib || transl>=elem_num) { - gpuErrchk(cudaMemcpyAsync(&arr.h_data_pt[t_ib][t_j0], - &arr.h_data_pt[s_ib][s_j0], - elem_num*sizeof(KeyT), cudaMemcpyDeviceToDevice)); + if ( t_ib != s_ib || transl >= elem_num ) + { + gpuErrchk( cudaMemcpyAsync( &arr.h_data_pt[ t_ib ][ t_j0 ], + &arr.h_data_pt[ s_ib ][ s_j0 ], + elem_num * sizeof( KeyT ), + cudaMemcpyDeviceToDevice ) ); } - else { - GPUMemCpyBuffered((char*)&arr.h_data_pt[t_ib][t_j0], - (char*)&arr.h_data_pt[s_ib][s_j0], - elem_num*sizeof(KeyT), d_buffer, buffer_size); + else + { + GPUMemCpyBuffered( ( 
char* ) &arr.h_data_pt[ t_ib ][ t_j0 ], + ( char* ) &arr.h_data_pt[ s_ib ][ s_j0 ], + elem_num * sizeof( KeyT ), + d_buffer, + buffer_size ); } } - -template -void regularBlockTranslate(ArrayT &arr, - position_t transl, char *d_buffer, - position_t buffer_size) +template < class ArrayT > +void +regularBlockTranslate( ArrayT& arr, position_t transl, char* d_buffer, position_t buffer_size ) { position_t elem_num = arr.size; position_t s_pos1 = arr.offset + elem_num - 1; - int s_ib1 = (int)(s_pos1 / arr.block_size); + int s_ib1 = ( int ) ( s_pos1 / arr.block_size ); position_t s_j1 = s_pos1 % arr.block_size; position_t t_pos1 = arr.offset + transl + elem_num - 1; - int t_ib1 = (int)(t_pos1 / arr.block_size); + int t_ib1 = ( int ) ( t_pos1 / arr.block_size ); position_t t_j1 = t_pos1 % arr.block_size; - + position_t s_num1 = s_j1 + 1; position_t t_num1 = t_j1 + 1; - if (t_num1 -void Translate(contiguous_array &arr, - position_t transl, char *d_buffer, position_t buffer_size) +template < class KeyT > +void +Translate( contiguous_array< KeyT >& arr, position_t transl, char* d_buffer, position_t buffer_size ) { - contiguousTranslate(arr, transl, d_buffer, buffer_size); + contiguousTranslate( arr, transl, d_buffer, buffer_size ); } -template -void Translate(contiguous_key_value &arr, - position_t transl, char *d_buffer, position_t buffer_size) +template < class KeyT, class ValueT > +void +Translate( contiguous_key_value< KeyT, ValueT >& arr, position_t transl, char* d_buffer, position_t buffer_size ) { - contiguousTranslate(arr, transl, d_buffer, buffer_size); + contiguousTranslate( arr, transl, d_buffer, buffer_size ); } -template -void Translate(regular_block_array &arr, - position_t transl, char *d_buffer, position_t buffer_size) +template < class KeyT > +void +Translate( regular_block_array< KeyT >& arr, position_t transl, char* d_buffer, position_t buffer_size ) { - regularBlockTranslate(arr, transl, d_buffer, buffer_size); + regularBlockTranslate( arr, transl, 
d_buffer, buffer_size ); } -template -void Translate(regular_block_key_value &arr, - position_t transl, char *d_buffer, position_t buffer_size) +template < class KeyT, class ValueT > +void +Translate( regular_block_key_value< KeyT, ValueT >& arr, position_t transl, char* d_buffer, position_t buffer_size ) { - regularBlockTranslate(arr, transl, d_buffer, buffer_size); + regularBlockTranslate( arr, transl, d_buffer, buffer_size ); } -template -void repack(ArrayT *h_subarray, - uint k, position_t *part_size, char *d_buffer, - position_t buffer_size) +template < class ArrayT > +void +repack( ArrayT* h_subarray, uint k, position_t* part_size, char* d_buffer, position_t buffer_size ) { - position_t psize = part_size[k-1]; - h_subarray[k-1].offset += psize; - h_subarray[k-1].size -= psize; + position_t psize = part_size[ k - 1 ]; + h_subarray[ k - 1 ].offset += psize; + h_subarray[ k - 1 ].size -= psize; position_t transl = psize; // translation of last subarray // to be updated for each subarray // move blocks of memory to the right in reverse order - for (int i_arr=k-2; i_arr>=0; i_arr--) { - position_t sub_size = h_subarray[i_arr].size; - if (sub_size <= 0) continue; - - psize = part_size[i_arr]; - h_subarray[i_arr].offset += psize; - h_subarray[i_arr].size -= psize; - Translate(h_subarray[i_arr], transl, d_buffer, buffer_size); + for ( int i_arr = k - 2; i_arr >= 0; i_arr-- ) + { + position_t sub_size = h_subarray[ i_arr ].size; + if ( sub_size <= 0 ) + { + continue; + } + + psize = part_size[ i_arr ]; + h_subarray[ i_arr ].offset += psize; + h_subarray[ i_arr ].size -= psize; + Translate( h_subarray[ i_arr ], transl, d_buffer, buffer_size ); transl += psize; - } + } } - #endif diff --git a/src/copass_sort.cu b/src/copass_sort.cu index 1be9d81a8..b044f703c 100644 --- a/src/copass_sort.cu +++ b/src/copass_sort.cu @@ -12,79 +12,77 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . 
*/ -#include +#include "copass_kernels.h" +#include "copass_sort.h" +#include "cuda_error.h" #include -#include #include -#include +#include +#include #include -#include "cuda_error.h" -#include "copass_kernels.h" -#include "copass_sort.h" +#include const bool print_gpu_cpu_vrb = false; namespace copass_sort { - uint k_; - position_t block_size_; - void *d_aux_array_key_pt_; - void *d_aux_array_value_pt_; - position_t *h_part_size_; - position_t *d_part_size_; -} - +uint k_; +position_t block_size_; +void* d_aux_array_key_pt_; +void* d_aux_array_value_pt_; +position_t* h_part_size_; +position_t* d_part_size_; +} // namespace copass_sort - - -int copass_sort::last_step(position_t *local_d_m_d, position_t *local_d_m_u, - position_t *local_d_sum_m_d, - position_t local_h_sum_m_d, - position_t tot_part_size, - uint k, uint kp_next_pow_2, - position_t *d_part_size, position_t *d_diff, - position_t *d_diff_cumul, position_t *h_diff, - position_t *h_diff_cumul, position_t *d_num_down) +int +copass_sort::last_step( position_t* local_d_m_d, + position_t* local_d_m_u, + position_t* local_d_sum_m_d, + position_t local_h_sum_m_d, + position_t tot_part_size, + uint k, + uint kp_next_pow_2, + position_t* d_part_size, + position_t* d_diff, + position_t* d_diff_cumul, + position_t* h_diff, + position_t* h_diff_cumul, + position_t* d_num_down ) { - diffKernel<<<1, k>>>(d_diff, local_d_m_u, local_d_m_d, k); - DBGCUDASYNC - prefix_scan<<<1, 512>>> - (d_diff, d_diff_cumul, k+1, kp_next_pow_2); - DBGCUDASYNC + diffKernel<<< 1, k>>>( d_diff, local_d_m_u, local_d_m_d, k ); + DBGCUDASYNC + prefix_scan< position_t, 1024 > <<< 1, 512 >>>( d_diff, d_diff_cumul, k + 1, kp_next_pow_2 ); + DBGCUDASYNC - gpuErrchk(cudaMemcpyAsync(h_diff, d_diff, k*sizeof(position_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(h_diff_cumul, d_diff_cumul, - (k + 1)*sizeof(position_t), - cudaMemcpyDeviceToHost)); - if (print_gpu_cpu_vrb) { - printf("h_diff: "); - for (uint i=0; i<<<1, 1024>>> - 
(d_diff_cumul+1, k, tot_diff, d_num_down); + printf( "\n" ); + printf( "h_diff_cumul: " ); + for ( uint i = 0; i < k + 1; i++ ) + { + printf( "%ld ", h_diff_cumul[ i ] ); + } + printf( "\n" ); + } + position_t tot_diff = tot_part_size - local_h_sum_m_d; + search_down< position_t, 1024 > <<< 1, 1024 >>>( d_diff_cumul + 1, k, tot_diff, d_num_down ); - copass_last_step_kernel<<<1, 1024>>>(d_part_size, local_d_m_d, k, - tot_diff, d_diff, d_diff_cumul, - d_num_down); - DBGCUDASYNC + copass_last_step_kernel<<< 1, 1024 >>>( d_part_size, local_d_m_d, k, tot_diff, d_diff, d_diff_cumul, d_num_down ); + DBGCUDASYNC - return 0; + return 0; } -position_t *copass_sort::get_part_size() +position_t* +copass_sort::get_part_size() { - gpuErrchk(cudaMemcpy(h_part_size_, d_part_size_, k_*sizeof(position_t), - cudaMemcpyDeviceToHost)); + gpuErrchk( cudaMemcpy( h_part_size_, d_part_size_, k_ * sizeof( position_t ), cudaMemcpyDeviceToHost ) ); return h_part_size_; } - diff --git a/src/copass_sort.h b/src/copass_sort.h index ecbcf8c62..a56b1199b 100644 --- a/src/copass_sort.h +++ b/src/copass_sort.h @@ -15,8 +15,9 @@ along with this program. If not, see . 
#ifndef COPASS_SORT_H #define COPASS_SORT_H -#include "cuda_error.h" #include "copass_kernels.h" +#include "cuda_error.h" +#include extern const bool print_gpu_cpu_vrb; @@ -25,618 +26,691 @@ extern uint last_i_sub; namespace copass_sort { - ////////////////////////////////////////// - // temporary, for testing - extern uint k_; - extern position_t block_size_; - extern void *d_aux_array_key_pt_; - extern void *d_aux_array_value_pt_; - extern position_t *h_part_size_; - extern position_t *d_part_size_; - //////////////////////////////////////////////////// - - template - int sort_template(KeyArrayT key_array, ArrayT *h_subarray, - uint k, position_t block_size, - void *d_storage, int64_t &st_bytes); - - template - int extract_partitions(ArrayT *d_subarray, uint k, - uint k_next_pow_2, - position_t *d_part_size, position_t *d_part_size_cumul, - AuxArrayT *d_aux_array); - - int last_step(position_t *local_d_m_d, position_t *local_d_m_u, - position_t *local_d_sum_m_d, - position_t local_h_sum_m_d, - position_t tot_part_size, - uint k, uint kp_next_pow_2, - position_t *d_part_size, position_t *d_diff, - position_t *d_diff_cumul, position_t *h_diff, - position_t *h_diff_cumul, position_t *d_num_down); - -template -int last_step_case2(ArrayT *d_subarray, - position_t tot_part_size, - uint k, position_t *d_part_size, - position_t *d_m_d, position_t *d_m_u, - position_t h_sum_m_d, - KeyT *d_extra_elem, - KeyT *h_extra_elem, - int *d_extra_elem_idx, - int *h_extra_elem_idx, - int *d_n_extra_elems); - - ////////////// Temporary for checking - template - KeyT *get_aux_array_keys(); - - template - ValueT *get_aux_array_values(); - - position_t *get_part_size(); - - template - int alloc(position_t n, position_t block_size); - - template - int sort(KeyT *d_keys); - -}; - - - -template -int copass_sort::extract_partitions(ArrayT *d_subarray, uint k, - uint k_next_pow_2, - position_t *d_part_size, - position_t *d_part_size_cumul, - AuxArrayT d_aux_array) 
+////////////////////////////////////////// +// temporary, for testing +extern uint k_; +extern position_t block_size_; +extern void* d_aux_array_key_pt_; +extern void* d_aux_array_value_pt_; +extern position_t* h_part_size_; +extern position_t* d_part_size_; +//////////////////////////////////////////////////// + +template < class KeyT, class ElementT, class KeyArrayT, class ArrayT, class AuxArrayT > +int sort_template( KeyArrayT key_array, + ArrayT* h_subarray, + uint k, + position_t block_size, + void* d_storage, + int64_t& st_bytes ); + +template < class ElementT, class ArrayT, class AuxArrayT > +int extract_partitions( ArrayT* d_subarray, + uint k, + uint k_next_pow_2, + position_t* d_part_size, + position_t* d_part_size_cumul, + AuxArrayT d_aux_array ); + +int last_step( position_t* local_d_m_d, + position_t* local_d_m_u, + position_t* local_d_sum_m_d, + position_t local_h_sum_m_d, + position_t tot_part_size, + uint k, + uint kp_next_pow_2, + position_t* d_part_size, + position_t* d_diff, + position_t* d_diff_cumul, + position_t* h_diff, + position_t* h_diff_cumul, + position_t* d_num_down ); + +template < class KeyT, class ArrayT > +int last_step_case2( ArrayT* d_subarray, + position_t tot_part_size, + uint k, + position_t* d_part_size, + position_t* d_m_d, + position_t* d_m_u, + position_t h_sum_m_d, + KeyT* d_extra_elem, + KeyT* h_extra_elem, + int* d_extra_elem_idx, + int* h_extra_elem_idx, + int* d_n_extra_elems ); + +////////////// Temporary for checking +template < class KeyT > +KeyT* get_aux_array_keys(); + +template < class ValueT > +ValueT* get_aux_array_values(); + +position_t* get_part_size(); + +template < class KeyT > +int alloc( position_t n, position_t block_size ); + +template < class KeyT > +int sort( KeyT* d_keys, position_t n, position_t block_size, void* d_storage, int64_t& st_bytes ); + +template < class KeyT, class ValueT > +int sort( KeyT* d_keys, ValueT* d_values, position_t n, position_t block_size, void* d_storage, int64_t& st_bytes 
); + +template < class KeyT > +int sort( KeyT** key_subarray, position_t n, position_t block_size, void* d_storage, int64_t& st_bytes ); + +template < class KeyT, class ValueT > +int sort( KeyT** key_subarray, + ValueT** value_subarray, + position_t n, + position_t block_size, + void* d_storage, + int64_t& st_bytes ); + +}; // namespace copass_sort + +template < class ElementT, class ArrayT, class AuxArrayT > +int +copass_sort::extract_partitions( ArrayT* d_subarray, + uint k, + uint k_next_pow_2, + position_t* d_part_size, + position_t* d_part_size_cumul, + AuxArrayT d_aux_array ) { - prefix_scan<<<1, 512>>>(d_part_size, d_part_size_cumul, k, - k_next_pow_2); - gpuErrchk(cudaPeekAtLastError()); - gpuErrchk(cudaDeviceSynchronize()); - - extract_partitions_kernel<<< k, 1024 >>> - (d_subarray, k, d_part_size, d_part_size_cumul, d_aux_array); - + prefix_scan< position_t, 1024 > <<< 1, 512 >>>( d_part_size, d_part_size_cumul, k, k_next_pow_2 ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); + + extract_partitions_kernel< ElementT, ArrayT, AuxArrayT > <<< k, 1024 >>>( + d_subarray, k, d_part_size, d_part_size_cumul, d_aux_array ); + DBGCUDASYNC - //gpuErrchk(cudaPeekAtLastError()); - //gpuErrchk(cudaDeviceSynchronize()); - + // gpuErrchk(cudaPeekAtLastError()); + // gpuErrchk(cudaDeviceSynchronize()); + return 0; } - -template -int copass_sort::last_step_case2(ArrayT *d_subarray, - position_t tot_part_size, - uint k, position_t *d_part_size, - position_t *d_m_d, position_t *d_m_u, - position_t h_sum_m_d, - KeyT *d_extra_elem, - KeyT *h_extra_elem, - int *d_extra_elem_idx, - int *h_extra_elem_idx, - int *d_n_extra_elems) +template < class KeyT, class ArrayT > +int +copass_sort::last_step_case2( ArrayT* d_subarray, + position_t tot_part_size, + uint k, + position_t* d_part_size, + position_t* d_m_d, + position_t* d_m_u, + position_t h_sum_m_d, + KeyT* d_extra_elem, + KeyT* h_extra_elem, + int* d_extra_elem_idx, + int* h_extra_elem_idx, + 
int* d_n_extra_elems ) { - gpuErrchk(cudaMemcpy(d_part_size, d_m_d, k*sizeof(position_t), - cudaMemcpyDeviceToDevice)); - + gpuErrchk( cudaMemcpy( d_part_size, d_m_d, k * sizeof( position_t ), cudaMemcpyDeviceToDevice ) ); + position_t tot_diff = tot_part_size - h_sum_m_d; // printf("kernel tot_diff: %ld\n", tot_diff); - - if (tot_diff > 0) { - case2_extra_elems_kernel<<<1, 1024>>> - (d_subarray, k, d_m_d, d_m_u, - d_extra_elem, d_extra_elem_idx, d_n_extra_elems); - - gpuErrchk(cudaPeekAtLastError()); - gpuErrchk(cudaDeviceSynchronize()); - + + if ( tot_diff > 0 ) + { + case2_extra_elems_kernel< KeyT, ArrayT > <<< 1, 1024 >>>( + d_subarray, k, d_m_d, d_m_u, d_extra_elem, d_extra_elem_idx, d_n_extra_elems ); + + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); + int n_extra_elems; - gpuErrchk(cudaMemcpy(&n_extra_elems, d_n_extra_elems, sizeof(int), - cudaMemcpyDeviceToHost)); - if (n_extra_elems < tot_diff) { - printf("Error in copass_last_step_case2_gpu. Not enough extra elements" - " to complete partitions\n"); - exit(EXIT_FAILURE); + gpuErrchk( cudaMemcpy( &n_extra_elems, d_n_extra_elems, sizeof( int ), cudaMemcpyDeviceToHost ) ); + if ( n_extra_elems < tot_diff ) + { + printf( + "Error in copass_last_step_case2_gpu. Not enough extra elements" + " to complete partitions\n" ); + exit( EXIT_FAILURE ); } //// !!!!!!!!! 
temporarily sort in CPU side using std::sort //// replace with cub sort directly in the GPU - gpuErrchk(cudaMemcpy(h_extra_elem, d_extra_elem, - n_extra_elems*sizeof(KeyT), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(h_extra_elem_idx, d_extra_elem_idx, - n_extra_elems*sizeof(int), - cudaMemcpyDeviceToHost)); + gpuErrchk( cudaMemcpy( h_extra_elem, d_extra_elem, n_extra_elems * sizeof( KeyT ), cudaMemcpyDeviceToHost ) ); + gpuErrchk( + cudaMemcpy( h_extra_elem_idx, d_extra_elem_idx, n_extra_elems * sizeof( int ), cudaMemcpyDeviceToHost ) ); // build pair vector - std::vector> extra_elem_and_idx; - for (int i=0; i p(h_extra_elem[i], h_extra_elem_idx[i]); - extra_elem_and_idx.push_back(p); + std::vector< std::pair< KeyT, int > > extra_elem_and_idx; + for ( int i = 0; i < n_extra_elems; i++ ) + { + std::pair< KeyT, int > p( h_extra_elem[ i ], h_extra_elem_idx[ i ] ); + extra_elem_and_idx.push_back( p ); } // sort pair - std::sort(extra_elem_and_idx.begin(), extra_elem_and_idx.end()); + std::sort( extra_elem_and_idx.begin(), extra_elem_and_idx.end() ); //, [](auto &left, auto &right) {return left.second < right.second;); // extract indexes from sorted vector - for (int i=0; i>>(d_part_size, - d_extra_elem_idx, tot_diff); - gpuErrchk(cudaPeekAtLastError()); - gpuErrchk(cudaDeviceSynchronize()); + + case2_inc_partitions_kernel<<< 1, 1024 >>>( d_part_size, d_extra_elem_idx, tot_diff ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); } - + return 0; } -template -int copass_sort::sort_template(KeyArrayT key_array, ArrayT *h_subarray, - uint k, position_t block_size, - void *d_storage, int64_t &st_bytes) +template < class KeyT, class ElementT, class KeyArrayT, class ArrayT, class AuxArrayT > +int +copass_sort::sort_template( KeyArrayT key_array, + ArrayT* h_subarray, + uint k, + position_t block_size, + void* d_storage, + int64_t& st_bytes ) { ////////////////////////////////////////////////////////////////////// // uint k; // number of 
subarrays // position_t block_size; // size of auxiliary array for storage ////////////////////////////////////////////////////////////////////// const int buffer_fract = 5; - - ArrayT *d_subarray; - + + ArrayT* d_subarray; + AuxArrayT d_aux_array; - position_t *h_part_size; // size of extracted partitions - position_t *d_part_size; - position_t *d_part_size_cumul; - - position_t *d_m_u; - position_t *d_m_d; - position_t *d_mu_u; - position_t *d_mu_d; - position_t *d_sum_m_u; - position_t *d_sum_m_d; - position_t *d_sum_mu_u; - position_t *d_sum_mu_d; - - KeyT *d_t_u; - KeyT *d_t_d; - - position_t *h_m_u; - position_t *h_m_d; + position_t* h_part_size; // size of extracted partitions + position_t* d_part_size; + position_t* d_part_size_cumul; + + position_t* d_m_u; + position_t* d_m_d; + position_t* d_mu_u; + position_t* d_mu_d; + position_t* d_sum_m_u; + position_t* d_sum_m_d; + position_t* d_sum_mu_u; + position_t* d_sum_mu_d; + + KeyT* d_t_u; + KeyT* d_t_d; + + position_t* h_m_u; + position_t* h_m_d; position_t h_sum_m_u; position_t h_sum_m_d; - - position_t *h_mu_u; - position_t *h_mu_d; + + position_t* h_mu_u; + position_t* h_mu_d; position_t h_sum_mu_u; position_t h_sum_mu_d; - - position_t *d_diff; - position_t *d_diff_cumul; - - position_t *h_diff; - position_t *h_diff_cumul; - position_t *d_num_down; + position_t* d_diff; + position_t* d_diff_cumul; + + position_t* h_diff; + position_t* h_diff_cumul; + + position_t* d_num_down; - position_t *d_max_diff; - int *d_arg_max; + position_t* d_max_diff; + int* d_arg_max; - KeyT *d_t_tilde; + KeyT* d_t_tilde; uint k_next_pow_2; uint kp_next_pow_2; - KeyT *d_extra_elem; - KeyT *h_extra_elem; - - int *d_extra_elem_idx; - int *h_extra_elem_idx; + KeyT* d_extra_elem; + KeyT* h_extra_elem; - int *d_n_extra_elems; + int* d_extra_elem_idx; + int* h_extra_elem_idx; - char *d_buffer; - - array_GPUMalloc(d_storage, st_bytes, d_aux_array, block_size); - - position_t buffer_size = block_size/buffer_fract; - 
cudaReusableAlloc(d_storage, st_bytes, &d_buffer, buffer_size, sizeof(char)); + int* d_n_extra_elems; - h_part_size = new position_t[k]; - cudaReusableAlloc(d_storage, st_bytes, &d_part_size, k, sizeof(position_t)); + char* d_buffer; - cudaReusableAlloc(d_storage, st_bytes, &d_part_size_cumul, - (k + 1), sizeof(position_t)); + array_GPUMalloc( d_storage, st_bytes, d_aux_array, block_size ); - cudaReusableAlloc(d_storage, st_bytes, &d_m_u, k, sizeof(position_t)); - cudaReusableAlloc(d_storage, st_bytes, &d_m_d, k, sizeof(position_t)); - cudaReusableAlloc(d_storage, st_bytes, &d_mu_u, k, sizeof(position_t)); - cudaReusableAlloc(d_storage, st_bytes, &d_mu_d, k, sizeof(position_t)); + position_t buffer_size = block_size / buffer_fract; + cudaReusableAlloc( d_storage, st_bytes, &d_buffer, buffer_size, sizeof( char ) ); - cudaReusableAlloc(d_storage, st_bytes, &d_sum_m_u, 1, sizeof(position_t)); - cudaReusableAlloc(d_storage, st_bytes, &d_sum_m_d, 1, sizeof(position_t)); + h_part_size = new position_t[ k ]; + cudaReusableAlloc( d_storage, st_bytes, &d_part_size, k, sizeof( position_t ) ); - cudaReusableAlloc(d_storage, st_bytes, &d_sum_mu_u, 1, sizeof(position_t)); - cudaReusableAlloc(d_storage, st_bytes, &d_sum_mu_d, 1, sizeof(position_t)); + cudaReusableAlloc( d_storage, st_bytes, &d_part_size_cumul, ( k + 1 ), sizeof( position_t ) ); - cudaReusableAlloc(d_storage, st_bytes, &d_t_u, 1, sizeof(KeyT)); - cudaReusableAlloc(d_storage, st_bytes, &d_t_d, 1, sizeof(KeyT)); + cudaReusableAlloc( d_storage, st_bytes, &d_m_u, k, sizeof( position_t ) ); + cudaReusableAlloc( d_storage, st_bytes, &d_m_d, k, sizeof( position_t ) ); + cudaReusableAlloc( d_storage, st_bytes, &d_mu_u, k, sizeof( position_t ) ); + cudaReusableAlloc( d_storage, st_bytes, &d_mu_d, k, sizeof( position_t ) ); - h_m_u = new position_t[k]; - h_m_d = new position_t[k]; - - h_mu_u = new position_t[k]; - h_mu_d = new position_t[k]; + cudaReusableAlloc( d_storage, st_bytes, &d_sum_m_u, 1, sizeof( position_t ) ); + 
cudaReusableAlloc( d_storage, st_bytes, &d_sum_m_d, 1, sizeof( position_t ) ); + + cudaReusableAlloc( d_storage, st_bytes, &d_sum_mu_u, 1, sizeof( position_t ) ); + cudaReusableAlloc( d_storage, st_bytes, &d_sum_mu_d, 1, sizeof( position_t ) ); + + cudaReusableAlloc( d_storage, st_bytes, &d_t_u, 1, sizeof( KeyT ) ); + cudaReusableAlloc( d_storage, st_bytes, &d_t_d, 1, sizeof( KeyT ) ); + + h_m_u = new position_t[ k ]; + h_m_d = new position_t[ k ]; + + h_mu_u = new position_t[ k ]; + h_mu_d = new position_t[ k ]; // use one more element (k+1) to avoid illegal memory access of // subsequent use of the arrays in prefix scan - cudaReusableAlloc(d_storage, st_bytes, &d_diff, k + 1, sizeof(position_t)); - cudaReusableAlloc(d_storage, st_bytes, &d_diff_cumul, k + 1, - sizeof(position_t)); + cudaReusableAlloc( d_storage, st_bytes, &d_diff, k + 1, sizeof( position_t ) ); + cudaReusableAlloc( d_storage, st_bytes, &d_diff_cumul, k + 1, sizeof( position_t ) ); - h_diff = new position_t[k]; - h_diff_cumul = new position_t[k+1]; + h_diff = new position_t[ k ]; + h_diff_cumul = new position_t[ k + 1 ]; - cudaReusableAlloc(d_storage, st_bytes, &d_num_down, 1, sizeof(position_t)); + cudaReusableAlloc( d_storage, st_bytes, &d_num_down, 1, sizeof( position_t ) ); - cudaReusableAlloc(d_storage, st_bytes, &d_max_diff, 1, sizeof(position_t)); - cudaReusableAlloc(d_storage, st_bytes, &d_arg_max, 1, sizeof(int)); + cudaReusableAlloc( d_storage, st_bytes, &d_max_diff, 1, sizeof( position_t ) ); + cudaReusableAlloc( d_storage, st_bytes, &d_arg_max, 1, sizeof( int ) ); - cudaReusableAlloc(d_storage, st_bytes, &d_t_tilde, 1, sizeof(KeyT)); + cudaReusableAlloc( d_storage, st_bytes, &d_t_tilde, 1, sizeof( KeyT ) ); - k_next_pow_2 = nextPowerOf2(k); - kp_next_pow_2 = nextPowerOf2(k+1); + k_next_pow_2 = nextPowerOf2( k ); + kp_next_pow_2 = nextPowerOf2( k + 1 ); - cudaReusableAlloc(d_storage, st_bytes, &d_extra_elem, k, sizeof(KeyT)); - h_extra_elem = new KeyT[k]; + cudaReusableAlloc( d_storage, 
st_bytes, &d_extra_elem, k, sizeof( KeyT ) ); + h_extra_elem = new KeyT[ k ]; - cudaReusableAlloc(d_storage, st_bytes, &d_extra_elem_idx, k, sizeof(int)); - h_extra_elem_idx = new int[k]; + cudaReusableAlloc( d_storage, st_bytes, &d_extra_elem_idx, k, sizeof( int ) ); + h_extra_elem_idx = new int[ k ]; - cudaReusableAlloc(d_storage, st_bytes, &d_n_extra_elems, 1, sizeof(int)); + cudaReusableAlloc( d_storage, st_bytes, &d_n_extra_elems, 1, sizeof( int ) ); - cudaReusableAlloc(d_storage, st_bytes, &d_subarray, k, sizeof(ArrayT)); + cudaReusableAlloc( d_storage, st_bytes, &d_subarray, k, sizeof( ArrayT ) ); // if d_storage==NULL this function should only evaluate the storage bytes - if (d_storage == NULL) { + if ( d_storage == NULL ) + { int64_t align_bytes = 256; - int64_t align_mask = ~(align_bytes - 1); + int64_t align_mask = ~( align_bytes - 1 ); + + st_bytes = ( st_bytes + align_bytes - 1 ) & align_mask; - st_bytes = (st_bytes + align_bytes - 1) & align_mask; - return 0; } - gpuErrchk(cudaMemcpyAsync(d_subarray, h_subarray, - k*sizeof(ArrayT), cudaMemcpyHostToDevice)); + gpuErrchk( cudaMemcpyAsync( d_subarray, h_subarray, k * sizeof( ArrayT ), cudaMemcpyHostToDevice ) ); ///// TEMPORARY, FOR TESTING k_ = k; block_size_ = block_size; h_part_size_ = h_part_size; d_part_size_ = d_part_size; - d_aux_array_key_pt_ = getKeyPt(d_aux_array); - d_aux_array_value_pt_ = getValuePt(d_aux_array); - + d_aux_array_key_pt_ = getKeyPt( d_aux_array ); + d_aux_array_value_pt_ = getValuePt( d_aux_array ); + //////////////////// serve???????!!!!!!!!!! 
position_t tot_part_size = block_size; - ArrayT target_array[k-1]; - for (uint i=0; i <<<1, k>>> - (d_subarray, block_size, k, d_t_u, d_t_d); + for ( uint i_sub = 0; i_sub < k - 1; i_sub++ ) + { + threshold_range_kernel< KeyT, ArrayT, 1024 > <<< 1, k>>>( d_subarray, block_size, k, d_t_u, d_t_d ); - //DBGCUDASYNC + // DBGCUDASYNC CUDASYNC - search_multi_down - (d_subarray, k, d_t_u, d_m_u, d_sum_m_u); + search_multi_down< KeyT, ArrayT, 1024 >( d_subarray, k, d_t_u, d_m_u, d_sum_m_u ); CUDASYNC - search_multi_up - (d_subarray, k, d_t_d, d_m_d, d_sum_m_d); + search_multi_up< KeyT, ArrayT, 1024 >( d_subarray, k, d_t_d, d_m_d, d_sum_m_d ); CUDASYNC - gpuErrchk(cudaMemcpyAsync(&h_sum_m_u, d_sum_m_u, sizeof(position_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(&h_sum_m_d, d_sum_m_d, sizeof(position_t), - cudaMemcpyDeviceToHost)); - if (print_gpu_cpu_vrb) - printf("kernel sum_m_u: %ld\tsum_m_d: %ld\n", h_sum_m_u, h_sum_m_d); + gpuErrchk( cudaMemcpyAsync( &h_sum_m_u, d_sum_m_u, sizeof( position_t ), cudaMemcpyDeviceToHost ) ); + gpuErrchk( cudaMemcpy( &h_sum_m_d, d_sum_m_d, sizeof( position_t ), cudaMemcpyDeviceToHost ) ); + if ( print_gpu_cpu_vrb ) + { + printf( "kernel sum_m_u: %ld\tsum_m_d: %ld\n", h_sum_m_u, h_sum_m_d ); + } ///////////////////////////////////////////////////////////// - if (block_size >= h_sum_m_u) { // m_u -> m_d - search_multi_up - (d_subarray, k, d_t_u, d_mu_u, d_sum_mu_u); + if ( block_size >= h_sum_m_u ) + { // m_u -> m_d + search_multi_up< KeyT, ArrayT, 1024 >( d_subarray, k, d_t_u, d_mu_u, d_sum_mu_u ); ///////////////////// - gpuErrchk(cudaMemcpyAsync(h_m_u, d_m_u, k*sizeof(position_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpyAsync(h_mu_u, d_mu_u, k*sizeof(position_t), - cudaMemcpyDeviceToHost)); - if (print_gpu_cpu_vrb) { - CUDASYNC - printf("last step gpu cond 0 h_m_u: "); - for (uint i=0; i - (d_subarray, k, d_t_d, d_mu_d, d_sum_mu_d); + else if ( block_size <= h_sum_m_d ) + { + search_multi_down< KeyT, ArrayT, 1024 >( 
d_subarray, k, d_t_d, d_mu_d, d_sum_mu_d ); ///////////////////// - gpuErrchk(cudaMemcpyAsync(h_mu_d, d_mu_d, k*sizeof(position_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpyAsync(h_m_d, d_m_d, k*sizeof(position_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(&h_sum_mu_d, d_sum_mu_d, sizeof(position_t), - cudaMemcpyDeviceToHost)); - if (print_gpu_cpu_vrb) { - printf("last step gpu cond 1 h_mu_d: "); - for (uint i=0; i<<<1, 1024>>> - (d_m_u, d_m_d, k, d_subarray, d_max_diff, d_arg_max); - DBGCUDASYNC - position_t h_max_diff; - gpuErrchk(cudaMemcpy(&h_max_diff, d_max_diff, sizeof(position_t), - cudaMemcpyDeviceToHost)); - if (h_max_diff<=1) { - gpuErrchk(cudaMemcpy(&h_sum_m_d, d_sum_m_d, sizeof(position_t), - cudaMemcpyDeviceToHost)); - last_step_case2 - (d_subarray, tot_part_size, k, d_part_size, - d_m_d, d_m_u, h_sum_m_d, d_extra_elem, - h_extra_elem, d_extra_elem_idx, h_extra_elem_idx, - d_n_extra_elems); - if (print_gpu_cpu_vrb) { - CUDASYNC - printf("Kernel final step condition 2\n"); - printf("Total partition size before final step: %ld\n", h_sum_m_d); - } - break; - } - eval_t_tilde_kernel<<< 1, 1 >>> - (d_subarray, d_m_u, d_m_d, d_arg_max, d_t_tilde); - DBGCUDASYNC - - search_multi_up - (d_subarray, k, d_t_tilde, d_mu_u, - d_sum_mu_u); - search_multi_down - (d_subarray, k, d_t_tilde, d_mu_d, - d_sum_mu_d); - gpuErrchk(cudaMemcpyAsync(&h_sum_mu_u, d_sum_mu_u, sizeof(position_t), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(&h_sum_mu_d, d_sum_mu_d, sizeof(position_t), - cudaMemcpyDeviceToHost)); - if (block_size < h_sum_mu_d) { - gpuErrchk(cudaMemcpyAsync(d_m_u, d_mu_d, k*sizeof(position_t), - cudaMemcpyDeviceToDevice)); - gpuErrchk(cudaMemcpyAsync(d_sum_m_u, d_sum_mu_d, sizeof(position_t), - cudaMemcpyDeviceToDevice)); - } - else if (block_size > h_sum_mu_u) { - gpuErrchk(cudaMemcpyAsync(d_m_d, d_mu_u, k*sizeof(position_t), - cudaMemcpyDeviceToDevice)); - gpuErrchk(cudaMemcpyAsync(d_sum_m_d, d_sum_mu_u, sizeof(position_t), - 
cudaMemcpyDeviceToDevice)); - } - else { // sum_mu_d <= tot_part_size <= sum_mu_u - last_step(d_mu_d, d_mu_u, d_sum_mu_d, h_sum_mu_d, - tot_part_size, k, kp_next_pow_2, d_part_size, - d_diff, d_diff_cumul, h_diff, h_diff_cumul, - d_num_down); - if (print_gpu_cpu_vrb) { - CUDASYNC - printf("Kernel final step condition 3\n"); - printf("Kernel total part size before final step: %ld\n", h_sum_mu_d); - } - break; - } + else + { + for ( ;; ) + { + max_diff_kernel< ArrayT, 1024 > <<< 1, 1024 >>>( d_m_u, d_m_d, k, d_subarray, d_max_diff, d_arg_max ); + DBGCUDASYNC + position_t h_max_diff; + gpuErrchk( cudaMemcpy( &h_max_diff, d_max_diff, sizeof( position_t ), cudaMemcpyDeviceToHost ) ); + if ( h_max_diff <= 1 ) + { + gpuErrchk( cudaMemcpy( &h_sum_m_d, d_sum_m_d, sizeof( position_t ), cudaMemcpyDeviceToHost ) ); + last_step_case2< KeyT, ArrayT >( d_subarray, + tot_part_size, + k, + d_part_size, + d_m_d, + d_m_u, + h_sum_m_d, + d_extra_elem, + h_extra_elem, + d_extra_elem_idx, + h_extra_elem_idx, + d_n_extra_elems ); + if ( print_gpu_cpu_vrb ) + { + CUDASYNC + printf( "Kernel final step condition 2\n" ); + printf( "Total partition size before final step: %ld\n", h_sum_m_d ); + } + break; + } + eval_t_tilde_kernel< KeyT, ArrayT > <<< 1, 1 >>>( d_subarray, d_m_u, d_m_d, d_arg_max, d_t_tilde ); + DBGCUDASYNC + + search_multi_up< KeyT, ArrayT, 1024 >( d_subarray, k, d_t_tilde, d_mu_u, d_sum_mu_u ); + search_multi_down< KeyT, ArrayT, 1024 >( d_subarray, k, d_t_tilde, d_mu_d, d_sum_mu_d ); + gpuErrchk( cudaMemcpyAsync( &h_sum_mu_u, d_sum_mu_u, sizeof( position_t ), cudaMemcpyDeviceToHost ) ); + gpuErrchk( cudaMemcpy( &h_sum_mu_d, d_sum_mu_d, sizeof( position_t ), cudaMemcpyDeviceToHost ) ); + if ( block_size < h_sum_mu_d ) + { + gpuErrchk( cudaMemcpyAsync( d_m_u, d_mu_d, k * sizeof( position_t ), cudaMemcpyDeviceToDevice ) ); + gpuErrchk( cudaMemcpyAsync( d_sum_m_u, d_sum_mu_d, sizeof( position_t ), cudaMemcpyDeviceToDevice ) ); + } + else if ( block_size > h_sum_mu_u ) + { + 
gpuErrchk( cudaMemcpyAsync( d_m_d, d_mu_u, k * sizeof( position_t ), cudaMemcpyDeviceToDevice ) ); + gpuErrchk( cudaMemcpyAsync( d_sum_m_d, d_sum_mu_u, sizeof( position_t ), cudaMemcpyDeviceToDevice ) ); + } + else + { // sum_mu_d <= tot_part_size <= sum_mu_u + last_step( d_mu_d, + d_mu_u, + d_sum_mu_d, + h_sum_mu_d, + tot_part_size, + k, + kp_next_pow_2, + d_part_size, + d_diff, + d_diff_cumul, + h_diff, + h_diff_cumul, + d_num_down ); + if ( print_gpu_cpu_vrb ) + { + CUDASYNC + printf( "Kernel final step condition 3\n" ); + printf( "Kernel total part size before final step: %ld\n", h_sum_mu_d ); + } + break; + } } } - extract_partitions - (d_subarray, k, k_next_pow_2, d_part_size, - d_part_size_cumul, d_aux_array); + extract_partitions< ElementT, ArrayT >( d_subarray, k, k_next_pow_2, d_part_size, d_part_size_cumul, d_aux_array ); ////////////////////////////////////////////////////////////////////// //// USE THE INDEX OF THE ITERATION ON the k -1 target arrays - gpuErrchk(cudaMemcpy(h_part_size, d_part_size_, k*sizeof(position_t), - cudaMemcpyDeviceToHost)); + gpuErrchk( cudaMemcpy( h_part_size, d_part_size_, k * sizeof( position_t ), cudaMemcpyDeviceToHost ) ); - repack(h_subarray, k, h_part_size, d_buffer, buffer_size); - gpuErrchk(cudaMemcpyAsync(d_subarray, h_subarray, - k*sizeof(ArrayT), cudaMemcpyHostToDevice)); - if (compare_with_serial && i_sub==last_i_sub) return 0; + repack( h_subarray, k, h_part_size, d_buffer, buffer_size ); + gpuErrchk( cudaMemcpyAsync( d_subarray, h_subarray, k * sizeof( ArrayT ), cudaMemcpyHostToDevice ) ); + if ( compare_with_serial && i_sub == last_i_sub ) + { + return 0; + } - CopyArray - <<< (block_size + 1023) / 1024, 1024 >>> - (target_array[i_sub], d_aux_array); + CopyArray< ElementT, ArrayT, AuxArrayT > <<< ( block_size + 1023 ) / 1024, 1024 >>>( + target_array[ i_sub ], d_aux_array ); } - + return 0; } ////////////////////////////////////////////////////////////////////// - -template -int copass_sort::sort(KeyT *d_keys, 
position_t n, position_t block_size, - void *d_storage, int64_t &st_bytes) +template < class KeyT > +int +copass_sort::sort( KeyT* d_keys, position_t n, position_t block_size, void* d_storage, int64_t& st_bytes ) { st_bytes = 0; - uint k = (uint)((n + block_size - 1) / block_size); // number of subarrays - - contiguous_array h_subarray[k]; - contiguous_array array_block[k]; - for (uint i=0; i h_subarray[ k ]; + contiguous_array< KeyT > array_block[ k ]; + for ( uint i = 0; i < k; i++ ) + { + h_subarray[ i ].data_pt = d_keys; + h_subarray[ i ].offset = i * block_size; + h_subarray[ i ].size = i < k - 1 ? block_size : n - ( k - 1 ) * block_size; + array_block[ i ] = h_subarray[ i ]; } int64_t ext_st_bytes = 0; - for (uint i=0; i key_array; + contiguous_array< KeyT > key_array; key_array.data_pt = d_keys; key_array.offset = 0; key_array.size = n; - sort_template, - contiguous_array, contiguous_array > - (key_array, h_subarray, k, block_size, d_storage, st_bytes); + sort_template< KeyT, KeyT, contiguous_array< KeyT >, contiguous_array< KeyT >, contiguous_array< KeyT > >( + key_array, h_subarray, k, block_size, d_storage, st_bytes ); - st_bytes = max(st_bytes, ext_st_bytes); + st_bytes = std::max( st_bytes, ext_st_bytes ); - if (d_storage==NULL || compare_with_serial) return 0; + if ( d_storage == NULL || compare_with_serial ) + { + return 0; + } - for (uint i=0; i -KeyT *copass_sort::get_aux_array_keys() +template < class KeyT > +KeyT* +copass_sort::get_aux_array_keys() { - KeyT *h_aux_array_keys = new KeyT[block_size_]; - gpuErrchk(cudaMemcpy(h_aux_array_keys, d_aux_array_key_pt_, - block_size_*sizeof(KeyT), - cudaMemcpyDeviceToHost)); + KeyT* h_aux_array_keys = new KeyT[ block_size_ ]; + gpuErrchk( + cudaMemcpy( h_aux_array_keys, d_aux_array_key_pt_, block_size_ * sizeof( KeyT ), cudaMemcpyDeviceToHost ) ); return h_aux_array_keys; } ////////////// Temporary for checking !!!!!!!!!!!!!!!!! 
-template -ValueT *copass_sort::get_aux_array_values() +template < class ValueT > +ValueT* +copass_sort::get_aux_array_values() { - ValueT *h_aux_array_values = new ValueT[block_size_]; - gpuErrchk(cudaMemcpy(h_aux_array_values, d_aux_array_value_pt_, - block_size_*sizeof(ValueT), - cudaMemcpyDeviceToHost)); + ValueT* h_aux_array_values = new ValueT[ block_size_ ]; + gpuErrchk( + cudaMemcpy( h_aux_array_values, d_aux_array_value_pt_, block_size_ * sizeof( ValueT ), cudaMemcpyDeviceToHost ) ); return h_aux_array_values; } - -template -int copass_sort::sort(KeyT *d_keys, ValueT *d_values, position_t n, - position_t block_size, - void *d_storage, int64_t &st_bytes) +template < class KeyT, class ValueT > +int +copass_sort::sort( KeyT* d_keys, + ValueT* d_values, + position_t n, + position_t block_size, + void* d_storage, + int64_t& st_bytes ) { st_bytes = 0; - uint k = (uint)((n + block_size - 1) / block_size); // number of subarrays - - contiguous_key_value h_subarray[k]; - contiguous_key_value array_block[k]; - for (uint i=0; i h_subarray[ k ]; + contiguous_key_value< KeyT, ValueT > array_block[ k ]; + for ( uint i = 0; i < k; i++ ) + { + h_subarray[ i ].key_pt = d_keys; + h_subarray[ i ].value_pt = d_values; + h_subarray[ i ].offset = i * block_size; + h_subarray[ i ].size = i < k - 1 ? 
block_size : n - ( k - 1 ) * block_size; + array_block[ i ] = h_subarray[ i ]; } - + int64_t ext_st_bytes = 0; - for (uint i=0; i key_array; + contiguous_array< KeyT > key_array; key_array.data_pt = d_keys; key_array.offset = 0; key_array.size = n; - sort_template, - contiguous_array, contiguous_key_value, - contiguous_key_value > - (key_array, h_subarray, k, block_size, d_storage, st_bytes); + sort_template< KeyT, + key_value< KeyT, ValueT >, + contiguous_array< KeyT >, + contiguous_key_value< KeyT, ValueT >, + contiguous_key_value< KeyT, ValueT > >( key_array, h_subarray, k, block_size, d_storage, st_bytes ); - st_bytes = max(st_bytes, ext_st_bytes); + st_bytes = std::max( st_bytes, ext_st_bytes ); - if (d_storage==NULL || compare_with_serial) return 0; + if ( d_storage == NULL || compare_with_serial ) + { + return 0; + } - for (uint i=0; i -int copass_sort::sort(KeyT **key_subarray, position_t n, position_t block_size, - void *d_storage, int64_t &st_bytes) +template < class KeyT > +int +copass_sort::sort( KeyT** key_subarray, position_t n, position_t block_size, void* d_storage, int64_t& st_bytes ) { st_bytes = 0; - uint k = (uint)((n + block_size - 1) / block_size); // number of subarrays + uint k = ( uint ) ( ( n + block_size - 1 ) / block_size ); // number of subarrays - regular_block_array h_key_array; - regular_block_array d_key_array; + regular_block_array< KeyT > h_key_array; + regular_block_array< KeyT > d_key_array; h_key_array.data_pt = key_subarray; h_key_array.block_size = block_size; @@ -644,91 +718,102 @@ int copass_sort::sort(KeyT **key_subarray, position_t n, position_t block_size, h_key_array.size = n; int64_t ext_st_bytes = 0; - for (uint i=0; i key_block = getBlock(h_key_array, i); - array_GPUSort(key_block, d_storage, ext_st_bytes); - if (d_storage == NULL) break; + for ( uint i = 0; i < k; i++ ) + { + contiguous_array< KeyT > key_block = getBlock( h_key_array, i ); + array_GPUSort( key_block, d_storage, ext_st_bytes ); + if ( d_storage == 
NULL ) + { + break; + } } - KeyT **d_key_array_data_pt = NULL; - cudaReusableAlloc(d_storage, st_bytes, &d_key_array_data_pt, - k, sizeof(KeyT*)); - if (d_storage != NULL) { - gpuErrchk(cudaMemcpy(d_key_array_data_pt, key_subarray, - k*sizeof(KeyT*), cudaMemcpyHostToDevice)); + KeyT** d_key_array_data_pt = NULL; + cudaReusableAlloc( d_storage, st_bytes, &d_key_array_data_pt, k, sizeof( KeyT* ) ); + if ( d_storage != NULL ) + { + gpuErrchk( cudaMemcpy( d_key_array_data_pt, key_subarray, k * sizeof( KeyT* ), cudaMemcpyHostToDevice ) ); } - d_key_array.data_pt = d_key_array_data_pt; //key_subarray; + d_key_array.data_pt = d_key_array_data_pt; // key_subarray; d_key_array.block_size = block_size; d_key_array.offset = 0; d_key_array.size = n; - regular_block_array h_subarray[k]; - for (uint i=0; i h_subarray[ k ]; + for ( uint i = 0; i < k; i++ ) + { + h_subarray[ i ].h_data_pt = key_subarray; + h_subarray[ i ].data_pt = d_key_array_data_pt; // key_subarray; + h_subarray[ i ].block_size = block_size; + h_subarray[ i ].offset = i * block_size; + h_subarray[ i ].size = i < k - 1 ? 
block_size : n - ( k - 1 ) * block_size; } - - sort_template, - regular_block_array, contiguous_array > - (d_key_array, h_subarray, k, block_size, d_storage, st_bytes); - - st_bytes = max(st_bytes, ext_st_bytes); - - if (d_storage==NULL || compare_with_serial) return 0; - - for (uint i=0; i key_block = getBlock(h_key_array, i); - array_GPUSort(key_block, d_storage, ext_st_bytes); + + sort_template< KeyT, KeyT, regular_block_array< KeyT >, regular_block_array< KeyT >, contiguous_array< KeyT > >( + d_key_array, h_subarray, k, block_size, d_storage, st_bytes ); + + st_bytes = std::max( st_bytes, ext_st_bytes ); + + if ( d_storage == NULL || compare_with_serial ) + { + return 0; + } + + for ( uint i = 0; i < k; i++ ) + { + contiguous_array< KeyT > key_block = getBlock( h_key_array, i ); + array_GPUSort( key_block, d_storage, ext_st_bytes ); } return 0; } -template -int copass_sort::sort(KeyT **key_subarray, ValueT **value_subarray, - position_t n, position_t block_size, - void *d_storage, int64_t &st_bytes) +template < class KeyT, class ValueT > +int +copass_sort::sort( KeyT** key_subarray, + ValueT** value_subarray, + position_t n, + position_t block_size, + void* d_storage, + int64_t& st_bytes ) { st_bytes = 0; - uint k = (uint)((n + block_size - 1) / block_size); // number of subarrays + uint k = ( uint ) ( ( n + block_size - 1 ) / block_size ); // number of subarrays - regular_block_key_value h_key_value; - regular_block_array d_key_array; + regular_block_key_value< KeyT, ValueT > h_key_value; + regular_block_array< KeyT > d_key_array; h_key_value.key_pt = key_subarray; h_key_value.value_pt = value_subarray; h_key_value.block_size = block_size; h_key_value.offset = 0; h_key_value.size = n; - + int64_t ext_st_bytes = 0; - for (uint i=0; i key_value_block = - getBlock(h_key_value, i); - array_GPUSort(key_value_block, d_storage, ext_st_bytes); - if (d_storage == NULL) break; + for ( uint i = 0; i < k; i++ ) + { + contiguous_key_value< KeyT, ValueT > key_value_block = 
getBlock( h_key_value, i ); + array_GPUSort( key_value_block, d_storage, ext_st_bytes ); + if ( d_storage == NULL ) + { + break; + } } - KeyT **d_key_array_data_pt = NULL; - cudaReusableAlloc(d_storage, st_bytes, &d_key_array_data_pt, - k, sizeof(KeyT*)); - + KeyT** d_key_array_data_pt = NULL; + cudaReusableAlloc( d_storage, st_bytes, &d_key_array_data_pt, k, sizeof( KeyT* ) ); - if (d_storage != NULL) { - gpuErrchk(cudaMemcpy(d_key_array_data_pt, key_subarray, - k*sizeof(KeyT*), cudaMemcpyHostToDevice)); + if ( d_storage != NULL ) + { + gpuErrchk( cudaMemcpy( d_key_array_data_pt, key_subarray, k * sizeof( KeyT* ), cudaMemcpyHostToDevice ) ); } - ValueT **d_value_array_data_pt = NULL; - cudaReusableAlloc(d_storage, st_bytes, &d_value_array_data_pt, - k, sizeof(ValueT*)); - - if (d_storage != NULL) { - gpuErrchk(cudaMemcpy(d_value_array_data_pt, value_subarray, - k*sizeof(ValueT*), cudaMemcpyHostToDevice)); + ValueT** d_value_array_data_pt = NULL; + cudaReusableAlloc( d_storage, st_bytes, &d_value_array_data_pt, k, sizeof( ValueT* ) ); + + if ( d_storage != NULL ) + { + gpuErrchk( cudaMemcpy( d_value_array_data_pt, value_subarray, k * sizeof( ValueT* ), cudaMemcpyHostToDevice ) ); } d_key_array.data_pt = d_key_array_data_pt; @@ -736,30 +821,35 @@ int copass_sort::sort(KeyT **key_subarray, ValueT **value_subarray, d_key_array.offset = 0; d_key_array.size = n; - regular_block_key_value h_subarray[k]; - for (uint i=0; i h_subarray[ k ]; + for ( uint i = 0; i < k; i++ ) + { + h_subarray[ i ].h_key_pt = key_subarray; + h_subarray[ i ].h_value_pt = value_subarray; + h_subarray[ i ].key_pt = d_key_array_data_pt; + h_subarray[ i ].value_pt = d_value_array_data_pt; + h_subarray[ i ].block_size = block_size; + h_subarray[ i ].offset = i * block_size; + h_subarray[ i ].size = i < k - 1 ? 
block_size : n - ( k - 1 ) * block_size; } - sort_template, regular_block_array, - regular_block_key_value, - contiguous_key_value > - (d_key_array, h_subarray, k, block_size, d_storage, st_bytes); + sort_template< KeyT, + key_value< KeyT, ValueT >, + regular_block_array< KeyT >, + regular_block_key_value< KeyT, ValueT >, + contiguous_key_value< KeyT, ValueT > >( d_key_array, h_subarray, k, block_size, d_storage, st_bytes ); - st_bytes = max(st_bytes, ext_st_bytes); + st_bytes = std::max( st_bytes, ext_st_bytes ); + + if ( d_storage == NULL || compare_with_serial ) + { + return 0; + } - if (d_storage==NULL || compare_with_serial) return 0; - - for (uint i=0; i key_value_block = - getBlock(h_key_value, i); - array_GPUSort(key_value_block, d_storage, ext_st_bytes); + for ( uint i = 0; i < k; i++ ) + { + contiguous_key_value< KeyT, ValueT > key_value_block = getBlock( h_key_value, i ); + array_GPUSort( key_value_block, d_storage, ext_st_bytes ); } return 0; diff --git a/src/cuda_error.h b/src/cuda_error.h index 4432b4f17..7978cda69 100644 --- a/src/cuda_error.h +++ b/src/cuda_error.h @@ -20,61 +20,153 @@ * */ +#ifndef CUDAERROR_H +#define CUDAERROR_H +#include +#include "ngpu_exception.h" +#include +#include +#ifdef HAVE_MPI +#include +#endif +namespace cuda_error_ns +{ +extern std::map< void*, size_t > alloc_map_; +extern size_t mem_used_; +extern size_t mem_max_; +extern int verbose_; +} // namespace cuda_error_ns -#ifndef CUDAERROR_H -#define CUDAERROR_H -#include -#include "ngpu_exception.h" +inline int +printMPIRank() +{ -#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) +#ifdef HAVE_MPI + int proc_num; + MPI_Comm_size( MPI_COMM_WORLD, &proc_num ); + if ( proc_num > 1 ) + { + int mpi_id; + MPI_Comm_rank( MPI_COMM_WORLD, &mpi_id ); + printf( "MPI rank: %d\t", mpi_id ); + } +#endif + + return 0; +} + +inline void +mapCUDAMemAlloc( void* dev_pt, size_t n_bytes ) { - 
if (code != cudaSuccess) - { - fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) throw ngpu_exception("CUDA error"); - } + cuda_error_ns::alloc_map_.insert( { dev_pt, n_bytes } ); + cuda_error_ns::mem_used_ += n_bytes; + + if ( cuda_error_ns::mem_used_ > cuda_error_ns::mem_max_ ) + { + cuda_error_ns::mem_max_ = cuda_error_ns::mem_used_; + } + if ( cuda_error_ns::verbose_ > 0 ) + { + printMPIRank(); + printf( "GPU memory usage: used = %.3f, max used = %.3f\n", + ( float ) cuda_error_ns::mem_used_ / 1024.0 / 1024.0, + ( float ) cuda_error_ns::mem_max_ / 1024.0 / 1024.0 ); + } } -#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \ - printf("Error at %s:%d\n",__FILE__,__LINE__); \ - throw ngpu_exception("CUDA error");}} while(0) -#define CURAND_CALL(x) do { if((x) != CURAND_STATUS_SUCCESS) { \ - printf("Error at %s:%d\n",__FILE__,__LINE__); \ - throw ngpu_exception("CUDA error");}} while(0) +inline void +mapCUDAMemFree( void* dev_pt ) +{ + if ( cuda_error_ns::alloc_map_.find( dev_pt ) == cuda_error_ns::alloc_map_.end() ) + { + throw ngpu_exception( "CUDA error: pointer not found in mapCUDAMemFree." 
); + } + size_t n_bytes = cuda_error_ns::alloc_map_.at( dev_pt ); + cuda_error_ns::alloc_map_.erase( dev_pt ); + cuda_error_ns::mem_used_ -= n_bytes; -//#define DEBUG_CUDA_SYNC -#ifdef DEBUG_CUDA_SYNC -#define DBGCUDASYNC \ - gpuErrchk(cudaPeekAtLastError()); \ - gpuErrchk(cudaDeviceSynchronize()); -#else -#define DBGCUDASYNC \ - gpuErrchk(cudaPeekAtLastError()); -#endif -#define CUDASYNC \ - gpuErrchk(cudaPeekAtLastError()); \ - gpuErrchk(cudaDeviceSynchronize()); + if ( cuda_error_ns::verbose_ > 0 ) + { + printMPIRank(); + printf( "GPU memory usage: used = %.3f, max used = %.3f\n", + ( float ) cuda_error_ns::mem_used_ / 1024.0 / 1024.0, + ( float ) cuda_error_ns::mem_max_ / 1024.0 / 1024.0 ); + } +} -//#define DEBUG_CUDAMALLOC -#ifdef DEBUG_CUDAMALLOC -#define CUDAMALLOCCTRL(str, dev_pt, n_bytes) { \ - printf("Allocating %lld bytes pointed by %s in device memory at %s:%d\n", \ - (unsigned long long)n_bytes, str, __FILE__,__LINE__); \ - gpuAssert(cudaMalloc(dev_pt, n_bytes), __FILE__, __LINE__); } -#define CUDAFREECTRL(str, dev_pt) { \ - printf("Deallocating device memory pointed by %s in at %s:%d\n", \ - str, __FILE__,__LINE__); \ - gpuAssert(cudaFree(dev_pt), __FILE__, __LINE__); } +#define gpuErrchk( ans ) \ + { \ + gpuAssert( ( ans ), __FILE__, __LINE__ ); \ + } +inline void +gpuAssert( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if ( code != cudaSuccess ) + { + fprintf( stderr, "GPUassert: %s %s %d\n", cudaGetErrorString( code ), file, line ); + if ( abort ) + { + throw ngpu_exception( "CUDA error" ); + } + } +} + +#define CUDA_CALL( x ) \ + do \ + { \ + if ( ( x ) != cudaSuccess ) \ + { \ + printf( "Error at %s:%d\n", __FILE__, __LINE__ ); \ + throw ngpu_exception( "CUDA error" ); \ + } \ + } while ( 0 ) +#define CURAND_CALL( x ) \ + do \ + { \ + if ( ( x ) != CURAND_STATUS_SUCCESS ) \ + { \ + printf( "Error at %s:%d\n", __FILE__, __LINE__ ); \ + throw ngpu_exception( "CUDA error" ); \ + } \ + } while ( 0 ) + +// #define 
DEBUG_CUDA_SYNC +#ifdef DEBUG_CUDA_SYNC +#define DBGCUDASYNC \ + gpuErrchk( cudaPeekAtLastError() ); \ + gpuErrchk( cudaDeviceSynchronize() ); #else -#define CUDAMALLOCCTRL(str, dev_pt, n_bytes) { \ - gpuAssert(cudaMalloc(dev_pt, n_bytes), __FILE__, __LINE__); } -#define CUDAFREECTRL(str, dev_pt) { \ - gpuAssert(cudaFree(dev_pt), __FILE__, __LINE__); } +#define DBGCUDASYNC gpuErrchk( cudaPeekAtLastError() ); #endif +#define CUDASYNC \ + gpuErrchk( cudaPeekAtLastError() ); \ + gpuErrchk( cudaDeviceSynchronize() ); - +#define CUDAMALLOCCTRL( str, dev_pt, n_bytes ) \ + { \ + if ( cuda_error_ns::verbose_ > 0 ) \ + { \ + printMPIRank(); \ + printf( "Allocating %lld bytes pointed by %s in device memory at %s:%d\n", \ + ( unsigned long long ) n_bytes, \ + str, \ + __FILE__, \ + __LINE__ ); \ + } \ + gpuAssert( cudaMalloc( dev_pt, n_bytes ), __FILE__, __LINE__ ); \ + mapCUDAMemAlloc( *dev_pt, n_bytes ); \ + } +#define CUDAFREECTRL( str, dev_pt ) \ + { \ + if ( cuda_error_ns::verbose_ > 0 ) \ + { \ + printMPIRank(); \ + printf( "Deallocating device memory pointed by %s in at %s:%d\n", str, __FILE__, __LINE__ ); \ + } \ + gpuAssert( cudaFree( dev_pt ), __FILE__, __LINE__ ); \ + mapCUDAMemFree( dev_pt ); \ + } #endif diff --git a/src/distribution.cu b/src/distribution.cu index 3be8e7090..64a7452f4 100644 --- a/src/distribution.cu +++ b/src/distribution.cu @@ -20,239 +20,268 @@ * */ -#include -#include -#include #include "cuda_error.h" -#include "ngpu_exception.h" #include "distribution.h" +#include "ngpu_exception.h" +#include +#include +#include - -__global__ void randomNormalClippedKernel(float *arr, int64_t n, float mu, - float sigma, float low, float high, - double normal_cdf_alpha, - double normal_cdf_beta) +__global__ void +randomNormalClippedKernel( float* arr, + int64_t n, + float mu, + float sigma, + float low, + float high, + double normal_cdf_alpha, + double normal_cdf_beta ) { - const double epsilon=1.0e-15; + const double epsilon = 1.0e-15; int64_t tid = 
threadIdx.x + blockIdx.x * blockDim.x; - if (tid>=n) return; - float uniform = arr[tid]; - double p = normal_cdf_alpha + (normal_cdf_beta - normal_cdf_alpha) * uniform; + if ( tid >= n ) + { + return; + } + float uniform = arr[ tid ]; + double p = normal_cdf_alpha + ( normal_cdf_beta - normal_cdf_alpha ) * uniform; double v = p * 2.0 - 1.0; - v = max(v, epsilon - 1.0); - v = min(v, -epsilon + 1.0); - double x = (double)sigma * sqrt(2.0) * erfinv(v) + mu; - x = max(x, low); - x = min(x, high); - arr[tid] = (float)x; + v = max( v, epsilon - 1.0 ); + v = min( v, -epsilon + 1.0 ); + double x = ( double ) sigma * sqrt( 2.0 ) * erfinv( v ) + mu; + x = max( x, low ); + x = min( x, high ); + arr[ tid ] = ( float ) x; } -double normalCDF(double value) +double +normalCDF( double value ) { - return 0.5 * erfc(-value * M_SQRT1_2); + return 0.5 * erfc( -value * M_SQRT1_2 ); } -int randomNormalClipped(float *arr, int64_t n, float mu, - float sigma, float low, float high) +int +randomNormalClipped( float* arr, int64_t n, float mu, float sigma, float low, float high ) { - double alpha = ((double)low - mu) / sigma; - double beta = ((double)high - mu) / sigma; - double normal_cdf_alpha = normalCDF(alpha); - double normal_cdf_beta = normalCDF(beta); + double alpha = ( ( double ) low - mu ) / sigma; + double beta = ( ( double ) high - mu ) / sigma; + double normal_cdf_alpha = normalCDF( alpha ); + double normal_cdf_beta = normalCDF( beta ); - //printf("mu: %f\tsigma: %f\tlow: %f\thigh: %f\tn: %ld\n", + // printf("mu: %f\tsigma: %f\tlow: %f\thigh: %f\tn: %ld\n", // mu, sigma, low, high, n); - //n = 10000; - randomNormalClippedKernel<<<(n+1023)/1024, 1024>>>(arr, n, mu, sigma, - low, high, - normal_cdf_alpha, - normal_cdf_beta); + // n = 10000; + randomNormalClippedKernel<<< ( n + 1023 ) / 1024, 1024 >>>( + arr, n, mu, sigma, low, high, normal_cdf_alpha, normal_cdf_beta ); DBGCUDASYNC // temporary test, remove!!!!!!!!!!!!! 
- //gpuErrchk( cudaDeviceSynchronize() ); - //float h_arr[10000]; - //gpuErrchk(cudaMemcpy(h_arr, arr, n*sizeof(float), cudaMemcpyDeviceToHost)); - //for (int i=0; iDISTR_TYPE_ARRAY && distr_idx DISTR_TYPE_ARRAY && distr_idx < N_DISTR_TYPE ) + { return true; } - else { + else + { return false; } } - -bool Distribution::isArray(int distr_idx) + +bool +Distribution::isArray( int distr_idx ) { - if (distr_idx==DISTR_TYPE_ARRAY) { + if ( distr_idx == DISTR_TYPE_ARRAY ) + { return true; } - else { + else + { return false; } } -void Distribution::checkDistributionInitialized() +void +Distribution::checkDistributionInitialized() { - if (distr_idx_=N_DISTR_TYPE) { - throw ngpu_exception("Distribution was not initialized"); + if ( distr_idx_ < DISTR_TYPE_ARRAY || distr_idx_ >= N_DISTR_TYPE ) + { + throw ngpu_exception( "Distribution was not initialized" ); } } -int Distribution::vectSize() +int +Distribution::vectSize() { return vect_size_; } -float *Distribution::getArray(curandGenerator_t &gen, int64_t n_elem, - int i_vect) +float* +Distribution::getArray( curandGenerator_t& gen, int64_t n_elem, int i_vect ) { checkDistributionInitialized(); - if (distr_idx_>=DISTR_TYPE_ARRAY) { - CUDAMALLOCCTRL("&d_array_pt_",&d_array_pt_, n_elem*sizeof(float)); + if ( distr_idx_ >= DISTR_TYPE_ARRAY ) + { + CUDAMALLOCCTRL( "&d_array_pt_", &d_array_pt_, n_elem * sizeof( float ) ); } - if (distr_idx_==DISTR_TYPE_ARRAY) { - gpuErrchk(cudaMemcpy(d_array_pt_, h_array_pt_, n_elem*sizeof(float), - cudaMemcpyHostToDevice)); + if ( distr_idx_ == DISTR_TYPE_ARRAY ) + { + gpuErrchk( cudaMemcpy( d_array_pt_, h_array_pt_, n_elem * sizeof( float ), cudaMemcpyHostToDevice ) ); } - else if (distr_idx_==DISTR_TYPE_NORMAL_CLIPPED) { - //printf("ok0\n"); - CURAND_CALL(curandGenerateUniform(gen, d_array_pt_, n_elem)); - //printf("ok1\n"); - randomNormalClipped(d_array_pt_, n_elem, mu_[i_vect], sigma_[i_vect], - low_[i_vect], high_[i_vect]); - //printf("ok2\n"); + else if ( distr_idx_ == 
DISTR_TYPE_NORMAL_CLIPPED ) + { + CURAND_CALL( curandGenerateUniform( gen, d_array_pt_, n_elem ) ); + randomNormalClipped( d_array_pt_, n_elem, mu_[ i_vect ], sigma_[ i_vect ], low_[ i_vect ], high_[ i_vect ] ); } - else if (distr_idx_==DISTR_TYPE_NORMAL) { - float low = mu_[i_vect] - 5.0*sigma_[i_vect]; - float high = mu_[i_vect] + 5.0*sigma_[i_vect]; - CURAND_CALL(curandGenerateUniform(gen, d_array_pt_, n_elem)); - randomNormalClipped(d_array_pt_, n_elem, mu_[i_vect], sigma_[i_vect], - low, high); + else if ( distr_idx_ == DISTR_TYPE_NORMAL ) + { + float low = mu_[ i_vect ] - 5.0 * sigma_[ i_vect ]; + float high = mu_[ i_vect ] + 5.0 * sigma_[ i_vect ]; + CURAND_CALL( curandGenerateUniform( gen, d_array_pt_, n_elem ) ); + randomNormalClipped( d_array_pt_, n_elem, mu_[ i_vect ], sigma_[ i_vect ], low, high ); } return d_array_pt_; } -int Distribution::SetIntParam(std::string param_name, int val) +int +Distribution::SetIntParam( std::string param_name, int val ) { - if (param_name=="distr_idx") { - if (isDistribution(val) || isArray(val)) { + if ( param_name == "distr_idx" ) + { + if ( isDistribution( val ) || isArray( val ) ) + { distr_idx_ = val; vect_size_ = 0; mu_.clear(); sigma_.clear(); low_.clear(); high_.clear(); - } - else { - throw ngpu_exception("Invalid distribution type"); + else + { + throw ngpu_exception( "Invalid distribution type" ); } } - else if (param_name=="vect_size") { + else if ( param_name == "vect_size" ) + { vect_size_ = val; - mu_.resize(vect_size_); - sigma_.resize(vect_size_); - low_.resize(vect_size_); - high_.resize(vect_size_); + mu_.resize( vect_size_ ); + sigma_.resize( vect_size_ ); + low_.resize( vect_size_ ); + high_.resize( vect_size_ ); } - else { - throw ngpu_exception(std::string("Unrecognized distribution " - "integer parameter ") + param_name); + else + { + throw ngpu_exception( std::string( "Unrecognized distribution " + "integer parameter " ) + + param_name ); } return 0; } -int Distribution::SetScalParam(std::string 
param_name, float val) +int +Distribution::SetScalParam( std::string param_name, float val ) { - //printf("dok0\n"); checkDistributionInitialized(); - //printf("dok1\n"); - if (vect_size_ <= 0) { - throw ngpu_exception("Distribution parameter vector dimension " - "was not initialized"); + if ( vect_size_ <= 0 ) + { + throw ngpu_exception( + "Distribution parameter vector dimension " + "was not initialized" ); } - else if (vect_size_>1) { - throw ngpu_exception("Distribution parameter vector dimension" - " inconsistent for scalar parameter"); + else if ( vect_size_ > 1 ) + { + throw ngpu_exception( + "Distribution parameter vector dimension" + " inconsistent for scalar parameter" ); } - //printf("dok2\n"); - SetVectParam(param_name, val, 0); - + SetVectParam( param_name, val, 0 ); + return 0; } -int Distribution::SetVectParam(std::string param_name, float val, int i) +int +Distribution::SetVectParam( std::string param_name, float val, int i ) { - //printf("dok3\n"); checkDistributionInitialized(); - //printf("dok4\n"); - if (vect_size_ <= 0) { - throw ngpu_exception("Distribution parameter vector dimension " - "was not initialized"); + if ( vect_size_ <= 0 ) + { + throw ngpu_exception( + "Distribution parameter vector dimension " + "was not initialized" ); } - if (i > vect_size_) { - throw ngpu_exception("Vector parameter index for distribution " - "out of range"); + if ( i > vect_size_ ) + { + throw ngpu_exception( + "Vector parameter index for distribution " + "out of range" ); } - //printf("dok5\n"); - if (param_name=="mu") { + if ( param_name == "mu" ) + { // aggiungere && distr_idx==NORMAL || distr_idx==NORMAL_CLIPPED - //printf("dok6 i: %d val: %f\n", i, val); - mu_[i] = val; + mu_[ i ] = val; } - else if (param_name=="sigma") { - sigma_[i] = val; + else if ( param_name == "sigma" ) + { + sigma_[ i ] = val; } - else if (param_name=="low") { - low_[i] = val; + else if ( param_name == "low" ) + { + low_[ i ] = val; } - else if (param_name=="high") { - high_[i] = 
val; + else if ( param_name == "high" ) + { + high_[ i ] = val; } - else { - throw ngpu_exception(std::string("Unrecognized distribution " - "float parameter ") + param_name); + else + { + throw ngpu_exception( std::string( "Unrecognized distribution " + "float parameter " ) + + param_name ); } - //printf("dok7\n"); - + return 0; } -int Distribution::SetFloatPtParam(std::string param_name, float *h_array_pt) +int +Distribution::SetFloatPtParam( std::string param_name, float* h_array_pt ) { - if (param_name=="array_pt") { + if ( param_name == "array_pt" ) + { distr_idx_ = DISTR_TYPE_ARRAY; h_array_pt_ = h_array_pt; } - else { - throw ngpu_exception(std::string("Unrecognized distribution " - "float pointer parameter ") + param_name); + else + { + throw ngpu_exception( std::string( "Unrecognized distribution " + "float pointer parameter " ) + + param_name ); } return 0; } -bool Distribution::IsFloatParam(std::string param_name) +bool +Distribution::IsFloatParam( std::string param_name ) { - if ((param_name=="mu") - || (param_name=="sigma") - || (param_name=="low") - || (param_name=="high")) { + if ( ( param_name == "mu" ) || ( param_name == "sigma" ) || ( param_name == "low" ) || ( param_name == "high" ) ) + { return true; } - else { + else + { return false; } } diff --git a/src/distribution.h b/src/distribution.h index 8febd06ef..416287df4 100644 --- a/src/distribution.h +++ b/src/distribution.h @@ -22,57 +22,56 @@ #ifndef DISTRIBUTION_H #define DISTRIBUTION_H -#include #include #include +#include class Distribution { - //curandGenerator_t *curand_generator_; + // curandGenerator_t *curand_generator_; int distr_idx_; int vect_size_; - float *h_array_pt_; - float *d_array_pt_; - std::vector mu_; - std::vector sigma_; - std::vector low_; - std::vector high_; + float* h_array_pt_; + float* d_array_pt_; + std::vector< float > mu_; + std::vector< float > sigma_; + std::vector< float > low_; + std::vector< float > high_; public: - //void 
setCurandGenerator(curandGenerator_t *gen) + // void setCurandGenerator(curandGenerator_t *gen) //{curand_generator_ = gen;} - - bool isDistribution(int distr_idx); - - bool isArray(int distr_idx); + + bool isDistribution( int distr_idx ); + + bool isArray( int distr_idx ); void checkDistributionInitialized(); int vectSize(); - float *getArray(curandGenerator_t &gen, int64_t n_elem, int i_vect = 0); - - int SetIntParam(std::string param_name, int val); + float* getArray( curandGenerator_t& gen, int64_t n_elem, int i_vect = 0 ); - int SetScalParam(std::string param_name, float val); + int SetIntParam( std::string param_name, int val ); - int SetVectParam(std::string param_name, float val, int i); + int SetScalParam( std::string param_name, float val ); - int SetFloatPtParam(std::string param_name, float *h_array_pt); + int SetVectParam( std::string param_name, float val, int i ); - bool IsFloatParam(std::string param_name); + int SetFloatPtParam( std::string param_name, float* h_array_pt ); + bool IsFloatParam( std::string param_name ); }; -enum DistributionType { - DISTR_TYPE_NONE=0, +enum DistributionType +{ + DISTR_TYPE_NONE = 0, DISTR_TYPE_ARRAY, DISTR_TYPE_NORMAL, DISTR_TYPE_NORMAL_CLIPPED, N_DISTR_TYPE }; -int randomNormalClipped(float *arr, int64_t n, float mu, - float sigma, float low, float high); +int randomNormalClipped( float* arr, int64_t n, float mu, float sigma, float low, float high ); #endif diff --git a/src/dummyfile.cpp b/src/dummyfile.cpp index bc98828c0..6ad3bca41 100644 --- a/src/dummyfile.cpp +++ b/src/dummyfile.cpp @@ -1,5 +1,6 @@ #include -int dummy_function_for_nestgpu() +int +dummy_function_for_nestgpu() #include { #include diff --git a/src/ext_neuron.cu b/src/ext_neuron.cu index 633b9935d..4ed776a93 100644 --- a/src/ext_neuron.cu +++ b/src/ext_neuron.cu @@ -20,30 +20,25 @@ * */ - - - - -#include +#include "ext_neuron.h" #include +#include #include -#include "ext_neuron.h" using namespace ext_neuron_ns; -__global__ void 
UpdateExtNeuron(float *port_input_pt, float *port_value_pt, - int n_node, int n_var, int n_port_var, - int n_port) +__global__ void +UpdateExtNeuron( float* port_input_pt, float* port_value_pt, int n_node, int n_var, int n_port_var, int n_port ) { int i_thread = threadIdx.x + blockIdx.x * blockDim.x; - if (i_thread>> - (port_input_pt, port_value_pt, n_node_, n_var_, n_port_var_, n_port_); - //gpuErrchk( cudaDeviceSynchronize() ); - + float* port_input_pt = GetVarPt( 0, "port_input", 0 ); + float* port_value_pt = GetVarPt( 0, "port_value", 0 ); + + UpdateExtNeuron<<< ( n_node_ * n_port_ + 1023 ) / 1024, 1024 >>>( + port_input_pt, port_value_pt, n_node_, n_var_, n_port_var_, n_port_ ); + // gpuErrchk( cudaDeviceSynchronize() ); + return 0; } -int ext_neuron::Free() +int +ext_neuron::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); - + return 0; } -float *ext_neuron::GetExtNeuronInputSpikes(int *n_node, int *n_port) +float* +ext_neuron::GetExtNeuronInputSpikes( int* n_node, int* n_port ) { - if ((int)ext_neuron_input_spikes_.size() -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include namespace ext_neuron_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ N_SCAL_VAR = 0 }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_port_input = 0, i_port_value, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_den_delay = 0, N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_port_weight = 0, N_PORT_PARAM }; -//const std::string *ext_neuron_scal_var_name[N_SCAL_VAR] = {}; +// const std::string *ext_neuron_scal_var_name[N_SCAL_VAR] = {}; -const std::string ext_neuron_port_var_name[N_PORT_VAR] = { - "port_input", "port_value" -}; +const std::string ext_neuron_port_var_name[ N_PORT_VAR ] = { "port_input", "port_value" }; -const std::string ext_neuron_scal_param_name[N_SCAL_PARAM] = { - "den_delay" -}; +const 
std::string ext_neuron_scal_param_name[ N_SCAL_PARAM ] = { "den_delay" }; -const std::string ext_neuron_port_param_name[N_PORT_PARAM] = { - "port_weight" -}; +const std::string ext_neuron_port_param_name[ N_PORT_PARAM ] = { "port_weight" }; -} +} // namespace ext_neuron_ns class ext_neuron : public BaseNeuron { - public: +public: ~ext_neuron(); - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); - //int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); + // int Calibrate(double time_min, float time_resolution); + + int Update( long long it, double t1 ); int Free(); - float *GetExtNeuronInputSpikes(int *n_node, int *n_port); - + float* GetExtNeuronInputSpikes( int* n_node, int* n_port ); }; - #endif diff --git a/src/getRealTime.cu b/src/getRealTime.cu index ec1a6b60a..ee7841fd5 100644 --- a/src/getRealTime.cu +++ b/src/getRealTime.cu @@ -20,10 +20,6 @@ * */ - - - - /* * Author: David Robert Nadeau * Site: http://NadeauSoftware.com/ @@ -31,16 +27,16 @@ * http://creativecommons.org/licenses/by/3.0/deed.en_US */ -#if defined(_WIN32) -#include +#if defined( _WIN32 ) #include +#include -#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) -#include /* POSIX flags */ -#include /* clock_gettime(), time() */ -#include /* gethrtime(), gettimeofday() */ +#elif defined( __unix__ ) || defined( __unix ) || defined( unix ) || ( defined( __APPLE__ ) && defined( __MACH__ ) ) +#include /* gethrtime(), gettimeofday() */ +#include /* clock_gettime(), time() */ +#include /* POSIX flags */ -#if defined(__MACH__) && defined(__APPLE__) +#if defined( __MACH__ ) && defined( __APPLE__ ) #include #include #endif @@ -49,10 +45,6 @@ #error "Unable to define getRealTime( ) for an unknown OS." #endif - - - - /** * Returns the real time, in seconds, or -1.0 if an error occurred. 
* @@ -60,73 +52,74 @@ * The returned real time is only useful for computing an elapsed time * between two calls to this function. */ -double getRealTime( ) +double +getRealTime() { -#if defined(_WIN32) - FILETIME tm; - ULONGLONG t; -#if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8 - /* Windows 8, Windows Server 2012 and later. ---------------- */ - GetSystemTimePreciseAsFileTime( &tm ); +#if defined( _WIN32 ) + FILETIME tm; + ULONGLONG t; +#if defined( NTDDI_WIN8 ) && NTDDI_VERSION >= NTDDI_WIN8 + /* Windows 8, Windows Server 2012 and later. ---------------- */ + GetSystemTimePreciseAsFileTime( &tm ); #else - /* Windows 2000 and later. ---------------------------------- */ - GetSystemTimeAsFileTime( &tm ); + /* Windows 2000 and later. ---------------------------------- */ + GetSystemTimeAsFileTime( &tm ); #endif - t = ((ULONGLONG)tm.dwHighDateTime << 32) | (ULONGLONG)tm.dwLowDateTime; - return (double)t / 10000000.0; - -#elif (defined(__hpux) || defined(hpux)) || ((defined(__sun__) || defined(__sun) || defined(sun)) && (defined(__SVR4) || defined(__svr4__))) - /* HP-UX, Solaris. ------------------------------------------ */ - return (double)gethrtime( ) / 1000000000.0; - -#elif defined(__MACH__) && defined(__APPLE__) - /* OSX. ----------------------------------------------------- */ - static double timeConvert = 0.0; - if ( timeConvert == 0.0 ) - { - mach_timebase_info_data_t timeBase; - (void)mach_timebase_info( &timeBase ); - timeConvert = (double)timeBase.numer / - (double)timeBase.denom / - 1000000000.0; - } - return (double)mach_absolute_time( ) * timeConvert; - -#elif defined(_POSIX_VERSION) - /* POSIX. --------------------------------------------------- */ -#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) - { - struct timespec ts; -#if defined(CLOCK_MONOTONIC_PRECISE) - /* BSD. --------------------------------------------- */ - const clockid_t id = CLOCK_MONOTONIC_PRECISE; -#elif defined(CLOCK_MONOTONIC_RAW) - /* Linux. 
------------------------------------------- */ - const clockid_t id = CLOCK_MONOTONIC_RAW; -#elif defined(CLOCK_HIGHRES) - /* Solaris. ----------------------------------------- */ - const clockid_t id = CLOCK_HIGHRES; -#elif defined(CLOCK_MONOTONIC) - /* AIX, BSD, Linux, POSIX, Solaris. ----------------- */ - const clockid_t id = CLOCK_MONOTONIC; -#elif defined(CLOCK_REALTIME) - /* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */ - const clockid_t id = CLOCK_REALTIME; + t = ( ( ULONGLONG ) tm.dwHighDateTime << 32 ) | ( ULONGLONG ) tm.dwLowDateTime; + return ( double ) t / 10000000.0; + +#elif ( defined( __hpux ) || defined( hpux ) ) \ + || ( ( defined( __sun__ ) || defined( __sun ) || defined( sun ) ) && ( defined( __SVR4 ) || defined( __svr4__ ) ) ) + /* HP-UX, Solaris. ------------------------------------------ */ + return ( double ) gethrtime() / 1000000000.0; + +#elif defined( __MACH__ ) && defined( __APPLE__ ) + /* OSX. ----------------------------------------------------- */ + static double timeConvert = 0.0; + if ( timeConvert == 0.0 ) + { + mach_timebase_info_data_t timeBase; + ( void ) mach_timebase_info( &timeBase ); + timeConvert = ( double ) timeBase.numer / ( double ) timeBase.denom / 1000000000.0; + } + return ( double ) mach_absolute_time() * timeConvert; + +#elif defined( _POSIX_VERSION ) + /* POSIX. --------------------------------------------------- */ +#if defined( _POSIX_TIMERS ) && ( _POSIX_TIMERS > 0 ) + { + struct timespec ts; +#if defined( CLOCK_MONOTONIC_PRECISE ) + /* BSD. --------------------------------------------- */ + const clockid_t id = CLOCK_MONOTONIC_PRECISE; +#elif defined( CLOCK_MONOTONIC_RAW ) + /* Linux. ------------------------------------------- */ + const clockid_t id = CLOCK_MONOTONIC_RAW; +#elif defined( CLOCK_HIGHRES ) + /* Solaris. ----------------------------------------- */ + const clockid_t id = CLOCK_HIGHRES; +#elif defined( CLOCK_MONOTONIC ) + /* AIX, BSD, Linux, POSIX, Solaris. 
----------------- */ + const clockid_t id = CLOCK_MONOTONIC; +#elif defined( CLOCK_REALTIME ) + /* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */ + const clockid_t id = CLOCK_REALTIME; #else - const clockid_t id = (clockid_t)-1; /* Unknown. */ + const clockid_t id = ( clockid_t ) -1; /* Unknown. */ #endif /* CLOCK_* */ - if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 ) - return (double)ts.tv_sec + - (double)ts.tv_nsec / 1000000000.0; - /* Fall thru. */ - } + if ( id != ( clockid_t ) -1 && clock_gettime( id, &ts ) != -1 ) + { + return ( double ) ts.tv_sec + ( double ) ts.tv_nsec / 1000000000.0; + } + /* Fall thru. */ + } #endif /* _POSIX_TIMERS */ - /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */ - struct timeval tm; - gettimeofday( &tm, NULL ); - return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0; + /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */ + struct timeval tm; + gettimeofday( &tm, nullptr ); + return ( double ) tm.tv_sec + ( double ) tm.tv_usec / 1000000.0; #else - return -1.0; /* Failed. */ + return -1.0; /* Failed. 
*/ #endif } diff --git a/src/getRealTime.h b/src/getRealTime.h index 5dff62f2b..fb04461a4 100644 --- a/src/getRealTime.h +++ b/src/getRealTime.h @@ -20,10 +20,6 @@ * */ - - - - #ifndef GETREALTIME_H #define GETREALTIME_H diff --git a/src/get_spike.cu b/src/get_spike.cu index 3e40375bb..6ab0e8af3 100644 --- a/src/get_spike.cu +++ b/src/get_spike.cu @@ -23,24 +23,23 @@ #include #include +#include "connect.h" +#include "cuda_error.h" #include "nestgpu.h" #include "node_group.h" #include "send_spike.h" #include "spike_buffer.h" -#include "cuda_error.h" -#include "connect.h" - // improve using a grid /* __global__ void GetSpikes(double *spike_array, int array_size, int n_port, - int n_var, - float *port_weight_arr, - int port_weight_arr_step, - int port_weight_port_step, - float *port_input_arr, - int port_input_arr_step, - int port_input_port_step) + int n_var, + float *port_weight_arr, + int port_weight_arr_step, + int port_weight_port_step, + float *port_input_arr, + int port_input_arr_step, + int port_input_port_step) { int i_array = threadIdx.x + blockIdx.x * blockDim.x; if (i_array < array_size*n_port) { @@ -59,54 +58,58 @@ __global__ void GetSpikes(double *spike_array, int array_size, int n_port, } */ -__global__ void GetSpikes(double *spike_array, int array_size, int n_port, - int n_var, - float *port_weight_arr, - int port_weight_arr_step, - int port_weight_port_step, - float *port_input_arr, - int port_input_arr_step, - int port_input_port_step) +__global__ void +GetSpikes( double* spike_array, + int array_size, + int n_port, + int n_var, + float* port_weight_arr, + int port_weight_arr_step, + int port_weight_port_step, + float* port_input_arr, + int port_input_arr_step, + int port_input_port_step ) { - int i_target = blockIdx.x*blockDim.x+threadIdx.x; - int port = blockIdx.y*blockDim.y+threadIdx.y; + int i_target = blockIdx.x * blockDim.x + threadIdx.x; + int port = blockIdx.y * blockDim.y + threadIdx.y; + + if ( i_target < array_size && port < n_port ) + { + 
int i_array = port * array_size + i_target; + int port_input = i_target * port_input_arr_step + port_input_port_step * port; + int port_weight = i_target * port_weight_arr_step + port_weight_port_step * port; + double d_val = ( double ) port_input_arr[ port_input ] + spike_array[ i_array ] * port_weight_arr[ port_weight ]; - if (i_target < array_size && portget_spike_array_ != NULL) { - gpuErrchk(cudaMemsetAsync(bn->get_spike_array_, 0, bn->n_node_*bn->n_port_ - *sizeof(double))); + for ( unsigned int i = 0; i < node_vect_.size(); i++ ) + { + BaseNeuron* bn = node_vect_[ i ]; + if ( bn->get_spike_array_ != nullptr ) + { + gpuErrchk( cudaMemsetAsync( bn->get_spike_array_, 0, bn->n_node_ * bn->n_port_ * sizeof( double ) ) ); } } - + return 0; } -int NESTGPU::FreeGetSpikeArrays() +int +NESTGPU::FreeGetSpikeArrays() { - for (unsigned int i=0; iget_spike_array_ != NULL) { - CUDAFREECTRL("bn->get_spike_array_",bn->get_spike_array_); + for ( unsigned int i = 0; i < node_vect_.size(); i++ ) + { + BaseNeuron* bn = node_vect_[ i ]; + if ( bn->get_spike_array_ != nullptr ) + { + CUDAFREECTRL( "bn->get_spike_array_", bn->get_spike_array_ ); } } - + return 0; } diff --git a/src/get_spike.h b/src/get_spike.h index d1b398cc3..199e60896 100644 --- a/src/get_spike.h +++ b/src/get_spike.h @@ -20,90 +20,116 @@ * */ - #ifndef GETSPIKE_H #define GETSPIKE_H -#include "utilities.h" -#include "send_spike.h" +#include "conn12b.h" +#include "conn16b.h" #include "connect.h" #include "node_group.h" +#include "send_spike.h" #include "spike_buffer.h" +#include "syn_model.h" +#include "utilities.h" extern __constant__ NodeGroupStruct NodeGroupArray[]; -extern __device__ int16_t *NodeGroupMap; +extern __device__ int16_t* NodeGroupMap; extern __constant__ float NESTGPUTimeResolution; extern __constant__ long long NESTGPUTimeIdx; -template -__device__ __forceinline__ void NestedLoopFunction(int i_spike, int i_syn); - ////////////////////////////////////////////////////////////////////// // This is 
the function called by the nested loop // that collects the spikes -template<> -__device__ __forceinline__ void NestedLoopFunction<0>(int i_spike, int i_syn) +template < class ConnKeyT, class ConnStructT > +__device__ __forceinline__ void +NestedLoopFunction0( int i_spike, int i_syn ) { - int i_source = SpikeSourceIdx[i_spike]; - int i_source_conn_group = SpikeConnIdx[i_spike]; - float height = SpikeHeight[i_spike]; - int ig = ConnGroupIdx0[i_source] + i_source_conn_group; + int i_source = SpikeSourceIdx[ i_spike ]; + int i_source_conn_group = SpikeConnIdx[ i_spike ]; + float height = SpikeHeight[ i_spike ]; + int ig = ConnGroupIdx0[ i_source ] + i_source_conn_group; - int64_t i_conn = ConnGroupIConn0[ig] + i_syn; - uint i_block = (uint)(i_conn / ConnBlockSize); + int64_t i_conn = ConnGroupIConn0[ ig ] + i_syn; + uint i_block = ( uint ) ( i_conn / ConnBlockSize ); int64_t i_block_conn = i_conn % ConnBlockSize; - connection_struct conn = ConnectionArray[i_block][i_block_conn]; - uint target_port_syn = conn.target_port_syn; - int i_target = target_port_syn >> MaxPortSynNBits; - uint port = (target_port_syn & PortSynMask) >> MaxSynNBits; - unsigned char syn_group = target_port_syn & SynMask; - float weight = conn.weight; - //printf("ok target: %d\tport: %d\t syn_group: %d\tweight-0.0005: %.7e\n", + // connection_struct conn = ConnectionArray[i_block][i_block_conn]; + // uint target_port_syn = conn.target_port_syn; + // int i_target = target_port_syn >> MaxPortSynNBits; + // uint port = (target_port_syn & PortSynMask) >> MaxSynNBits; + // unsigned char syn_group = target_port_syn & SynMask; + ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ]; + ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ]; + inode_t i_target = getConnTarget< ConnStructT >( conn_struct ); + int port = getConnPort< ConnKeyT, ConnStructT >( conn_key, conn_struct ); + int syn_group = getConnSyn< ConnKeyT, ConnStructT >( conn_key, 
conn_struct ); + float weight = conn_struct.weight; + // printf("ok target: %d\tport: %d\t syn_group: %d\tweight-0.0005: %.7e\n", // i_target, port, syn_group, weight-0.0005); - //printf("handles spike %d src %d conn %ld syn %d target %d" + // printf("handles spike %d src %d conn %ld syn %d target %d" // " port %d weight %f syn_group %d\n", // i_spike, i_source, i_conn, i_syn, i_target, // port, weight, syn_group); - + ///////////////////////////////////////////////////////////////// - int i_group=NodeGroupMap[i_target]; - int i = port*NodeGroupArray[i_group].n_node_ + i_target - - NodeGroupArray[i_group].i_node_0_; - double d_val = (double)(height*weight); + int i_group = NodeGroupMap[ i_target ]; + int64_t i = ( int64_t ) port * NodeGroupArray[ i_group ].n_node_ + i_target - NodeGroupArray[ i_group ].i_node_0_; + double d_val = ( double ) ( height * weight ); - atomicAddDouble(&NodeGroupArray[i_group].get_spike_array_[i], d_val); + atomicAddDouble( &NodeGroupArray[ i_group ].get_spike_array_[ i ], d_val ); // TO BE IMPROVED BY CHECKING IF THE SYNAPSE TYPE OF THE GROUP // REQUIRES AN UPDATE BASED ON POST-PRE SPIKE TIME DIFFERENCE - if (syn_group>0) { - //ConnectionGroupTargetSpikeTime[i_conn*NSpikeBuffer+i_source][i_syn] - ConnectionSpikeTime[i_conn] - = (unsigned short)(NESTGPUTimeIdx & 0xffff); - - long long Dt_int = NESTGPUTimeIdx - LastRevSpikeTimeIdx[i_target]; + if ( syn_group > 0 ) + { + // ConnectionGroupTargetSpikeTime[i_conn*NSpikeBuffer+i_source][i_syn] + ConnectionSpikeTime[ i_conn ] = ( unsigned short ) ( NESTGPUTimeIdx & 0xffff ); + + long long Dt_int = NESTGPUTimeIdx - LastRevSpikeTimeIdx[ i_target ]; // printf("spike src %d target %d weight %f syn_group %d " // "TimeIdx %lld LRST %lld Dt %lld\n", // i_source, i_target, weight, syn_group, // NESTGPUTimeIdx, LastRevSpikeTimeIdx[i_target], Dt_int); - - if (Dt_int>0 && Dt_int 0 && Dt_int < MAX_SYN_DT ) + { + SynapseUpdate( syn_group, &( conn_struct.weight ), -NESTGPUTimeResolution * Dt_int ); } } 
//////////////////////////////////////////////////////////////// } /////////////// +template < int i_func > +__device__ __forceinline__ void NestedLoopFunction( int i_spike, int i_syn ); + +////////////////////////////////////////////////////////////////////// +// This is the function called by the nested loop +// that collects the spikes +// Include more integer template specializations +// for different connection types +template <> +__device__ __forceinline__ void +NestedLoopFunction< 0 >( int i_spike, int i_syn ) +{ + NestedLoopFunction0< conn12b_key, conn12b_struct >( i_spike, i_syn ); +} + +template <> +__device__ __forceinline__ void +NestedLoopFunction< 2 >( int i_spike, int i_syn ) +{ + NestedLoopFunction0< conn16b_key, conn16b_struct >( i_spike, i_syn ); +} -__global__ void GetSpikes(double *spike_array, int array_size, int n_port, - int n_var, - float *port_weight_arr, - int port_weight_arr_step, - int port_weight_port_step, //float *y_arr); - float *port_input_arr, - int port_input_arr_step, - int port_input_port_step); +__global__ void GetSpikes( double* spike_array, + int array_size, + int n_port, + int n_var, + float* port_weight_arr, + int port_weight_arr_step, + int port_weight_port_step, // float *y_arr); + float* port_input_arr, + int port_input_arr_step, + int port_input_port_step ); #endif diff --git a/src/iaf_psc_alpha.cu b/src/iaf_psc_alpha.cu index 34310fa43..0bd3054b8 100644 --- a/src/iaf_psc_alpha.cu +++ b/src/iaf_psc_alpha.cu @@ -20,68 +20,64 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/iaf_psc_alpha.cpp -#include -#include -#include #include "iaf_psc_alpha.h" #include "propagator_stability.h" #include "spike_buffer.h" +#include +#include +#include using namespace iaf_psc_alpha_ns; extern __constant__ float NESTGPUTimeResolution; -extern __device__ double propagator_31(double, double, double, double); -extern __device__ double propagator_32(double, double, double, double); - -#define I_ex 
var[i_I_ex] -#define I_in var[i_I_in] -#define dI_ex var[i_dI_ex] -#define dI_in var[i_dI_in] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] - -#define tau_m param[i_tau_m] -#define C_m param[i_C_m] -#define E_L param[i_E_L] -#define I_e param[i_I_e] -#define Theta_rel param[i_Theta_rel] -#define V_reset_rel param[i_V_reset_rel] -#define tau_ex param[i_tau_ex] -#define tau_in param[i_tau_in] -#define t_ref param[i_t_ref] -#define den_delay param[i_den_delay] - -#define P11ex param[i_P11ex] -#define P11in param[i_P11in] -#define P21ex param[i_P21ex] -#define P21in param[i_P21in] -#define P22ex param[i_P22ex] -#define P22in param[i_P22in] -#define P31ex param[i_P31ex] -#define P31in param[i_P31in] -#define P32ex param[i_P32ex] -#define P32in param[i_P32in] -#define P30 param[i_P30] -#define P33 param[i_P33] -#define expm1_tau_m param[i_expm1_tau_m] -#define EPSCInitialValue param[i_EPSCInitialValue] -#define IPSCInitialValue param[i_IPSCInitialValue] - - -__global__ void iaf_psc_alpha_Calibrate(int n_node, float *param_arr, - int n_param, float h) +extern __device__ double propagator_31( double, double, double, double ); +extern __device__ double propagator_32( double, double, double, double ); + +#define I_ex var[ i_I_ex ] +#define I_in var[ i_I_in ] +#define dI_ex var[ i_dI_ex ] +#define dI_in var[ i_dI_in ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] + +#define tau_m param[ i_tau_m ] +#define C_m param[ i_C_m ] +#define E_L param[ i_E_L ] +#define I_e param[ i_I_e ] +#define Theta_rel param[ i_Theta_rel ] +#define V_reset_rel param[ i_V_reset_rel ] +#define tau_ex param[ i_tau_ex ] +#define tau_in param[ i_tau_in ] +#define t_ref param[ i_t_ref ] +#define den_delay param[ i_den_delay ] + +#define P11ex param[ i_P11ex ] +#define P11in param[ i_P11in ] +#define P21ex param[ i_P21ex ] +#define P21in param[ i_P21in ] +#define P22ex param[ i_P22ex ] +#define P22in param[ i_P22in ] +#define P31ex 
param[ i_P31ex ] +#define P31in param[ i_P31in ] +#define P32ex param[ i_P32ex ] +#define P32in param[ i_P32in ] +#define P30 param[ i_P30 ] +#define P33 param[ i_P33 ] +#define expm1_tau_m param[ i_expm1_tau_m ] +#define EPSCInitialValue param[ i_EPSCInitialValue ] +#define IPSCInitialValue param[ i_IPSCInitialValue ] + +__global__ void +iaf_psc_alpha_Calibrate( int n_node, float* param_arr, int n_param, float h ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V - V_m_rel = P30 * I_e + P31ex * dI_ex + P32ex * I_ex - + P31in * dI_in + P32in * I_in + expm1_tau_m * V_m_rel + V_m_rel; + else + { // neuron is not refractory, so evolve V + V_m_rel = + P30 * I_e + P31ex * dI_ex + P32ex * I_ex + P31in * dI_in + P32in * I_in + expm1_tau_m * V_m_rel + V_m_rel; } - + // alpha shape PSCs I_ex = P21ex * dI_ex + P22ex * I_ex; dI_ex *= P11ex; @@ -127,10 +124,11 @@ __global__ void iaf_psc_alpha_Update(int n_node, int i_node_0, float *var_arr, I_in = P21in * dI_in + P22in * I_in; dI_in *= P11in; - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); } } } @@ -141,90 +139,92 @@ iaf_psc_alpha::~iaf_psc_alpha() FreeParamArr(); } -int iaf_psc_alpha::Init(int i_node_0, int n_node, int n_port, - int i_group) +int +iaf_psc_alpha::Init( int i_node_0, int n_node, int n_port, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, n_port, i_group); + BaseNeuron::Init( i_node_0, n_node, n_port, i_group ); node_type_ = i_iaf_psc_alpha_model; n_scal_var_ = N_SCAL_VAR; n_var_ = n_scal_var_; n_scal_param_ = N_SCAL_PARAM; n_param_ = n_scal_param_; 
- + AllocParamArr(); AllocVarArr(); scal_var_name_ = iaf_psc_alpha_scal_var_name; scal_param_name_ = iaf_psc_alpha_scal_param_name; - SetScalParam(0, n_node, "tau_m", 10.0 ); // in ms - SetScalParam(0, n_node, "C_m", 250.0 ); // in pF - SetScalParam(0, n_node, "E_L", -70.0 ); // in mV - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA - SetScalParam(0, n_node, "Theta_rel", -55.0 - (-70.0) ); // relative to E_L_ - SetScalParam(0, n_node, "V_reset_rel", -70.0 - (-70.0) ); // relative to E_L_ - SetScalParam(0, n_node, "tau_syn_ex", 2.0 ); // in ms - SetScalParam(0, n_node, "tau_syn_in", 2.0 ); // in ms - SetScalParam(0, n_node, "t_ref", 2.0 ); // in ms - SetScalParam(0, n_node, "den_delay", 0.0); // in ms - SetScalParam(0, n_node, "P11ex", 0.0); - SetScalParam(0, n_node, "P11in", 0.0); - SetScalParam(0, n_node, "P21ex", 0.0); - SetScalParam(0, n_node, "P21in", 0.0); - SetScalParam(0, n_node, "P22ex", 0.0); - SetScalParam(0, n_node, "P22in", 0.0); - SetScalParam(0, n_node, "P31ex", 0.0); - SetScalParam(0, n_node, "P31in", 0.0); - SetScalParam(0, n_node, "P32ex", 0.0); - SetScalParam(0, n_node, "P32in", 0.0); - SetScalParam(0, n_node, "P30", 0.0); - SetScalParam(0, n_node, "P33", 0.0); - SetScalParam(0, n_node, "EPSCInitialValue", 0.0); - SetScalParam(0, n_node, "IPSCInitialValue", 0.0); - - SetScalVar(0, n_node, "I_syn_ex", 0.0 ); - SetScalVar(0, n_node, "dI_ex", 0.0 ); - SetScalVar(0, n_node, "I_syn_in", 0.0 ); - SetScalVar(0, n_node, "dI_in", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", -70.0 - (-70.0) ); // in mV, relative to E_L - SetScalVar(0, n_node, "refractory_step", 0 ); - - port_weight_arr_ = GetParamArr() + GetScalParamIdx("EPSCInitialValue"); + SetScalParam( 0, n_node, "tau_m", 10.0 ); // in ms + SetScalParam( 0, n_node, "C_m", 250.0 ); // in pF + SetScalParam( 0, n_node, "E_L", -70.0 ); // in mV + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "Theta_rel", -55.0 - ( -70.0 ) ); // relative to E_L_ + SetScalParam( 0, n_node, 
"V_reset_rel", -70.0 - ( -70.0 ) ); // relative to E_L_ + SetScalParam( 0, n_node, "tau_syn_ex", 2.0 ); // in ms + SetScalParam( 0, n_node, "tau_syn_in", 2.0 ); // in ms + SetScalParam( 0, n_node, "t_ref", 2.0 ); // in ms + SetScalParam( 0, n_node, "den_delay", 0.0 ); // in ms + SetScalParam( 0, n_node, "P11ex", 0.0 ); + SetScalParam( 0, n_node, "P11in", 0.0 ); + SetScalParam( 0, n_node, "P21ex", 0.0 ); + SetScalParam( 0, n_node, "P21in", 0.0 ); + SetScalParam( 0, n_node, "P22ex", 0.0 ); + SetScalParam( 0, n_node, "P22in", 0.0 ); + SetScalParam( 0, n_node, "P31ex", 0.0 ); + SetScalParam( 0, n_node, "P31in", 0.0 ); + SetScalParam( 0, n_node, "P32ex", 0.0 ); + SetScalParam( 0, n_node, "P32in", 0.0 ); + SetScalParam( 0, n_node, "P30", 0.0 ); + SetScalParam( 0, n_node, "P33", 0.0 ); + SetScalParam( 0, n_node, "EPSCInitialValue", 0.0 ); + SetScalParam( 0, n_node, "IPSCInitialValue", 0.0 ); + + SetScalVar( 0, n_node, "I_syn_ex", 0.0 ); + SetScalVar( 0, n_node, "dI_ex", 0.0 ); + SetScalVar( 0, n_node, "I_syn_in", 0.0 ); + SetScalVar( 0, n_node, "dI_in", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", -70.0 - ( -70.0 ) ); // in mV, relative to E_L + SetScalVar( 0, n_node, "refractory_step", 0 ); + + port_weight_arr_ = GetParamArr() + GetScalParamIdx( "EPSCInitialValue" ); port_weight_arr_step_ = n_param_; port_weight_port_step_ = 1; - port_input_arr_ = GetVarArr() + GetScalVarIdx("dI_ex"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "dI_ex" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 1; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); - + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); + return 0; } -int iaf_psc_alpha::Update(long long it, double t1) +int +iaf_psc_alpha::Update( long long it, double t1 ) { // std::cout << "iaf_psc_alpha neuron update\n"; - iaf_psc_alpha_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_); + iaf_psc_alpha_Update<<< ( n_node_ + 1023 ) / 1024, 1024 
>>>( + n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_ ); // gpuErrchk( cudaDeviceSynchronize() ); - + return 0; } -int iaf_psc_alpha::Free() +int +iaf_psc_alpha::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); - + return 0; } -int iaf_psc_alpha::Calibrate(double, float time_resolution) +int +iaf_psc_alpha::Calibrate( double, float time_resolution ) { - iaf_psc_alpha_Calibrate<<<(n_node_+1023)/1024, 1024>>> - (n_node_, param_arr_, n_param_, time_resolution); + iaf_psc_alpha_Calibrate<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( n_node_, param_arr_, n_param_, time_resolution ); return 0; } diff --git a/src/iaf_psc_alpha.h b/src/iaf_psc_alpha.h index a019c07a6..842cac3bf 100644 --- a/src/iaf_psc_alpha.h +++ b/src/iaf_psc_alpha.h @@ -20,24 +20,18 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/iaf_psc_alpha.h - #ifndef IAFPSCALPHA_H #define IAFPSCALPHA_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include /* BeginUserDocs: neuron, integrate-and-fire, current-based @@ -106,7 +100,7 @@ References DOI: https://doi.org/10.1007/s004220050570 .. [2] Potjans TC. and Diesmann M. 2014. The cell-type specific cortical microcircuit: relating structure and activity in a full-scale spiking - network model. Cerebral Cortex. 24(3):785–806. + network model. Cerebral Cortex. 24(3):785–806. DOI: https://doi.org/10.1093/cercor/bhs358. See also @@ -116,32 +110,33 @@ iaf_psc_exp EndUserDocs */ - namespace iaf_psc_alpha_ns { -enum ScalVarIndexes { - i_I_ex = 0, // postsynaptic current for exc. inputs - i_I_in, // postsynaptic current for inh. inputs +enum ScalVarIndexes +{ + i_I_ex = 0, // postsynaptic current for exc. inputs + i_I_in, // postsynaptic current for inh. 
inputs i_dI_ex, i_dI_in, - i_V_m_rel, // membrane potential - i_refractory_step, // refractory step counter + i_V_m_rel, // membrane potential + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_tau_m = 0, // Membrane time constant in ms - i_C_m, // Membrane capacitance in pF - i_E_L, // Resting potential in mV - i_I_e, // External current in pA - i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTAIL(!) - // i.e. the real threshold is (E_L_+Theta_rel_) - i_V_reset_rel, // relative reset value of the membrane potential - i_tau_ex, // Time constant of excitatory synaptic current in ms - i_tau_in, // Time constant of inhibitory synaptic current in ms +enum ScalParamIndexes +{ + i_tau_m = 0, // Membrane time constant in ms + i_C_m, // Membrane capacitance in pF + i_E_L, // Resting potential in mV + i_I_e, // External current in pA + i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTAIL(!) + // i.e. the real threshold is (E_L_+Theta_rel_) + i_V_reset_rel, // relative reset value of the membrane potential + i_tau_ex, // Time constant of excitatory synaptic current in ms + i_tau_in, // Time constant of inhibitory synaptic current in ms // i_rho, // Stochastic firing intensity at threshold in 1/s // i_delta, // Width of threshold region in mV - i_t_ref, // Refractory period in ms + i_t_ref, // Refractory period in ms i_den_delay, // dendritic backpropagation delay // time evolution operator i_P11ex, @@ -162,19 +157,14 @@ enum ScalParamIndexes { N_SCAL_PARAM }; - -const std::string iaf_psc_alpha_scal_var_name[N_SCAL_VAR] = { - "I_syn_ex", +const std::string iaf_psc_alpha_scal_var_name[ N_SCAL_VAR ] = { "I_syn_ex", "I_syn_in", "dI_ex", "dI_in", "V_m_rel", - "refractory_step" -}; - + "refractory_step" }; -const std::string iaf_psc_alpha_scal_param_name[N_SCAL_PARAM] = { - "tau_m", +const std::string iaf_psc_alpha_scal_param_name[ N_SCAL_PARAM ] = { "tau_m", "C_m", "E_L", "I_e", @@ -198,25 +188,22 @@ const std::string 
iaf_psc_alpha_scal_param_name[N_SCAL_PARAM] = { "P33", "expm1_tau_m", "EPSCInitialValue", - "IPSCInitialValue" -}; + "IPSCInitialValue" }; + +} // namespace iaf_psc_alpha_ns -} // namespace - class iaf_psc_alpha : public BaseNeuron { - public: +public: ~iaf_psc_alpha(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - int Calibrate(double, float time_resolution); - - int Update(long long it, double t1); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); - int Free(); + int Calibrate( double, float time_resolution ); -}; + int Update( long long it, double t1 ); + int Free(); +}; #endif diff --git a/src/iaf_psc_exp.cu b/src/iaf_psc_exp.cu index 6f5012736..8af4a1be6 100644 --- a/src/iaf_psc_exp.cu +++ b/src/iaf_psc_exp.cu @@ -20,92 +20,91 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/iaf_psc_exp.cpp -#include -#include -#include #include "iaf_psc_exp.h" -#include "spike_buffer.h" #include "propagator_stability.h" +#include "spike_buffer.h" +#include +#include +#include using namespace iaf_psc_exp_ns; extern __constant__ float NESTGPUTimeResolution; -extern __device__ double propagator_32(double, double, double, double); - -#define I_syn_ex var[i_I_syn_ex] -#define I_syn_in var[i_I_syn_in] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] - -#define tau_m param[i_tau_m] -#define C_m param[i_C_m] -#define E_L param[i_E_L] -#define I_e param[i_I_e] -#define Theta_rel param[i_Theta_rel] -#define V_reset_rel param[i_V_reset_rel] -#define tau_ex param[i_tau_ex] -#define tau_in param[i_tau_in] -//#define rho param[i_rho] -//#define delta param[i_delta] -#define t_ref param[i_t_ref] -#define den_delay param[i_den_delay] - -#define P20 param[i_P20] -#define P11ex param[i_P11ex] -#define P11in param[i_P11in] -#define P21ex param[i_P21ex] -#define P21in param[i_P21in] -#define P22 param[i_P22] - - -__global__ void iaf_psc_exp_Calibrate(int n_node, float *param_arr, - 
int n_param, float h) +extern __device__ double propagator_32( double, double, double, double ); + +#define I_syn_ex var[ i_I_syn_ex ] +#define I_syn_in var[ i_I_syn_in ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] + +#define tau_m param[ i_tau_m ] +#define C_m param[ i_C_m ] +#define E_L param[ i_E_L ] +#define I_e param[ i_I_e ] +#define Theta_rel param[ i_Theta_rel ] +#define V_reset_rel param[ i_V_reset_rel ] +#define tau_ex param[ i_tau_ex ] +#define tau_in param[ i_tau_in ] +// #define rho param[i_rho] +// #define delta param[i_delta] +#define t_ref param[ i_t_ref ] +#define den_delay param[ i_den_delay ] + +#define P20 param[ i_P20 ] +#define P11ex param[ i_P11ex ] +#define P11in param[ i_P11in ] +#define P21ex param[ i_P21ex ] +#define P21in param[ i_P21in ] +#define P22 param[ i_P22 ] + +__global__ void +iaf_psc_exp_Calibrate( int n_node, float* param_arr, int n_param, float h ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn_ex * P21ex + I_syn_in * P21in + I_e * P20; } // exponential decaying PSCs I_syn_ex *= P11ex; I_syn_in *= P11in; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - } + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + } } } @@ -115,87 +114,88 @@ iaf_psc_exp::~iaf_psc_exp() FreeParamArr(); } -int iaf_psc_exp::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int 
+iaf_psc_exp::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 2 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 2 /*n_port*/, i_group ); node_type_ = i_iaf_psc_exp_model; n_scal_var_ = N_SCAL_VAR; n_var_ = n_scal_var_; n_scal_param_ = N_SCAL_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); scal_var_name_ = iaf_psc_exp_scal_var_name; scal_param_name_ = iaf_psc_exp_scal_param_name; - SetScalParam(0, n_node, "tau_m", 10.0 ); // in ms - SetScalParam(0, n_node, "C_m", 250.0 ); // in pF - SetScalParam(0, n_node, "E_L", -70.0 ); // in mV - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA - SetScalParam(0, n_node, "Theta_rel", -55.0 - (-70.0) ); // relative to E_L_ - SetScalParam(0, n_node, "V_reset_rel", -70.0 - (-70.0) ); // relative to E_L_ - SetScalParam(0, n_node, "tau_ex", 2.0 ); // in ms - SetScalParam(0, n_node, "tau_in", 2.0 ); // in ms + SetScalParam( 0, n_node, "tau_m", 10.0 ); // in ms + SetScalParam( 0, n_node, "C_m", 250.0 ); // in pF + SetScalParam( 0, n_node, "E_L", -70.0 ); // in mV + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "Theta_rel", -55.0 - ( -70.0 ) ); // relative to E_L_ + SetScalParam( 0, n_node, "V_reset_rel", -70.0 - ( -70.0 ) ); // relative to E_L_ + SetScalParam( 0, n_node, "tau_ex", 2.0 ); // in ms + SetScalParam( 0, n_node, "tau_in", 2.0 ); // in ms // SetScalParam(0, n_node, "rho", 0.01 ); // in 1/s // SetScalParam(0, n_node, "delta", 0.0 ); // in mV - SetScalParam(0, n_node, "t_ref", 2.0 ); // in ms - SetScalParam(0, n_node, "den_delay", 0.0); // in ms - SetScalParam(0, n_node, "P20", 0.0); - SetScalParam(0, n_node, "P11ex", 0.0); - SetScalParam(0, n_node, "P11in", 0.0); - SetScalParam(0, n_node, "P21ex", 0.0); - SetScalParam(0, n_node, "P21in", 0.0); - SetScalParam(0, n_node, "P22", 0.0); - - SetScalVar(0, n_node, "I_syn_ex", 0.0 ); - SetScalVar(0, n_node, "I_syn_in", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", -70.0 - (-70.0) 
); // in mV, relative to E_L - SetScalVar(0, n_node, "refractory_step", 0 ); + SetScalParam( 0, n_node, "t_ref", 2.0 ); // in ms + SetScalParam( 0, n_node, "den_delay", 0.0 ); // in ms + SetScalParam( 0, n_node, "P20", 0.0 ); + SetScalParam( 0, n_node, "P11ex", 0.0 ); + SetScalParam( 0, n_node, "P11in", 0.0 ); + SetScalParam( 0, n_node, "P21ex", 0.0 ); + SetScalParam( 0, n_node, "P21in", 0.0 ); + SetScalParam( 0, n_node, "P22", 0.0 ); + + SetScalVar( 0, n_node, "I_syn_ex", 0.0 ); + SetScalVar( 0, n_node, "I_syn_in", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", -70.0 - ( -70.0 ) ); // in mV, relative to E_L + SetScalVar( 0, n_node, "refractory_step", 0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn_ex, I_syn_in - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn_ex"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn_ex" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 1; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); - + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); + return 0; } -int iaf_psc_exp::Update(long long it, double t1) +int +iaf_psc_exp::Update( long long it, double t1 ) { // std::cout << "iaf_psc_exp neuron update\n"; - iaf_psc_exp_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_); + iaf_psc_exp_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_ ); // gpuErrchk( cudaDeviceSynchronize() ); - + return 0; } 
-int iaf_psc_exp::Free() +int +iaf_psc_exp::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); - + return 0; } -int iaf_psc_exp::Calibrate(double, float time_resolution) +int +iaf_psc_exp::Calibrate( double, float time_resolution ) { - iaf_psc_exp_Calibrate<<<(n_node_+1023)/1024, 1024>>> - (n_node_, param_arr_, n_param_, time_resolution); + iaf_psc_exp_Calibrate<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( n_node_, param_arr_, n_param_, time_resolution ); return 0; } diff --git a/src/iaf_psc_exp.h b/src/iaf_psc_exp.h index ee00de99f..be6595462 100644 --- a/src/iaf_psc_exp.h +++ b/src/iaf_psc_exp.h @@ -20,24 +20,18 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/iaf_psc_exp.h - #ifndef IAFPSCEXP_H #define IAFPSCEXP_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include /* BeginUserDocs: neuron, integrate-and-fire, current-based @@ -50,7 +44,7 @@ Description +++++++++++ iaf_psc_exp is an implementation of a leaky integrate-and-fire model -with exponential shaped postsynaptic currents (PSCs) according to +with exponential shaped postsynaptic currents (PSCs) according to equations 1, 2, 4 and 5 of [1]_ and equation 3 of [2]_. Thus, postsynaptic currents have an infinitely short rise time. @@ -109,7 +103,7 @@ References DOI: https://doi.org/10.1007/s004220050570 .. [4] Potjans TC. and Diesmann M. 2014. The cell-type specific cortical microcircuit: relating structure and activity in a full-scale spiking - network model. Cerebral Cortex. 24(3):785–806. + network model. Cerebral Cortex. 24(3):785–806. DOI: https://doi.org/10.1093/cercor/bhs358. See also @@ -119,30 +113,31 @@ iaf_psc_exp_g EndUserDocs */ - namespace iaf_psc_exp_ns { -enum ScalVarIndexes { - i_I_syn_ex = 0, // postsynaptic current for exc. inputs - i_I_syn_in, // postsynaptic current for inh. 
inputs - i_V_m_rel, // membrane potential - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn_ex = 0, // postsynaptic current for exc. inputs + i_I_syn_in, // postsynaptic current for inh. inputs + i_V_m_rel, // membrane potential + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_tau_m = 0, // Membrane time constant in ms - i_C_m, // Membrane capacitance in pF - i_E_L, // Resting potential in mV - i_I_e, // External current in pA - i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTAIL(!) - // i.e. the real threshold is (E_L_+Theta_rel_) - i_V_reset_rel, // relative reset value of the membrane potential - i_tau_ex, // Time constant of excitatory synaptic current in ms - i_tau_in, // Time constant of inhibitory synaptic current in ms +enum ScalParamIndexes +{ + i_tau_m = 0, // Membrane time constant in ms + i_C_m, // Membrane capacitance in pF + i_E_L, // Resting potential in mV + i_I_e, // External current in pA + i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTAIL(!) + // i.e. 
the real threshold is (E_L_+Theta_rel_) + i_V_reset_rel, // relative reset value of the membrane potential + i_tau_ex, // Time constant of excitatory synaptic current in ms + i_tau_in, // Time constant of inhibitory synaptic current in ms // i_rho, // Stochastic firing intensity at threshold in 1/s // i_delta, // Width of threshold region in mV - i_t_ref, // Refractory period in ms + i_t_ref, // Refractory period in ms i_den_delay, // dendritic backpropagation delay // time evolution operator i_P20, @@ -154,17 +149,9 @@ enum ScalParamIndexes { N_SCAL_PARAM }; - -const std::string iaf_psc_exp_scal_var_name[N_SCAL_VAR] = { - "I_syn_ex", - "I_syn_in", - "V_m_rel", - "refractory_step" -}; - +const std::string iaf_psc_exp_scal_var_name[ N_SCAL_VAR ] = { "I_syn_ex", "I_syn_in", "V_m_rel", "refractory_step" }; -const std::string iaf_psc_exp_scal_param_name[N_SCAL_PARAM] = { - "tau_m", +const std::string iaf_psc_exp_scal_param_name[ N_SCAL_PARAM ] = { "tau_m", "C_m", "E_L", "I_e", @@ -181,26 +168,22 @@ const std::string iaf_psc_exp_scal_param_name[N_SCAL_PARAM] = { "P11in", "P21ex", "P21in", - "P22" -}; + "P22" }; + +} // namespace iaf_psc_exp_ns -} // namespace - class iaf_psc_exp : public BaseNeuron { - public: +public: ~iaf_psc_exp(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double, float time_resolution); - - int Update(long long it, double t1); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); - int Free(); + int Calibrate( double, float time_resolution ); -}; + int Update( long long it, double t1 ); + int Free(); +}; #endif diff --git a/src/iaf_psc_exp_g.cu b/src/iaf_psc_exp_g.cu index 01d17d689..49a46086e 100644 --- a/src/iaf_psc_exp_g.cu +++ b/src/iaf_psc_exp_g.cu @@ -20,74 +20,82 @@ * */ - - - - -#include -#include -#include #include "iaf_psc_exp_g.h" #include "spike_buffer.h" +#include +#include +#include using namespace iaf_psc_exp_g_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn 
var[i_I_syn] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] - -#define tau_m_ group_param_[i_tau_m] -#define C_m_ group_param_[i_C_m] -#define E_L_ group_param_[i_E_L] -#define Theta_rel_ group_param_[i_Theta_rel] -#define V_reset_rel_ group_param_[i_V_reset_rel] -#define tau_syn_ group_param_[i_tau_syn] -#define t_ref_ group_param_[i_t_ref] - -__global__ void iaf_psc_exp_g_Update -( int n_node, int i_node_0, float *var_arr, float *param_arr, int n_var, - int n_param, float Theta_rel, float V_reset_rel, int n_refractory_steps, - float P11, float P22, float P21, float P20 ) +#define I_syn var[ i_I_syn ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] + +#define tau_m_ group_param_[ i_tau_m ] +#define C_m_ group_param_[ i_C_m ] +#define E_L_ group_param_[ i_E_L ] +#define Theta_rel_ group_param_[ i_Theta_rel ] +#define V_reset_rel_ group_param_[ i_V_reset_rel ] +#define tau_syn_ group_param_[ i_tau_syn ] +#define t_ref_ group_param_[ i_t_ref ] + +__global__ void +iaf_psc_exp_g_Update( int n_node, + int i_node_0, + float* var_arr, + float* param_arr, + int n_var, + int n_param, + float Theta_rel, + float V_reset_rel, + int n_refractory_steps, + float P11, + float P22, + float P21, + float P20 ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn * P21 + I_e * P20; } // exponential decaying PSC I_syn *= P11; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( 
i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; refractory_step = n_refractory_steps; - } + } } } -double h_propagator_32( double tau_syn, double tau, double C, double h ) +double +h_propagator_32( double tau_syn, double tau, double C, double h ) { - const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h - * ( tau_syn - tau ) * exp( -h / tau ); + const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h * ( tau_syn - tau ) * exp( -h / tau ); const double P32_singular = h / C * exp( -h / tau ); const double P32 = - -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn ) - * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); + -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn ) * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); const double dev_P32 = fabs( P32 - P32_singular ); - if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && dev_P32 > 2.0 - * fabs( P32_linear ) ) ) + if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && dev_P32 > 2.0 * fabs( P32_linear ) ) ) { return P32_singular; } @@ -103,10 +111,10 @@ iaf_psc_exp_g::~iaf_psc_exp_g() FreeParamArr(); } -int iaf_psc_exp_g::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +iaf_psc_exp_g::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_iaf_psc_exp_g_model; n_scal_var_ = N_SCAL_VAR; @@ -114,46 +122,46 @@ int iaf_psc_exp_g::Init(int i_node_0, int n_node, int /*n_port*/, n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); - group_param_ = new float[N_GROUP_PARAM]; + group_param_ = new float[ N_GROUP_PARAM ]; scal_var_name_ = iaf_psc_exp_g_scal_var_name; scal_param_name_ = iaf_psc_exp_g_scal_param_name; group_param_name_ = iaf_psc_exp_g_group_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA 
- SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", 0.0 ); // in mV - SetScalVar(0, n_node, "refractory_step", 0 ); + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", 0.0 ); // in mV + SetScalVar( 0, n_node, "refractory_step", 0 ); - SetGroupParam("tau_m", 10.0); - SetGroupParam("C_m", 250.0); - SetGroupParam("E_L", -65.0); - SetGroupParam("Theta_rel", 15.0); - SetGroupParam("V_reset_rel", 0.0); - SetGroupParam("tau_syn", 0.5); - SetGroupParam("t_ref", 2.0); + SetGroupParam( "tau_m", 10.0 ); + SetGroupParam( "C_m", 250.0 ); + SetGroupParam( "E_L", -65.0 ); + SetGroupParam( "Theta_rel", 15.0 ); + SetGroupParam( "V_reset_rel", 0.0 ); + SetGroupParam( "tau_syn", 0.5 ); + SetGroupParam( "t_ref", 2.0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int iaf_psc_exp_g::Update(long long it, double t1) +int +iaf_psc_exp_g::Update( long long it, double t1 ) { // std::cout << "iaf_psc_exp_g neuron update\n"; float h = time_resolution_; @@ -161,21 +169,32 @@ int iaf_psc_exp_g::Update(long long it, double t1) float P22 = exp( -h / tau_m_ ); float P21 = h_propagator_32( tau_syn_, tau_m_, C_m_, h ); float P20 = tau_m_ / C_m_ * ( 1.0 - P22 ); - int n_refractory_steps = int(round(t_ref_ / h)); + int n_refractory_steps = int( round( t_ref_ / h ) ); + + 
iaf_psc_exp_g_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( n_node_, + i_node_0_, + var_arr_, + param_arr_, + n_var_, + n_param_, + Theta_rel_, + V_reset_rel_, + n_refractory_steps, + P11, + P22, + P21, + P20 ); + // gpuErrchk( cudaDeviceSynchronize() ); - iaf_psc_exp_g_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, - Theta_rel_, V_reset_rel_, n_refractory_steps, P11, P22, P21, P20 ); - //gpuErrchk( cudaDeviceSynchronize() ); - return 0; } -int iaf_psc_exp_g::Free() +int +iaf_psc_exp_g::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); delete[] group_param_; - + return 0; } diff --git a/src/iaf_psc_exp_g.h b/src/iaf_psc_exp_g.h index ae142f755..546b4fb5e 100644 --- a/src/iaf_psc_exp_g.h +++ b/src/iaf_psc_exp_g.h @@ -20,37 +20,32 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/iaf_psc_exp.h - #ifndef IAFPSCEXPG_H #define IAFPSCEXPG_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include /* BeginUserDocs: neuron, integrate-and-fire, current-based Short description +++++++++++++++++ -Leaky integrate-and-fire neuron model with exponential PSCs and same parameters within a population +Leaky integrate-and-fire neuron model with exponential PSCs and same parameters +within a population Description +++++++++++ iaf_psc_exp_g is an implementation of a leaky integrate-and-fire model -with exponential shaped postsynaptic currents (PSCs) according to +with exponential shaped postsynaptic currents (PSCs) according to equations 1, 2, 4 and 5 of [1]_ and equation 3 of [2]_. Thus, postsynaptic currents have an infinitely short rise time. @@ -116,7 +111,7 @@ References DOI: https://doi.org/10.1007/s004220050570 .. [4] Potjans TC. and Diesmann M. 2014. 
The cell-type specific cortical microcircuit: relating structure and activity in a full-scale spiking - network model. Cerebral Cortex. 24(3):785–806. + network model. Cerebral Cortex. 24(3):785–806. DOI: https://doi.org/10.1093/cercor/bhs358. See also @@ -126,79 +121,68 @@ iaf_psc_exp EndUserDocs */ - namespace iaf_psc_exp_g_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m_rel, // membrane potential relative to E_L - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. inputs + i_V_m_rel, // membrane potential relative to E_L + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA N_SCAL_PARAM }; -enum GroupParamIndexes { - i_tau_m = 0, // Membrane time constant in ms - i_C_m, // Membrane capacitance in pF - i_E_L, // Resting potential in mV - i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTIAL(!) - // i.e. the real threshold is (E_L_+Theta_rel_) - i_V_reset_rel, // relative reset value of the membrane potential - i_tau_syn, // Time constant of synaptic current in ms - i_t_ref, // Refractory period in ms +enum GroupParamIndexes +{ + i_tau_m = 0, // Membrane time constant in ms + i_C_m, // Membrane capacitance in pF + i_E_L, // Resting potential in mV + i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTIAL(!) + // i.e. 
the real threshold is (E_L_+Theta_rel_) + i_V_reset_rel, // relative reset value of the membrane potential + i_tau_syn, // Time constant of synaptic current in ms + i_t_ref, // Refractory period in ms N_GROUP_PARAM }; +const std::string iaf_psc_exp_g_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m_rel", "refractory_step" }; - -const std::string iaf_psc_exp_g_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m_rel", - "refractory_step" -}; - -const std::string iaf_psc_exp_g_scal_param_name[N_SCAL_PARAM] = { - "I_e" -}; +const std::string iaf_psc_exp_g_scal_param_name[ N_SCAL_PARAM ] = { "I_e" }; -const std::string iaf_psc_exp_g_group_param_name[N_GROUP_PARAM] = { - "tau_m", +const std::string iaf_psc_exp_g_group_param_name[ N_GROUP_PARAM ] = { "tau_m", "C_m", "E_L", "Theta_rel", "V_reset_rel", "tau_syn", - "t_ref" -}; - -} // namespace - - + "t_ref" }; +} // namespace iaf_psc_exp_g_ns class iaf_psc_exp_g : public BaseNeuron { float time_resolution_; - public: +public: ~iaf_psc_exp_g(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double /*time_min*/, float time_res) { + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int + Calibrate( double /*time_min*/, float time_res ) + { time_resolution_ = time_res; return 0; } - - int Update(long long it, double t1); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/iaf_psc_exp_hc.cu b/src/iaf_psc_exp_hc.cu index 743a105ae..a532ca975 100644 --- a/src/iaf_psc_exp_hc.cu +++ b/src/iaf_psc_exp_hc.cu @@ -20,51 +20,50 @@ * */ - - - - -#include -#include -#include #include "iaf_psc_exp_hc.h" #include "spike_buffer.h" +#include +#include +#include using namespace iaf_psc_exp_hc_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn var[i_I_syn] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] +#define I_syn var[ i_I_syn ] +#define V_m_rel var[ i_V_m_rel ] +#define 
refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] #include "iaf_psc_exp_hc_params.h" -__global__ void iaf_psc_exp_hc_Update(int n_node, int i_node_0, - float *var_arr, float *param_arr, - int n_var, int n_param) +__global__ void +iaf_psc_exp_hc_Update( int n_node, int i_node_0, float* var_arr, float* param_arr, int n_var, int n_param ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn * P21 + I_e * P20; } // exponential decaying PSC I_syn *= P11; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; refractory_step = n_refractory_steps; - } + } } } @@ -74,59 +73,60 @@ iaf_psc_exp_hc::~iaf_psc_exp_hc() FreeParamArr(); } -int iaf_psc_exp_hc::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +iaf_psc_exp_hc::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_iaf_psc_exp_hc_model; n_scal_var_ = N_SCAL_VAR; n_var_ = n_scal_var_; n_scal_param_ = N_SCAL_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); scal_var_name_ = iaf_psc_exp_hc_scal_var_name; scal_param_name_ = iaf_psc_exp_hc_scal_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", 0.0 ); // in mV - SetScalVar(0, n_node, "refractory_step", 0 ); + 
SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", 0.0 ); // in mV + SetScalVar( 0, n_node, "refractory_step", 0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int iaf_psc_exp_hc::Update(long long it, double t1) +int +iaf_psc_exp_hc::Update( long long it, double t1 ) { // std::cout << "iaf_psc_exp_hc neuron update\n"; - iaf_psc_exp_hc_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_); - //gpuErrchk( cudaDeviceSynchronize() ); - + iaf_psc_exp_hc_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_ ); + // gpuErrchk( cudaDeviceSynchronize() ); + return 0; } -int iaf_psc_exp_hc::Free() +int +iaf_psc_exp_hc::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); - + return 0; } diff --git a/src/iaf_psc_exp_hc.h b/src/iaf_psc_exp_hc.h index 58c1d868a..142294ff1 100644 --- a/src/iaf_psc_exp_hc.h +++ b/src/iaf_psc_exp_hc.h @@ -20,65 +20,51 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/iaf_psc_exp.h - #ifndef IAFPSCEXPHC_H #define IAFPSCEXPHC_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" 
+#include +#include namespace iaf_psc_exp_hc_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m_rel, // membrane potential relative to E_L - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. inputs + i_V_m_rel, // membrane potential relative to E_L + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA N_SCAL_PARAM }; - const std::string iaf_psc_exp_hc_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m_rel", - "refractory_step" -}; +const std::string iaf_psc_exp_hc_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m_rel", "refractory_step" }; -const std::string iaf_psc_exp_hc_scal_param_name[N_SCAL_PARAM] = { - "I_e" -}; +const std::string iaf_psc_exp_hc_scal_param_name[ N_SCAL_PARAM ] = { "I_e" }; -} // namespace - +} // namespace iaf_psc_exp_hc_ns class iaf_psc_exp_hc : public BaseNeuron { - public: +public: ~iaf_psc_exp_hc(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Update(long long it, double t1); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/izhikevich.cu b/src/izhikevich.cu index 69b127e8f..9e362053e 100644 --- a/src/izhikevich.cu +++ b/src/izhikevich.cu @@ -20,81 +20,90 @@ * */ - - - - -#include -#include -#include #include "izhikevich.h" #include "spike_buffer.h" +#include +#include +#include using namespace izhikevich_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn var[i_I_syn] -#define V_m var[i_V_m] -#define u var[i_u] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] -#define den_delay param[i_den_delay] - -#define V_th_ group_param_[i_V_th] -#define a_ group_param_[i_a] -#define b_ group_param_[i_b] -#define c_ group_param_[i_c] 
-#define d_ group_param_[i_d] -#define t_ref_ group_param_[i_t_ref] - -__global__ void izhikevich_Update -( int n_node, int i_node_0, float *var_arr, float *param_arr, int n_var, - int n_param, float V_th, float a, float b, float c, float d, - int n_refractory_steps, float h) +#define I_syn var[ i_I_syn ] +#define V_m var[ i_V_m ] +#define u var[ i_u ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] +#define den_delay param[ i_den_delay ] + +#define V_th_ group_param_[ i_V_th ] +#define a_ group_param_[ i_a ] +#define b_ group_param_[ i_b ] +#define c_ group_param_[ i_c ] +#define d_ group_param_[ i_d ] +#define t_ref_ group_param_[ i_t_ref ] + +__global__ void +izhikevich_Update( int n_node, + int i_node_0, + float* var_arr, + float* param_arr, + int n_var, + int n_param, + float V_th, + float a, + float b, + float c, + float d, + int n_refractory_steps, + float h ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V and u + else + { // neuron is not refractory, so evolve V and u float v_old = V_m; float u_old = u; - V_m += h*(0.04 * v_old * v_old + 5.0 * v_old + 140.0 - u_old - + I_e) + I_syn; - u += h*a*(b*v_old - u_old); + V_m += h * ( 0.04 * v_old * v_old + 5.0 * v_old + 140.0 - u_old + I_e ) + I_syn; + u += h * a * ( b * v_old - u_old ); } I_syn = 0; - - if ( V_m >= V_th ) { // send spike - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m >= V_th ) + { // send spike + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m = c; u += d; // spike-driven adaptation refractory_step = n_refractory_steps; - if (refractory_step<0) { - refractory_step = 0; + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - izhikevich::~izhikevich() { 
FreeVarArr(); FreeParamArr(); } -int izhikevich::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +izhikevich::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_izhikevich_model; n_scal_var_ = N_SCAL_VAR; @@ -102,65 +111,65 @@ int izhikevich::Init(int i_node_0, int n_node, int /*n_port*/, n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); - group_param_ = new float[N_GROUP_PARAM]; + group_param_ = new float[ N_GROUP_PARAM ]; scal_var_name_ = izhikevich_scal_var_name; scal_param_name_ = izhikevich_scal_param_name; group_param_name_ = izhikevich_group_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA - SetScalParam(0, n_node, "den_delay", 0.0 ); // in ms - - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m", -70.0 ); // in mV - SetScalVar(0, n_node, "u", -70.0*0.2 ); - SetScalVar(0, n_node, "refractory_step", 0 ); - - SetGroupParam("V_th", 30.0); - SetGroupParam("a", 0.02); - SetGroupParam("b", 0.2); - SetGroupParam("c", -65.0); - SetGroupParam("d", 8.0); - SetGroupParam("t_ref", 0.0); + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "den_delay", 0.0 ); // in ms + + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m", -70.0 ); // in mV + SetScalVar( 0, n_node, "u", -70.0 * 0.2 ); + SetScalVar( 0, n_node, "refractory_step", 0 ); + + SetGroupParam( "V_th", 30.0 ); + SetGroupParam( "a", 0.02 ); + SetGroupParam( "b", 0.2 ); + SetGroupParam( "c", -65.0 ); + SetGroupParam( "d", 8.0 ); + SetGroupParam( "t_ref", 0.0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), 
cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int izhikevich::Update(long long it, double t1) +int +izhikevich::Update( long long it, double t1 ) { // std::cout << "izhikevich neuron update\n"; float h = time_resolution_; - int n_refractory_steps = int(round(t_ref_ / h)); + int n_refractory_steps = int( round( t_ref_ / h ) ); + + izhikevich_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, V_th_, a_, b_, c_, d_, n_refractory_steps, h ); + // gpuErrchk( cudaDeviceSynchronize() ); - izhikevich_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, - V_th_, a_, b_, c_, d_, n_refractory_steps, h); - //gpuErrchk( cudaDeviceSynchronize() ); - return 0; } -int izhikevich::Free() +int +izhikevich::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); delete[] group_param_; - + return 0; } diff --git a/src/izhikevich.h b/src/izhikevich.h index a98d04b85..f37425b37 100644 --- a/src/izhikevich.h +++ b/src/izhikevich.h @@ -20,19 +20,15 @@ * */ - - - - #ifndef IZHIKEVICH_H #define IZHIKEVICH_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include +#include /* BeginUserDocs: neuron, integrate-and-fire @@ -86,85 +82,68 @@ References ++++++++++ .. [1] Izhikevich EM (2003). Simple model of spiking neurons. IEEE Transactions - on Neural Networks, 14:1569-1572. 
DOI: https://doi.org/10.1109/TNN.2003.820440 + on Neural Networks, 14:1569-1572. DOI: +https://doi.org/10.1109/TNN.2003.820440 EndUserDocs */ - namespace izhikevich_ns { -enum ScalVarIndexes { - i_I_syn = 0, // input current - i_V_m, // membrane potential +enum ScalVarIndexes +{ + i_I_syn = 0, // input current + i_V_m, // membrane potential i_u, - i_refractory_step, // refractory step counter + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA i_den_delay, N_SCAL_PARAM }; -enum GroupParamIndexes { +enum GroupParamIndexes +{ i_V_th = 0, i_a, i_b, i_c, i_d, - i_t_ref, // Refractory period in ms + i_t_ref, // Refractory period in ms N_GROUP_PARAM }; +const std::string izhikevich_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m", "u", "refractory_step" }; - -const std::string izhikevich_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m", - "u", - "refractory_step" -}; - -const std::string izhikevich_scal_param_name[N_SCAL_PARAM] = { - "I_e", - "den_delay" -}; - -const std::string izhikevich_group_param_name[N_GROUP_PARAM] = { - "V_th", - "a", - "b", - "c", - "d", - "t_ref" -}; - -} // namespace - +const std::string izhikevich_scal_param_name[ N_SCAL_PARAM ] = { "I_e", "den_delay" }; +const std::string izhikevich_group_param_name[ N_GROUP_PARAM ] = { "V_th", "a", "b", "c", "d", "t_ref" }; +} // namespace izhikevich_ns class izhikevich : public BaseNeuron { float time_resolution_; - public: +public: ~izhikevich(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double /*time_min*/, float time_res) { + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int + Calibrate( double /*time_min*/, float time_res ) + { time_resolution_ = time_res; return 0; } - - int Update(long long it, double t1); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git 
a/src/izhikevich_cond_beta.cu b/src/izhikevich_cond_beta.cu index 80f42706c..63acad023 100644 --- a/src/izhikevich_cond_beta.cu +++ b/src/izhikevich_cond_beta.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include +#include "izhikevich_cond_beta.h" #include "izhikevich_cond_beta_kernel.h" #include "rk5.h" -#include "izhikevich_cond_beta.h" +#include +#include +#include namespace izhikevich_cond_beta_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - izhikevich_cond_beta_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, izhikevich_cond_beta_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = 30.0; a = 0.02; @@ -49,133 +44,135 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, I_e = 0.0; t_ref = 0.0; den_delay = 0.0; - + V_m = -70.0; - u = b*V_m; + u = b * V_m; refractory_step = 0; - for (int i = 0; i -int izhikevich_cond_beta::UpdateNR<0>(long long it, double t1) +int +izhikevich_cond_beta::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int izhikevich_cond_beta::Update(long long it, double t1) { - UpdateNR(it, t1); +int +izhikevich_cond_beta::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/izhikevich_cond_beta.h b/src/izhikevich_cond_beta.h index 1d3537469..e06533bd7 100644 --- a/src/izhikevich_cond_beta.h +++ b/src/izhikevich_cond_beta.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef IZHIKEVICHCONDBETA_H #define IZHIKEVICHCONDBETA_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include /* BeginUserDocs: neuron, 
integrate-and-fire @@ -64,15 +60,15 @@ The dynamics are given by: &v \text{ jumps on each spike arrival by the weight of the spike} This implementation uses the standard technique for forward Euler integration. -This model is multisynapse, so it allows an arbitrary number of synaptic -rise time and decay time constants. The number of receptor ports must be specified -at neuron creation (default value is 1) and the receptor index starts from 0 -(and not from 1 as in NEST multisynapse models). -The time constants are supplied by by two arrays, ``tau_rise`` and ``tau_decay`` for -the synaptic rise time and decay time, respectively. The synaptic -reversal potentials are supplied by the array ``E_rev``. Port numbers -are automatically assigned in the range from 0 to ``n_receptors-1``. -During connection, the ports are selected with the synapse property ``receptor``. +This model is multisynapse, so it allows an arbitrary number of synaptic +rise time and decay time constants. The number of receptor ports must be +specified at neuron creation (default value is 1) and the receptor index starts +from 0 (and not from 1 as in NEST multisynapse models). The time constants are +supplied by by two arrays, ``tau_rise`` and ``tau_decay`` for the synaptic rise +time and decay time, respectively. The synaptic reversal potentials are supplied +by the array ``E_rev``. Port numbers are automatically assigned in the range +from 0 to ``n_receptors-1``. During connection, the ports are selected with the +synapse property ``receptor``. Parameters ++++++++++ @@ -95,7 +91,7 @@ The following parameters can be set in the status dictionary. 
tau_decay ms Decay time constant of synaptic conductance h_min_rel real Starting step in ODE integration relative to time resolution - h0_rel real Minimum step in ODE integration relative to + h0_rel real Minimum step in ODE integration relative to time resolution ======================= ======= ============================================== @@ -103,7 +99,8 @@ References ++++++++++ .. [1] Izhikevich EM (2003). Simple model of spiking neurons. IEEE Transactions - on Neural Networks, 14:1569-1572. DOI: https://doi.org/10.1109/TNN.2003.820440 + on Neural Networks, 14:1569-1572. DOI: +https://doi.org/10.1109/TNN.2003.820440 .. [2] A. Roth and M. C. W. van Rossum, Computational Modeling Methods for Neuroscientists, MIT Press 2013, Chapter 6. @@ -126,30 +123,32 @@ struct izhikevich_cond_beta_rk5 class izhikevich_cond_beta : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< izhikevich_cond_beta_rk5 > rk5_; float h_min_; float h_; izhikevich_cond_beta_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/izhikevich_cond_beta_kernel.h b/src/izhikevich_cond_beta_kernel.h index 
3b5f22650..105706a10 100644 --- a/src/izhikevich_cond_beta_kernel.h +++ b/src/izhikevich_cond_beta_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef IZHIKEVICHCONDBETAKERNEL_H #define IZHIKEVICHCONDBETAKERNEL_H -#include -#include -#include "spike_buffer.h" -#include "node_group.h" #include "izhikevich_cond_beta.h" +#include "node_group.h" +#include "spike_buffer.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace izhikevich_cond_beta_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_u, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_g = 0, i_g1, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_a, i_b, @@ -64,7 +63,8 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_E_rev = 0, i_tau_rise, i_tau_decay, @@ -72,25 +72,18 @@ enum PortParamIndexes { N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. 
relative to time resolution N_GROUP_PARAM }; - -const std::string izhikevich_cond_beta_scal_var_name[N_SCAL_VAR] = { - "V_m", - "u" -}; +const std::string izhikevich_cond_beta_scal_var_name[ N_SCAL_VAR ] = { "V_m", "u" }; -const std::string izhikevich_cond_beta_port_var_name[N_PORT_VAR] = { - "g", - "g1" -}; +const std::string izhikevich_cond_beta_port_var_name[ N_PORT_VAR ] = { "g", "g1" }; -const std::string izhikevich_cond_beta_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string izhikevich_cond_beta_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "a", "b", "c", @@ -98,162 +91,152 @@ const std::string izhikevich_cond_beta_scal_param_name[N_SCAL_PARAM] = { "I_e", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string izhikevich_cond_beta_port_param_name[N_PORT_PARAM] = { - "E_rev", - "tau_rise", - "tau_decay", - "g0" -}; +const std::string izhikevich_cond_beta_port_param_name[ N_PORT_PARAM ] = { "E_rev", "tau_rise", "tau_decay", "g0" }; - -const std::string izhikevich_cond_beta_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string izhikevich_cond_beta_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define u y[i_u] -#define g(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define g1(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define dVdt dydx[i_V_m] -#define dudt dydx[i_u] -#define dgdt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define dg1dt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define V_th param[i_V_th] -#define a param[i_a] -#define b param[i_b] -#define c param[i_c] -#define d param[i_d] -#define I_e param[i_I_e] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define E_rev(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_E_rev] -#define tau_rise(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_rise] -#define tau_decay(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_decay] -#define g0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_g0] - - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - izhikevich_cond_beta_rk5 data_struct) +#define V_m y[ i_V_m ] +#define u y[ i_u ] +#define g( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define g1( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define dVdt dydx[ i_V_m ] +#define dudt dydx[ i_u ] +#define dgdt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define dg1dt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define V_th param[ i_V_th ] +#define a param[ i_a ] +#define b param[ i_b ] +#define c param[ i_c ] +#define d param[ i_d ] +#define I_e param[ i_I_e ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define E_rev( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_E_rev ] +#define tau_rise( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_rise ] +#define tau_decay( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_decay ] +#define g0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_g0 ] + 
+#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, izhikevich_cond_beta_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn = 0.0; - float V = ( refractory_step > 0 ) ? c : V_m; - for (int i = 0; i 0 ) ? c : V_m; + for ( int i = 0; i < n_port; i++ ) + { + I_syn += g( i ) * ( E_rev( i ) - V ); } - dVdt = ( refractory_step > 0 ) ? 0 : - 0.04 * V * V + 5.0 * V + 140.0 - u + I_syn + I_e; - - dudt = a*(b*V - u); + dVdt = ( refractory_step > 0 ) ? 0 : 0.04 * V * V + 5.0 * V + 140.0 - u + I_syn + I_e; + + dudt = a * ( b * V - u ); - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - izhikevich_cond_beta_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, izhikevich_cond_beta_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = c; - u=0; + u = 0; return; } - if ( u < -1.0e6 || u > 1.0e6) { // numerical instability - printf("u out of bound\n"); + if ( u < -1.0e6 || u > 1.0e6 ) + { // numerical instability + printf( "u out of bound\n" ); V_m = c; - u=0; + u = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = c; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_th ) { // send spike + else + { + if ( V_m >= V_th ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); 
V_m = c; u += d; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace izhikevich_cond_beta_ns template <> -int izhikevich_cond_beta::UpdateNR<0>(long long it, double t1); +int izhikevich_cond_beta::UpdateNR< 0 >( long long it, double t1 ); -template -int izhikevich_cond_beta::UpdateNR(long long it, double t1) +template < int N_PORT > +int +izhikevich_cond_beta::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = izhikevich_cond_beta_ns::N_SCAL_VAR - + izhikevich_cond_beta_ns::N_PORT_VAR*N_PORT; - const int NPARAM = izhikevich_cond_beta_ns::N_SCAL_PARAM - + izhikevich_cond_beta_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = izhikevich_cond_beta_ns::N_SCAL_VAR + izhikevich_cond_beta_ns::N_PORT_VAR * N_PORT; + const int NPARAM = izhikevich_cond_beta_ns::N_SCAL_PARAM + izhikevich_cond_beta_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - izhikevich_cond_beta_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, izhikevich_cond_beta_rk5 data_struct ) { - izhikevich_cond_beta_ns::Derivatives(x, y, dydx, param, - data_struct); + izhikevich_cond_beta_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - izhikevich_cond_beta_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, 
float* param, bool end_time_step, izhikevich_cond_beta_rk5 data_struct ) { - izhikevich_cond_beta_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + izhikevich_cond_beta_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/izhikevich_cond_beta_rk5.h b/src/izhikevich_cond_beta_rk5.h index c00f137e9..b732b511c 100644 --- a/src/izhikevich_cond_beta_rk5.h +++ b/src/izhikevich_cond_beta_rk5.h @@ -20,32 +20,22 @@ * */ - - - - #ifndef IZHIKEVICHCONDBETARK5_H #define IZHIKEVICHCONDBETARK5_H struct izhikevich_cond_beta_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, izhikevich_cond_beta_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - izhikevich_cond_beta_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - izhikevich_cond_beta_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, izhikevich_cond_beta_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, izhikevich_cond_beta_rk5 data_struct); +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, izhikevich_cond_beta_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, izhikevich_cond_beta_rk5 data_struct); +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, izhikevich_cond_beta_rk5 data_struct ); #endif diff --git a/src/izhikevich_psc_exp.cu b/src/izhikevich_psc_exp.cu index f0ff34a62..120811be8 100644 --- a/src/izhikevich_psc_exp.cu +++ b/src/izhikevich_psc_exp.cu @@ -20,83 +20,93 @@ * */ - - - - -#include -#include -#include #include "izhikevich_psc_exp.h" #include "spike_buffer.h" +#include 
+#include +#include using namespace izhikevich_psc_exp_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn var[i_I_syn] -#define V_m var[i_V_m] -#define u var[i_u] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] -#define den_delay param[i_den_delay] - -#define V_th_ group_param_[i_V_th] -#define a_ group_param_[i_a] -#define b_ group_param_[i_b] -#define c_ group_param_[i_c] -#define d_ group_param_[i_d] -#define tau_syn_ group_param_[i_tau_syn] -#define t_ref_ group_param_[i_t_ref] - -__global__ void izhikevich_psc_exp_Update -( int n_node, int i_node_0, float *var_arr, float *param_arr, int n_var, - int n_param, float V_th, float a, float b, float c, float d, - int n_refractory_steps, float h, float C_syn) +#define I_syn var[ i_I_syn ] +#define V_m var[ i_V_m ] +#define u var[ i_u ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] +#define den_delay param[ i_den_delay ] + +#define V_th_ group_param_[ i_V_th ] +#define a_ group_param_[ i_a ] +#define b_ group_param_[ i_b ] +#define c_ group_param_[ i_c ] +#define d_ group_param_[ i_d ] +#define tau_syn_ group_param_[ i_tau_syn ] +#define t_ref_ group_param_[ i_t_ref ] + +__global__ void +izhikevich_psc_exp_Update( int n_node, + int i_node_0, + float* var_arr, + float* param_arr, + int n_var, + int n_param, + float V_th, + float a, + float b, + float c, + float d, + int n_refractory_steps, + float h, + float C_syn ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V and u + else + { // neuron is not refractory, so evolve V and u float v_old = V_m; float u_old = u; - V_m += h*(0.04 * v_old * v_old + 5.0 * v_old + 140.0 - u_old - + I_syn + I_e); - u += 
h*a*(b*v_old - u_old); + V_m += h * ( 0.04 * v_old * v_old + 5.0 * v_old + 140.0 - u_old + I_syn + I_e ); + u += h * a * ( b * v_old - u_old ); } // exponential decaying PSC I_syn *= C_syn; - - if ( V_m >= V_th ) { // send spike - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m >= V_th ) + { // send spike + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m = c; u += d; // spike-driven adaptation refractory_step = n_refractory_steps; - if (refractory_step<0) { - refractory_step = 0; + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - izhikevich_psc_exp::~izhikevich_psc_exp() { FreeVarArr(); FreeParamArr(); } -int izhikevich_psc_exp::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +izhikevich_psc_exp::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_izhikevich_psc_exp_model; n_scal_var_ = N_SCAL_VAR; @@ -104,67 +114,67 @@ int izhikevich_psc_exp::Init(int i_node_0, int n_node, int /*n_port*/, n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); - group_param_ = new float[N_GROUP_PARAM]; + group_param_ = new float[ N_GROUP_PARAM ]; scal_var_name_ = izhikevich_psc_exp_scal_var_name; scal_param_name_ = izhikevich_psc_exp_scal_param_name; group_param_name_ = izhikevich_psc_exp_group_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA - SetScalParam(0, n_node, "den_delay", 0.0 ); // in ms - - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m", -70.0 ); // in mV - SetScalVar(0, n_node, "u", -70.0*0.2 ); - SetScalVar(0, n_node, "refractory_step", 0 ); - - SetGroupParam("V_th", 30.0); - SetGroupParam("a", 0.02); - SetGroupParam("b", 0.2); - SetGroupParam("c", -65.0); - SetGroupParam("d", 8.0); - SetGroupParam("tau_syn", 0.5); - SetGroupParam("t_ref", 0.0); + SetScalParam( 0, n_node, "I_e", 0.0 ); // in 
pA + SetScalParam( 0, n_node, "den_delay", 0.0 ); // in ms + + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m", -70.0 ); // in mV + SetScalVar( 0, n_node, "u", -70.0 * 0.2 ); + SetScalVar( 0, n_node, "refractory_step", 0 ); + + SetGroupParam( "V_th", 30.0 ); + SetGroupParam( "a", 0.02 ); + SetGroupParam( "b", 0.2 ); + SetGroupParam( "c", -65.0 ); + SetGroupParam( "d", 8.0 ); + SetGroupParam( "tau_syn", 0.5 ); + SetGroupParam( "t_ref", 0.0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int izhikevich_psc_exp::Update(long long it, double t1) +int +izhikevich_psc_exp::Update( long long it, double t1 ) { // std::cout << "izhikevich_psc_exp neuron update\n"; float h = time_resolution_; float C_syn = exp( -h / tau_syn_ ); - int n_refractory_steps = int(round(t_ref_ / h)); + int n_refractory_steps = int( round( t_ref_ / h ) ); + + izhikevich_psc_exp_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, V_th_, a_, b_, c_, d_, n_refractory_steps, h, C_syn ); + // gpuErrchk( cudaDeviceSynchronize() ); - izhikevich_psc_exp_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, - V_th_, a_, b_, c_, d_, n_refractory_steps, h, C_syn); - //gpuErrchk( cudaDeviceSynchronize() ); - 
return 0; } -int izhikevich_psc_exp::Free() +int +izhikevich_psc_exp::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); delete[] group_param_; - + return 0; } diff --git a/src/izhikevich_psc_exp.h b/src/izhikevich_psc_exp.h index f1324a3ad..43a8449a5 100644 --- a/src/izhikevich_psc_exp.h +++ b/src/izhikevich_psc_exp.h @@ -20,21 +20,15 @@ * */ - - - - - #ifndef IZHIKEVICHPSCEXP_H #define IZHIKEVICHPSCEXP_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include /* BeginUserDocs: neuron, integrate-and-fire @@ -90,87 +84,70 @@ References ++++++++++ .. [1] Izhikevich EM (2003). Simple model of spiking neurons. IEEE Transactions - on Neural Networks, 14:1569-1572. DOI: https://doi.org/10.1109/TNN.2003.820440 + on Neural Networks, 14:1569-1572. DOI: +https://doi.org/10.1109/TNN.2003.820440 EndUserDocs */ - namespace izhikevich_psc_exp_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m, // membrane potential +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. 
inputs + i_V_m, // membrane potential i_u, - i_refractory_step, // refractory step counter + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA i_den_delay, N_SCAL_PARAM }; -enum GroupParamIndexes { +enum GroupParamIndexes +{ i_V_th = 0, i_a, i_b, i_c, i_d, - i_tau_syn, // Time constant of synaptic current in ms - i_t_ref, // Refractory period in ms + i_tau_syn, // Time constant of synaptic current in ms + i_t_ref, // Refractory period in ms N_GROUP_PARAM }; +const std::string izhikevich_psc_exp_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m", "u", "refractory_step" }; - -const std::string izhikevich_psc_exp_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m", - "u", - "refractory_step" -}; - -const std::string izhikevich_psc_exp_scal_param_name[N_SCAL_PARAM] = { - "I_e", - "den_delay" -}; - -const std::string izhikevich_psc_exp_group_param_name[N_GROUP_PARAM] = { - "V_th", - "a", - "b", - "c", - "d", - "tau_syn", - "t_ref" -}; - -} // namespace - +const std::string izhikevich_psc_exp_scal_param_name[ N_SCAL_PARAM ] = { "I_e", "den_delay" }; +const std::string + izhikevich_psc_exp_group_param_name[ N_GROUP_PARAM ] = { "V_th", "a", "b", "c", "d", "tau_syn", "t_ref" }; +} // namespace izhikevich_psc_exp_ns class izhikevich_psc_exp : public BaseNeuron { float time_resolution_; - public: +public: ~izhikevich_psc_exp(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double /*time_min*/, float time_res) { + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int + Calibrate( double /*time_min*/, float time_res ) + { time_resolution_ = time_res; return 0; } - - int Update(long long it, double t1); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/izhikevich_psc_exp_2s.cu b/src/izhikevich_psc_exp_2s.cu index c5fdaf6e1..8b10dc257 100644 
--- a/src/izhikevich_psc_exp_2s.cu +++ b/src/izhikevich_psc_exp_2s.cu @@ -20,90 +20,102 @@ * */ - - - - -#include -#include -#include #include "izhikevich_psc_exp_2s.h" #include "spike_buffer.h" +#include +#include +#include using namespace izhikevich_psc_exp_2s_ns; extern __constant__ float NESTGPUTimeResolution; #define INTEGR_STEPS 2 -#define I_syn var[i_I_syn] -#define V_m var[i_V_m] -#define u var[i_u] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] -#define den_delay param[i_den_delay] - -#define V_th_ group_param_[i_V_th] -#define a_ group_param_[i_a] -#define b_ group_param_[i_b] -#define c_ group_param_[i_c] -#define d_ group_param_[i_d] -#define tau_syn_ group_param_[i_tau_syn] -#define t_ref_ group_param_[i_t_ref] - -__global__ void izhikevich_psc_exp_2s_Update -( int n_node, int i_node_0, float *var_arr, float *param_arr, int n_var, - int n_param, float V_th, float a, float b, float c, float d, - int n_refractory_steps, float h, float C_syn) +#define I_syn var[ i_I_syn ] +#define V_m var[ i_V_m ] +#define u var[ i_u ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] +#define den_delay param[ i_den_delay ] + +#define V_th_ group_param_[ i_V_th ] +#define a_ group_param_[ i_a ] +#define b_ group_param_[ i_b ] +#define c_ group_param_[ i_c ] +#define d_ group_param_[ i_d ] +#define tau_syn_ group_param_[ i_tau_syn ] +#define t_ref_ group_param_[ i_t_ref ] + +__global__ void +izhikevich_psc_exp_2s_Update( int n_node, + int i_node_0, + float* var_arr, + float* param_arr, + int n_var, + int n_param, + float V_th, + float a, + float b, + float c, + float d, + int n_refractory_steps, + float h, + float C_syn ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; - - 
for (int i=0; i= V_th ) { // send spike - PushSpike(i_node_0 + i_neuron, 1.0); + if ( V_m >= V_th ) + { // send spike + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m = c; u += d; // spike-driven adaptation refractory_step = n_refractory_steps; - if (refractory_step<0) { - refractory_step = 0; + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - izhikevich_psc_exp_2s::~izhikevich_psc_exp_2s() { FreeVarArr(); FreeParamArr(); } -int izhikevich_psc_exp_2s::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +izhikevich_psc_exp_2s::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_izhikevich_psc_exp_2s_model; n_scal_var_ = N_SCAL_VAR; @@ -111,67 +123,67 @@ int izhikevich_psc_exp_2s::Init(int i_node_0, int n_node, int /*n_port*/, n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); - group_param_ = new float[N_GROUP_PARAM]; + group_param_ = new float[ N_GROUP_PARAM ]; scal_var_name_ = izhikevich_psc_exp_2s_scal_var_name; scal_param_name_ = izhikevich_psc_exp_2s_scal_param_name; group_param_name_ = izhikevich_psc_exp_2s_group_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA - SetScalParam(0, n_node, "den_delay", 0.0 ); // in ms - - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m", -70.0 ); // in mV - SetScalVar(0, n_node, "u", -70.0*0.2 ); - SetScalVar(0, n_node, "refractory_step", 0 ); - - SetGroupParam("V_th", 30.0); - SetGroupParam("a", 0.02); - SetGroupParam("b", 0.2); - SetGroupParam("c", -65.0); - SetGroupParam("d", 8.0); - SetGroupParam("tau_syn", 2.0); - SetGroupParam("t_ref", 0.0); + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "den_delay", 0.0 ); // in ms + + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m", -70.0 ); // in mV + 
SetScalVar( 0, n_node, "u", -70.0 * 0.2 ); + SetScalVar( 0, n_node, "refractory_step", 0 ); + + SetGroupParam( "V_th", 30.0 ); + SetGroupParam( "a", 0.02 ); + SetGroupParam( "b", 0.2 ); + SetGroupParam( "c", -65.0 ); + SetGroupParam( "d", 8.0 ); + SetGroupParam( "tau_syn", 2.0 ); + SetGroupParam( "t_ref", 0.0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int izhikevich_psc_exp_2s::Update(long long it, double t1) +int +izhikevich_psc_exp_2s::Update( long long it, double t1 ) { // std::cout << "izhikevich_psc_exp_2s neuron update\n"; - float h = time_resolution_/INTEGR_STEPS; + float h = time_resolution_ / INTEGR_STEPS; float C_syn = exp( -h / tau_syn_ ); - int n_refractory_steps = int(round(t_ref_ / h)); + int n_refractory_steps = int( round( t_ref_ / h ) ); + + izhikevich_psc_exp_2s_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, V_th_, a_, b_, c_, d_, n_refractory_steps, h, C_syn ); + // gpuErrchk( cudaDeviceSynchronize() ); - izhikevich_psc_exp_2s_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, - V_th_, a_, b_, c_, d_, n_refractory_steps, h, C_syn); - //gpuErrchk( cudaDeviceSynchronize() ); - return 0; } -int izhikevich_psc_exp_2s::Free() +int +izhikevich_psc_exp_2s::Free() 
{ - FreeVarArr(); + FreeVarArr(); FreeParamArr(); delete[] group_param_; - + return 0; } diff --git a/src/izhikevich_psc_exp_2s.h b/src/izhikevich_psc_exp_2s.h index 1b2203e92..1f8f27b15 100644 --- a/src/izhikevich_psc_exp_2s.h +++ b/src/izhikevich_psc_exp_2s.h @@ -20,97 +20,74 @@ * */ - - - - - #ifndef IZHIKEVICHPSCEXP2S_H #define IZHIKEVICHPSCEXP2S_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include namespace izhikevich_psc_exp_2s_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m, // membrane potential +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. inputs + i_V_m, // membrane potential i_u, - i_refractory_step, // refractory step counter + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA i_den_delay, N_SCAL_PARAM }; -enum GroupParamIndexes { +enum GroupParamIndexes +{ i_V_th = 0, i_a, i_b, i_c, i_d, - i_tau_syn, // Time constant of synaptic current in ms - i_t_ref, // Refractory period in ms + i_tau_syn, // Time constant of synaptic current in ms + i_t_ref, // Refractory period in ms N_GROUP_PARAM }; +const std::string izhikevich_psc_exp_2s_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m", "u", "refractory_step" }; - -const std::string izhikevich_psc_exp_2s_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m", - "u", - "refractory_step" -}; - -const std::string izhikevich_psc_exp_2s_scal_param_name[N_SCAL_PARAM] = { - "I_e", - "den_delay" -}; - -const std::string izhikevich_psc_exp_2s_group_param_name[N_GROUP_PARAM] = { - "V_th", - "a", - "b", - "c", - "d", - "tau_syn", - "t_ref" -}; - -} // namespace - +const std::string izhikevich_psc_exp_2s_scal_param_name[ N_SCAL_PARAM ] = { "I_e", "den_delay" }; +const std::string + 
izhikevich_psc_exp_2s_group_param_name[ N_GROUP_PARAM ] = { "V_th", "a", "b", "c", "d", "tau_syn", "t_ref" }; +} // namespace izhikevich_psc_exp_2s_ns class izhikevich_psc_exp_2s : public BaseNeuron { float time_resolution_; - public: +public: ~izhikevich_psc_exp_2s(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double /*time_min*/, float time_res) { + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int + Calibrate( double /*time_min*/, float time_res ) + { time_resolution_ = time_res; return 0; } - - int Update(long long it, double t1); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/izhikevich_psc_exp_5s.cu b/src/izhikevich_psc_exp_5s.cu index 632fc5d98..9f8a7f9f2 100644 --- a/src/izhikevich_psc_exp_5s.cu +++ b/src/izhikevich_psc_exp_5s.cu @@ -20,90 +20,102 @@ * */ - - - - -#include -#include -#include #include "izhikevich_psc_exp_5s.h" #include "spike_buffer.h" +#include +#include +#include using namespace izhikevich_psc_exp_5s_ns; extern __constant__ float NESTGPUTimeResolution; #define INTEGR_STEPS 5 -#define I_syn var[i_I_syn] -#define V_m var[i_V_m] -#define u var[i_u] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] -#define den_delay param[i_den_delay] - -#define V_th_ group_param_[i_V_th] -#define a_ group_param_[i_a] -#define b_ group_param_[i_b] -#define c_ group_param_[i_c] -#define d_ group_param_[i_d] -#define tau_syn_ group_param_[i_tau_syn] -#define t_ref_ group_param_[i_t_ref] - -__global__ void izhikevich_psc_exp_5s_Update -( int n_node, int i_node_0, float *var_arr, float *param_arr, int n_var, - int n_param, float V_th, float a, float b, float c, float d, - int n_refractory_steps, float h, float C_syn) +#define I_syn var[ i_I_syn ] +#define V_m var[ i_V_m ] +#define u var[ i_u ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] +#define den_delay param[ i_den_delay ] + +#define 
V_th_ group_param_[ i_V_th ] +#define a_ group_param_[ i_a ] +#define b_ group_param_[ i_b ] +#define c_ group_param_[ i_c ] +#define d_ group_param_[ i_d ] +#define tau_syn_ group_param_[ i_tau_syn ] +#define t_ref_ group_param_[ i_t_ref ] + +__global__ void +izhikevich_psc_exp_5s_Update( int n_node, + int i_node_0, + float* var_arr, + float* param_arr, + int n_var, + int n_param, + float V_th, + float a, + float b, + float c, + float d, + int n_refractory_steps, + float h, + float C_syn ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; - - for (int i=0; i= V_th ) { // send spike - PushSpike(i_node_0 + i_neuron, 1.0); + if ( V_m >= V_th ) + { // send spike + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m = c; u += d; // spike-driven adaptation refractory_step = n_refractory_steps; - if (refractory_step<0) { - refractory_step = 0; + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - izhikevich_psc_exp_5s::~izhikevich_psc_exp_5s() { FreeVarArr(); FreeParamArr(); } -int izhikevich_psc_exp_5s::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +izhikevich_psc_exp_5s::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_izhikevich_psc_exp_5s_model; n_scal_var_ = N_SCAL_VAR; @@ -111,67 +123,67 @@ int izhikevich_psc_exp_5s::Init(int i_node_0, int n_node, int /*n_port*/, n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); - group_param_ = new float[N_GROUP_PARAM]; + group_param_ = new float[ N_GROUP_PARAM ]; scal_var_name_ = izhikevich_psc_exp_5s_scal_var_name; scal_param_name_ = 
izhikevich_psc_exp_5s_scal_param_name; group_param_name_ = izhikevich_psc_exp_5s_group_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA - SetScalParam(0, n_node, "den_delay", 0.0 ); // in ms - - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m", -70.0 ); // in mV - SetScalVar(0, n_node, "u", -70.0*0.2 ); - SetScalVar(0, n_node, "refractory_step", 0 ); - - SetGroupParam("V_th", 30.0); - SetGroupParam("a", 0.02); - SetGroupParam("b", 0.2); - SetGroupParam("c", -65.0); - SetGroupParam("d", 8.0); - SetGroupParam("tau_syn", 2.0); - SetGroupParam("t_ref", 0.0); + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "den_delay", 0.0 ); // in ms + + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m", -70.0 ); // in mV + SetScalVar( 0, n_node, "u", -70.0 * 0.2 ); + SetScalVar( 0, n_node, "refractory_step", 0 ); + + SetGroupParam( "V_th", 30.0 ); + SetGroupParam( "a", 0.02 ); + SetGroupParam( "b", 0.2 ); + SetGroupParam( "c", -65.0 ); + SetGroupParam( "d", 8.0 ); + SetGroupParam( "tau_syn", 2.0 ); + SetGroupParam( "t_ref", 0.0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int izhikevich_psc_exp_5s::Update(long long it, double t1) +int +izhikevich_psc_exp_5s::Update( long long it, double t1 ) { // std::cout << 
"izhikevich_psc_exp_5s neuron update\n"; - float h = time_resolution_/INTEGR_STEPS; + float h = time_resolution_ / INTEGR_STEPS; float C_syn = exp( -h / tau_syn_ ); - int n_refractory_steps = int(round(t_ref_ / h)); + int n_refractory_steps = int( round( t_ref_ / h ) ); + + izhikevich_psc_exp_5s_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, V_th_, a_, b_, c_, d_, n_refractory_steps, h, C_syn ); + // gpuErrchk( cudaDeviceSynchronize() ); - izhikevich_psc_exp_5s_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, - V_th_, a_, b_, c_, d_, n_refractory_steps, h, C_syn); - //gpuErrchk( cudaDeviceSynchronize() ); - return 0; } -int izhikevich_psc_exp_5s::Free() +int +izhikevich_psc_exp_5s::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); delete[] group_param_; - + return 0; } diff --git a/src/izhikevich_psc_exp_5s.h b/src/izhikevich_psc_exp_5s.h index 6ccdac815..31f48285a 100644 --- a/src/izhikevich_psc_exp_5s.h +++ b/src/izhikevich_psc_exp_5s.h @@ -20,97 +20,74 @@ * */ - - - - - #ifndef IZHIKEVICHPSCEXP5S_H #define IZHIKEVICHPSCEXP5S_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include namespace izhikevich_psc_exp_5s_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m, // membrane potential +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. 
inputs + i_V_m, // membrane potential i_u, - i_refractory_step, // refractory step counter + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA i_den_delay, N_SCAL_PARAM }; -enum GroupParamIndexes { +enum GroupParamIndexes +{ i_V_th = 0, i_a, i_b, i_c, i_d, - i_tau_syn, // Time constant of synaptic current in ms - i_t_ref, // Refractory period in ms + i_tau_syn, // Time constant of synaptic current in ms + i_t_ref, // Refractory period in ms N_GROUP_PARAM }; +const std::string izhikevich_psc_exp_5s_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m", "u", "refractory_step" }; - -const std::string izhikevich_psc_exp_5s_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m", - "u", - "refractory_step" -}; - -const std::string izhikevich_psc_exp_5s_scal_param_name[N_SCAL_PARAM] = { - "I_e", - "den_delay" -}; - -const std::string izhikevich_psc_exp_5s_group_param_name[N_GROUP_PARAM] = { - "V_th", - "a", - "b", - "c", - "d", - "tau_syn", - "t_ref" -}; - -} // namespace - +const std::string izhikevich_psc_exp_5s_scal_param_name[ N_SCAL_PARAM ] = { "I_e", "den_delay" }; +const std::string + izhikevich_psc_exp_5s_group_param_name[ N_GROUP_PARAM ] = { "V_th", "a", "b", "c", "d", "tau_syn", "t_ref" }; +} // namespace izhikevich_psc_exp_5s_ns class izhikevich_psc_exp_5s : public BaseNeuron { float time_resolution_; - public: +public: ~izhikevich_psc_exp_5s(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double /*time_min*/, float time_res) { + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int + Calibrate( double /*time_min*/, float time_res ) + { time_resolution_ = time_res; return 0; } - - int Update(long long it, double t1); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/mpi_comm.cu b/src/mpi_comm.cu index 05eb4fd41..2098e5be5 100644 
--- a/src/mpi_comm.cu +++ b/src/mpi_comm.cu @@ -22,180 +22,194 @@ #include +#include #include #include -#include -#include "nestgpu.h" #include "cuda_error.h" #include "getRealTime.h" +#include "nestgpu.h" +#include "mpi_comm.h" #include "remote_connect.h" #include "remote_spike.h" -#include "mpi_comm.h" #ifdef HAVE_MPI #include -MPI_Request *recv_mpi_request; +MPI_Request* recv_mpi_request; #endif // Send spikes to remote MPI processes -int NESTGPU::SendSpikeToRemote(int n_ext_spikes) +int +NESTGPU::SendSpikeToRemote( int n_ext_spikes ) { #ifdef HAVE_MPI MPI_Request request; int mpi_id, tag = 1; // id is already in the class, can be removed - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_id); + MPI_Comm_rank( MPI_COMM_WORLD, &mpi_id ); double time_mark = getRealTime(); - gpuErrchk(cudaMemcpy(h_ExternalTargetSpikeNum, d_ExternalTargetSpikeNum, - n_hosts_*sizeof(int), cudaMemcpyDeviceToHost)); - SendSpikeToRemote_CUDAcp_time_ += (getRealTime() - time_mark); - + gpuErrchk( cudaMemcpy( + h_ExternalTargetSpikeNum, d_ExternalTargetSpikeNum, n_hosts_ * sizeof( int ), cudaMemcpyDeviceToHost ) ); + SendSpikeToRemote_CUDAcp_time_ += ( getRealTime() - time_mark ); + time_mark = getRealTime(); int n_spike_tot = 0; // copy spikes from GPU to CPU memory - if (n_ext_spikes > 0) { - gpuErrchk(cudaMemcpy(&n_spike_tot, d_ExternalTargetSpikeIdx0 + n_hosts_, - sizeof(int), cudaMemcpyDeviceToHost)); - if (n_spike_tot >= max_remote_spike_num_) { - throw ngpu_exception - (std::string("Number of spikes to be sent remotely ") - + std::to_string(n_spike_tot) - + " larger than limit " + std::to_string(max_remote_spike_num_)); + if ( n_ext_spikes > 0 ) + { + gpuErrchk( + cudaMemcpy( &n_spike_tot, d_ExternalTargetSpikeIdx0 + n_hosts_, sizeof( int ), cudaMemcpyDeviceToHost ) ); + if ( n_spike_tot >= max_remote_spike_num_ ) + { + throw ngpu_exception( std::string( "Number of spikes to be sent remotely " ) + std::to_string( n_spike_tot ) + + " larger than limit " + std::to_string( max_remote_spike_num_ ) 
); } - - gpuErrchk(cudaMemcpy(h_ExternalTargetSpikeNodeId, - d_ExternalTargetSpikeNodeId, - n_spike_tot*sizeof(int), - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(h_ExternalTargetSpikeIdx0, - d_ExternalTargetSpikeIdx0, - (n_hosts_ + 1)*sizeof(int), - cudaMemcpyDeviceToHost)); + + gpuErrchk( cudaMemcpy( + h_ExternalTargetSpikeNodeId, d_ExternalTargetSpikeNodeId, n_spike_tot * sizeof( int ), cudaMemcpyDeviceToHost ) ); + gpuErrchk( cudaMemcpy( h_ExternalTargetSpikeIdx0, + d_ExternalTargetSpikeIdx0, + ( n_hosts_ + 1 ) * sizeof( int ), + cudaMemcpyDeviceToHost ) ); } - else { - for (int i=0; iremoteConnectionMapInit(); + recv_mpi_request = new MPI_Request[ n_hosts_ ]; + return 0; #else - throw ngpu_exception("MPI is not available in your build"); + throw ngpu_exception( "MPI is not available in your build" ); #endif } - -int NESTGPU::MpiFinalize() +int +NESTGPU::MpiFinalize() { #ifdef HAVE_MPI - if (mpi_flag_) { + if ( mpi_flag_ ) + { int finalized; - MPI_Finalized(&finalized); - if (!finalized) { + MPI_Finalized( &finalized ); + if ( !finalized ) + { MPI_Finalize(); } } - + return 0; #else - throw ngpu_exception("MPI is not available in your build"); + throw ngpu_exception( "MPI is not available in your build" ); #endif } - diff --git a/src/multimeter.cu b/src/multimeter.cu index 6989f8130..24a0e2bda 100644 --- a/src/multimeter.cu +++ b/src/multimeter.cu @@ -20,144 +20,169 @@ * */ - - - - +#include "cuda_error.h" +#include "multimeter.h" #include #include #include -#include "multimeter.h" -#include "cuda_error.h" -const std::string SpikeVarName = "spike"; - -Record::Record(std::vector neur_vect, std::string file_name, - std::vector var_name_vect, - std::vector i_neur_vect, std::vector port_vect): - neuron_vect_(neur_vect), file_name_(file_name), - var_name_vect_(var_name_vect), - i_neuron_vect_(i_neur_vect), - port_vect_(port_vect) +const std::string SpikeVarName = "spike"; + +Record::Record( std::vector< BaseNeuron* > neur_vect, + std::string file_name, + 
std::vector< std::string > var_name_vect, + std::vector< int > i_neur_vect, + std::vector< int > port_vect ) + : neuron_vect_( neur_vect ) + , file_name_( file_name ) + , var_name_vect_( var_name_vect ) + , i_neuron_vect_( i_neur_vect ) + , port_vect_( port_vect ) { data_vect_flag_ = true; - if (file_name=="") { + if ( file_name == "" ) + { out_file_flag_ = false; - } else { + } + else + { out_file_flag_ = true; } var_pt_vect_.clear(); - for (unsigned int i=0; iGetVarPt(i_neur_vect[i], var_name_vect[i], - port_vect[i]); - var_pt_vect_.push_back(var_pt); + for ( unsigned int i = 0; i < var_name_vect.size(); i++ ) + { + if ( var_name_vect[ i ] != SpikeVarName ) + { + float* var_pt = neur_vect[ i ]->GetVarPt( i_neur_vect[ i ], var_name_vect[ i ], port_vect[ i ] ); + var_pt_vect_.push_back( var_pt ); } - else { - var_pt_vect_.push_back(NULL); + else + { + var_pt_vect_.push_back( nullptr ); } } } -int Record::OpenFile() +int +Record::OpenFile() { - fp_=fopen(file_name_.c_str(), "w"); - + fp_ = fopen( file_name_.c_str(), "w" ); + return 0; } -int Record::CloseFile() +int +Record::CloseFile() { - fclose(fp_); - + fclose( fp_ ); + return 0; } -int Record::WriteRecord(float t) +int +Record::WriteRecord( float t ) { float var; - std::vector vect; - - if (out_file_flag_) { - fprintf(fp_,"%f", t); + std::vector< float > vect; + + if ( out_file_flag_ ) + { + fprintf( fp_, "%f", t ); } - if (data_vect_flag_) { - vect.push_back(t); + if ( data_vect_flag_ ) + { + vect.push_back( t ); } - for (unsigned int i=0; iGetSpikeActivity(i_neuron_vect_[i]); + else + { + var = neuron_vect_[ i ]->GetSpikeActivity( i_neuron_vect_[ i ] ); } - if (out_file_flag_) { - fprintf(fp_,"\t%f", var); + if ( out_file_flag_ ) + { + fprintf( fp_, "\t%f", var ); } - if (data_vect_flag_) { - vect.push_back(var); + if ( data_vect_flag_ ) + { + vect.push_back( var ); } } - if (out_file_flag_) { - fprintf(fp_,"\n"); + if ( out_file_flag_ ) + { + fprintf( fp_, "\n" ); } - if (data_vect_flag_) { - 
data_vect_.push_back(vect); + if ( data_vect_flag_ ) + { + data_vect_.push_back( vect ); } return 0; } -int Multimeter::CreateRecord(std::vector neur_vect, - std::string file_name, - std::vector var_name_vect, - std::vector i_neur_vect, - std::vector port_vect) +int +Multimeter::CreateRecord( std::vector< BaseNeuron* > neur_vect, + std::string file_name, + std::vector< std::string > var_name_vect, + std::vector< int > i_neur_vect, + std::vector< int > port_vect ) { - Record record(neur_vect, file_name, var_name_vect, i_neur_vect, - port_vect); - record_vect_.push_back(record); + Record record( neur_vect, file_name, var_name_vect, i_neur_vect, port_vect ); + record_vect_.push_back( record ); - return (record_vect_.size() - 1); + return ( record_vect_.size() - 1 ); } -int Multimeter::OpenFiles() +int +Multimeter::OpenFiles() { - for (unsigned int i=0; i > *Multimeter::GetRecordData(int i_record) +std::vector< std::vector< float > >* +Multimeter::GetRecordData( int i_record ) { - if (i_record<0 || i_record>=(int)record_vect_.size()) { - throw ngpu_exception("Record does not exist."); + if ( i_record < 0 || i_record >= ( int ) record_vect_.size() ) + { + throw ngpu_exception( "Record does not exist." 
); } - - return &record_vect_[i_record].data_vect_; + + return &record_vect_[ i_record ].data_vect_; } diff --git a/src/multimeter.h b/src/multimeter.h index eabc060e1..431495238 100644 --- a/src/multimeter.h +++ b/src/multimeter.h @@ -20,17 +20,12 @@ * */ - - - - #ifndef MULTIMETER_H #define MULTIMETER_H +#include "base_neuron.h" #include #include #include -#include "base_neuron.h" - /* BeginUserDocs: device, recorder @@ -84,7 +79,7 @@ can also be retreived through the commands ``GetRecordDataRows`` and print("recorder has {} rows and {} columns".format(rows, columns)) recorded_data = nestgpu.GetRecordData(record) - + time = [row[0] for row in recorded_data] variable = [row[1] for row in recorded_data] @@ -94,52 +89,50 @@ See also EndUserDocs */ - - class Record { - public: +public: bool data_vect_flag_; bool out_file_flag_; - std::vector > data_vect_; - std::vector neuron_vect_; + std::vector< std::vector< float > > data_vect_; + std::vector< BaseNeuron* > neuron_vect_; std::string file_name_; - std::vector var_name_vect_; - std::vector i_neuron_vect_; - std::vector port_vect_; - std::vector var_pt_vect_; - FILE *fp_; - - Record(std::vector neur_vect, std::string file_name, - std::vector var_name_vect, - std::vector i_neur_vect, std::vector port_vect); + std::vector< std::string > var_name_vect_; + std::vector< int > i_neuron_vect_; + std::vector< int > port_vect_; + std::vector< float* > var_pt_vect_; + FILE* fp_; + + Record( std::vector< BaseNeuron* > neur_vect, + std::string file_name, + std::vector< std::string > var_name_vect, + std::vector< int > i_neur_vect, + std::vector< int > port_vect ); int OpenFile(); - + int CloseFile(); - - int WriteRecord(float t); + int WriteRecord( float t ); }; - + class Multimeter { - public: - std::vector record_vect_; - - int CreateRecord(std::vector neur_vect, - std::string file_name, - std::vector var_name_vect, - std::vector i_neur_vect, - std::vector port_vect); +public: + std::vector< Record > record_vect_; + + int 
CreateRecord( std::vector< BaseNeuron* > neur_vect, + std::string file_name, + std::vector< std::string > var_name_vect, + std::vector< int > i_neur_vect, + std::vector< int > port_vect ); int OpenFiles(); int CloseFiles(); - int WriteRecords(float t); + int WriteRecords( float t ); - std::vector > *GetRecordData(int i_record); - + std::vector< std::vector< float > >* GetRecordData( int i_record ); }; #endif diff --git a/src/nested_loop.cu b/src/nested_loop.cu index 24a03109f..2290bcc84 100644 --- a/src/nested_loop.cu +++ b/src/nested_loop.cu @@ -24,15 +24,14 @@ #include #include - //#include "cuda_error_nl.h" +// #include "cuda_error_nl.h" #include "cuda_error.h" -#include "utilities.h" -#include "syn_model.h" #include "nested_loop.h" +#include "syn_model.h" +#include "utilities.h" const int Ny_arr_size_ = 24; -int Ny_th_arr_[] = { - 355375, +int Ny_th_arr_[] = { 355375, 215546, 48095, 29171, @@ -55,105 +54,107 @@ int Ny_th_arr_[] = { 119, 72, 72, - 72 -}; + 72 }; namespace NestedLoop { - //#include "Ny_th.h" - void *d_sort_storage_; - size_t sort_storage_bytes_; - void *d_reduce_storage_; - size_t reduce_storage_bytes_; - - int Nx_max_; - int *d_max_Ny_; - int *d_sorted_Ny_; - - int *d_idx_; - int *d_sorted_idx_; - - int block_dim_x_; - int block_dim_y_; - int frame_area_; - float x_lim_; -} - -//TMP +// #include "Ny_th.h" +void* d_sort_storage_; +size_t sort_storage_bytes_; +void* d_reduce_storage_; +size_t reduce_storage_bytes_; + +int Nx_max_; +int* d_max_Ny_; +int* d_sorted_Ny_; + +int* d_idx_; +int* d_sorted_idx_; + +int block_dim_x_; +int block_dim_y_; +int frame_area_; +float x_lim_; +} // namespace NestedLoop + +// TMP #include "getRealTime.h" // ////////////////////////////////////////////////////////////////////// -// declare here the functions called by the nested loop +// declare here the functions called by the nested loop //__device__ void NestedLoopFunction0(int ix, int iy); //__device__ void NestedLoopFunction1(int ix, int iy); 
////////////////////////////////////////////////////////////////////// extern __constant__ long long NESTGPUTimeIdx; extern __constant__ float NESTGPUTimeResolution; extern __constant__ NodeGroupStruct NodeGroupArray[]; -extern __device__ int16_t *NodeGroupMap; - - +extern __device__ int16_t* NodeGroupMap; namespace NestedLoop { - int *d_Ny_cumul_sum_; - PrefixScan prefix_scan_; -} +int* d_Ny_cumul_sum_; +PrefixScan prefix_scan_; +} // namespace NestedLoop ////////////////////////////////////////////////////////////////////// -int NestedLoop::Init() +int +NestedLoop::Init() { - //return Init(65536*1024); - return Init(128*1024); + // return Init(65536*1024); + return Init( 128 * 1024 ); } ////////////////////////////////////////////////////////////////////// -int NestedLoop::Init(int Nx_max) +int +NestedLoop::Init( int Nx_max ) { - //prefix_scan_.Init(); - CUDAMALLOCCTRL("&d_Ny_cumul_sum_",&d_Ny_cumul_sum_, - PrefixScan::AllocSize*sizeof(int)); + // prefix_scan_.Init(); + CUDAMALLOCCTRL( "&d_Ny_cumul_sum_", &d_Ny_cumul_sum_, PrefixScan::AllocSize * sizeof( int ) ); - if (Nx_max <= 0) return 0; + if ( Nx_max <= 0 ) + { + return 0; + } block_dim_x_ = 32; block_dim_y_ = 32; - frame_area_ = 65536*64; + frame_area_ = 65536 * 64; x_lim_ = 0.75; Nx_max_ = Nx_max; - CUDAMALLOCCTRL("&d_max_Ny_",&d_max_Ny_, sizeof(int)); - CUDAMALLOCCTRL("&d_sorted_Ny_",&d_sorted_Ny_, Nx_max*sizeof(int)); - CUDAMALLOCCTRL("&d_idx_",&d_idx_, Nx_max*sizeof(int)); - CUDAMALLOCCTRL("&d_sorted_idx_",&d_sorted_idx_, Nx_max*sizeof(int)); - - int *h_idx = new int[Nx_max]; - for(int i=0; i// + cub::DeviceRadixSort::SortPairs( + d_sort_storage_, sort_storage_bytes_, d_sorted_Ny_, d_sorted_Ny_, d_idx_, d_sorted_idx_, Nx_max ); + //// + // Determine temporary device storage requirements for Reduce - d_reduce_storage_ = NULL; + d_reduce_storage_ = nullptr; reduce_storage_bytes_ = 0; - int *d_Ny = NULL; - cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_, d_Ny, - d_max_Ny_, Nx_max); + int* 
d_Ny = nullptr; + //// + cub::DeviceReduce::Max( d_reduce_storage_, reduce_storage_bytes_, d_Ny, d_max_Ny_, Nx_max ); + //// // Allocate temporary storage - CUDAMALLOCCTRL("&d_sort_storage_",&d_sort_storage_, sort_storage_bytes_); - CUDAMALLOCCTRL("&d_reduce_storage_",&d_reduce_storage_, reduce_storage_bytes_); + CUDAMALLOCCTRL( "&d_sort_storage_", &d_sort_storage_, sort_storage_bytes_ ); + CUDAMALLOCCTRL( "&d_reduce_storage_", &d_reduce_storage_, reduce_storage_bytes_ ); return 0; } - - diff --git a/src/nested_loop.h b/src/nested_loop.h index 2d149a818..5b24a9120 100644 --- a/src/nested_loop.h +++ b/src/nested_loop.h @@ -23,17 +23,17 @@ #ifndef NESTED_LOOP_H #define NESTED_LOOP_H -#include #include "cuda_error.h" -#include "prefix_scan.h" #include "get_spike.h" +#include "prefix_scan.h" #include "rev_spike.h" - +// #include extern const int Ny_arr_size_; extern int Ny_th_arr_[]; -enum NestedLoopAlgo { +enum NestedLoopAlgo +{ BlockStepNestedLoopAlgo, CumulSumNestedLoopAlgo, SimpleNestedLoopAlgo, @@ -46,586 +46,669 @@ enum NestedLoopAlgo { }; ////////////////////////////////////////////////////////////////////// -template -__global__ void BlockStepNestedLoopKernel(int Nx, int *Ny) +template < int i_func > +__global__ void +BlockStepNestedLoopKernel( int Nx, int* Ny ) { - const int ix = blockIdx.x; - if (ix < Nx) { - const int ny = Ny[ix]; - for (int iy = threadIdx.x; iy < ny; iy += blockDim.x){ - NestedLoopFunction(ix, iy); + const int ix = blockIdx.x; + if ( ix < Nx ) + { + const int ny = Ny[ ix ]; + for ( int iy = threadIdx.x; iy < ny; iy += blockDim.x ) + { + NestedLoopFunction< i_func >( ix, iy ); } } } -template -__global__ void CumulSumNestedLoopKernel(int Nx, int *Ny_cumul_sum, - int Ny_sum) +template < int i_func > +__global__ void +CumulSumNestedLoopKernel( int Nx, int* Ny_cumul_sum, int Ny_sum ) { - int blockId = blockIdx.y * gridDim.x + blockIdx.x; + int blockId = blockIdx.y * gridDim.x + blockIdx.x; int array_idx = blockId * blockDim.x + threadIdx.x; - 
if (array_idx(ix, iy); + if ( array_idx < Ny_sum ) + { + int ix = locate( array_idx, Ny_cumul_sum, Nx + 1 ); + int iy = ( int ) ( array_idx - Ny_cumul_sum[ ix ] ); + NestedLoopFunction< i_func >( ix, iy ); } } ////////////////////////////////////////////////////////////////////// -template -__global__ void SimpleNestedLoopKernel(int Nx, int *Ny) +template < int i_func > +__global__ void +SimpleNestedLoopKernel( int Nx, int* Ny ) { - int ix = (blockIdx.x * blockDim.x) + threadIdx.x; - int iy = (blockIdx.y * blockDim.y) + threadIdx.y; - if (ix(ix, iy); + int ix = ( blockIdx.x * blockDim.x ) + threadIdx.x; + int iy = ( blockIdx.y * blockDim.y ) + threadIdx.y; + if ( ix < Nx && iy < Ny[ ix ] ) + { + NestedLoopFunction< i_func >( ix, iy ); } } ////////////////////////////////////////////////////////////////////// -template -__global__ void ParallelInnerNestedLoopKernel(int ix, int Ny) +template < int i_func > +__global__ void +ParallelInnerNestedLoopKernel( int ix, int Ny ) { int iy = threadIdx.x + blockIdx.x * blockDim.x; - if (iy(ix, iy); + if ( iy < Ny ) + { + NestedLoopFunction< i_func >( ix, iy ); } } ////////////////////////////////////////////////////////////////////// -template -__global__ void ParallelOuterNestedLoopKernel(int Nx, int *d_Ny) +template < int i_func > +__global__ void +ParallelOuterNestedLoopKernel( int Nx, int* d_Ny ) { int ix = threadIdx.x + blockIdx.x * blockDim.x; - if (ix(ix, iy); + if ( ix < Nx ) + { + for ( int iy = 0; iy < d_Ny[ ix ]; iy++ ) + { + NestedLoopFunction< i_func >( ix, iy ); } } } - ////////////////////////////////////////////////////////////////////// -template -__global__ void Frame1DNestedLoopKernel(int ix0, int dim_x, int dim_y, - int *sorted_idx, int *sorted_Ny) +template < int i_func > +__global__ void +Frame1DNestedLoopKernel( int ix0, int dim_x, int dim_y, int* sorted_idx, int* sorted_Ny ) { int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - if (array_idx(sorted_idx[ix], iy); + NestedLoopFunction< i_func >( 
sorted_idx[ ix ], iy ); } } } ////////////////////////////////////////////////////////////////////// -template -__global__ void Frame2DNestedLoopKernel(int ix0, int dim_x, int dim_y, - int *sorted_idx, int *sorted_Ny) +template < int i_func > +__global__ void +Frame2DNestedLoopKernel( int ix0, int dim_x, int dim_y, int* sorted_idx, int* sorted_Ny ) { - int ix = (blockIdx.x * blockDim.x) + threadIdx.x; - int iy = (blockIdx.y * blockDim.y) + threadIdx.y; - if (ix(sorted_idx[ix+ix0], iy); + NestedLoopFunction< i_func >( sorted_idx[ ix + ix0 ], iy ); } } ////////////////////////////////////////////////////////////////////// -template -__global__ void Smart1DNestedLoopKernel(int ix0, int iy0, int dim_x, int dim_y, - int *sorted_idx, int *sorted_Ny) +template < int i_func > +__global__ void +Smart1DNestedLoopKernel( int ix0, int iy0, int dim_x, int dim_y, int* sorted_idx, int* sorted_Ny ) { int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - if (array_idx(sorted_idx[ix], iy); + NestedLoopFunction< i_func >( sorted_idx[ ix ], iy ); } } } ////////////////////////////////////////////////////////////////////// -template -__global__ void Smart2DNestedLoopKernel(int ix0, int iy0, int dim_x, - int dim_y, int *sorted_idx, - int *sorted_Ny) +template < int i_func > +__global__ void +Smart2DNestedLoopKernel( int ix0, int iy0, int dim_x, int dim_y, int* sorted_idx, int* sorted_Ny ) { - int ix = (blockIdx.x * blockDim.x) + threadIdx.x; - int iy = iy0 + (blockIdx.y * blockDim.y) + threadIdx.y; - if (ix(sorted_idx[ix+ix0], iy); + NestedLoopFunction< i_func >( sorted_idx[ ix + ix0 ], iy ); } } +namespace NestedLoop +{ +extern void* d_sort_storage_; +extern size_t sort_storage_bytes_; +extern void* d_reduce_storage_; +extern size_t reduce_storage_bytes_; +extern int Nx_max_; +extern int* d_max_Ny_; +extern int* d_sorted_Ny_; +extern int* d_idx_; +extern int* d_sorted_idx_; +extern int block_dim_x_; +extern int block_dim_y_; +extern int frame_area_; +extern float x_lim_; -namespace 
NestedLoop -{ - extern void *d_sort_storage_; - extern size_t sort_storage_bytes_; - extern void *d_reduce_storage_; - extern size_t reduce_storage_bytes_; - - extern int Nx_max_; - extern int *d_max_Ny_; - extern int *d_sorted_Ny_; +extern int* d_Ny_cumul_sum_; - extern int *d_idx_; - extern int *d_sorted_idx_; +extern PrefixScan prefix_scan_; - extern int block_dim_x_; - extern int block_dim_y_; - extern int frame_area_; - extern float x_lim_; - - extern int *d_Ny_cumul_sum_; +int Init(); - extern PrefixScan prefix_scan_; - - int Init(); +int Init( int Nx_max ); - int Init(int Nx_max); +template < int i_func > +int Run( int nested_loop_algo, int Nx, int* d_Ny ); - template - int Run(int nested_loop_algo, int Nx, int *d_Ny); +template < int i_func > +int BlockStepNestedLoop( int Nx, int* d_Ny ); - template - int BlockStepNestedLoop(int Nx, int *d_Ny); - - template - int CumulSumNestedLoop(int Nx, int *d_Ny); +template < int i_func > +int CumulSumNestedLoop( int Nx, int* d_Ny ); - template - int SimpleNestedLoop(int Nx, int *d_Ny); +template < int i_func > +int SimpleNestedLoop( int Nx, int* d_Ny ); - template - int SimpleNestedLoop(int Nx, int *d_Ny, int max_Ny); +template < int i_func > +int SimpleNestedLoop( int Nx, int* d_Ny, int max_Ny ); - template - int ParallelInnerNestedLoop(int Nx, int *d_Ny); +template < int i_func > +int ParallelInnerNestedLoop( int Nx, int* d_Ny ); - template - int ParallelOuterNestedLoop(int Nx, int *d_Ny); +template < int i_func > +int ParallelOuterNestedLoop( int Nx, int* d_Ny ); - template - int Frame1DNestedLoop(int Nx, int *d_Ny); +template < int i_func > +int Frame1DNestedLoop( int Nx, int* d_Ny ); - template - int Frame2DNestedLoop(int Nx, int *d_Ny); +template < int i_func > +int Frame2DNestedLoop( int Nx, int* d_Ny ); - template - int Smart1DNestedLoop(int Nx, int *d_Ny); +template < int i_func > +int Smart1DNestedLoop( int Nx, int* d_Ny ); - template - int Smart2DNestedLoop(int Nx, int *d_Ny); +template < int i_func > +int 
Smart2DNestedLoop( int Nx, int* d_Ny ); - int Free(); -} +int Free(); +} // namespace NestedLoop ////////////////////////////////////////////////////////////////////// -template -int NestedLoop::Run(int nested_loop_algo, int Nx, int *d_Ny) +template < int i_func > +int +NestedLoop::Run( int nested_loop_algo, int Nx, int* d_Ny ) { - switch(nested_loop_algo) { + switch ( nested_loop_algo ) + { case BlockStepNestedLoopAlgo: - return BlockStepNestedLoop(Nx, d_Ny); + return BlockStepNestedLoop< i_func >( Nx, d_Ny ); break; case CumulSumNestedLoopAlgo: - return CumulSumNestedLoop(Nx, d_Ny); + return CumulSumNestedLoop< i_func >( Nx, d_Ny ); break; case SimpleNestedLoopAlgo: - return SimpleNestedLoop(Nx, d_Ny); + return SimpleNestedLoop< i_func >( Nx, d_Ny ); break; case ParallelInnerNestedLoopAlgo: - return ParallelInnerNestedLoop(Nx, d_Ny); + return ParallelInnerNestedLoop< i_func >( Nx, d_Ny ); break; case ParallelOuterNestedLoopAlgo: - return ParallelOuterNestedLoop(Nx, d_Ny); + return ParallelOuterNestedLoop< i_func >( Nx, d_Ny ); break; case Frame1DNestedLoopAlgo: - return Frame1DNestedLoop(Nx, d_Ny); + return Frame1DNestedLoop< i_func >( Nx, d_Ny ); break; case Frame2DNestedLoopAlgo: - return Frame2DNestedLoop(Nx, d_Ny); + return Frame2DNestedLoop< i_func >( Nx, d_Ny ); break; case Smart1DNestedLoopAlgo: - return Smart1DNestedLoop(Nx, d_Ny); + return Smart1DNestedLoop< i_func >( Nx, d_Ny ); break; case Smart2DNestedLoopAlgo: - return Smart2DNestedLoop(Nx, d_Ny); + return Smart2DNestedLoop< i_func >( Nx, d_Ny ); break; default: return -1; } } - ////////////////////////////////////////////////////////////////////// -template -int NestedLoop::BlockStepNestedLoop(int Nx, int *d_Ny) +template < int i_func > +int +NestedLoop::BlockStepNestedLoop( int Nx, int* d_Ny ) { - BlockStepNestedLoopKernel<<>>(Nx, d_Ny); - gpuErrchk(cudaPeekAtLastError()); - //gpuErrchk(cudaDeviceSynchronize()); - + BlockStepNestedLoopKernel< i_func > <<< Nx, 1024 >>>( Nx, d_Ny ); + gpuErrchk( 
cudaPeekAtLastError() ); + // gpuErrchk(cudaDeviceSynchronize()); + return 0; } ////////////////////////////////////////////////////////////////////// -template -int NestedLoop::SimpleNestedLoop(int Nx, int *d_Ny) +template < int i_func > +int +NestedLoop::SimpleNestedLoop( int Nx, int* d_Ny ) { // Find max value of Ny - cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_, d_Ny, - d_max_Ny_, Nx); + //// + cub::DeviceReduce::Max( d_reduce_storage_, reduce_storage_bytes_, d_Ny, d_max_Ny_, Nx ); + //// + int max_Ny; - gpuErrchk(cudaMemcpy(&max_Ny, d_max_Ny_, sizeof(int), - cudaMemcpyDeviceToHost)); - return SimpleNestedLoop(Nx, d_Ny, max_Ny); + gpuErrchk( cudaMemcpy( &max_Ny, d_max_Ny_, sizeof( int ), cudaMemcpyDeviceToHost ) ); + return SimpleNestedLoop< i_func >( Nx, d_Ny, max_Ny ); } ////////////////////////////////////////////////////////////////////// -template -int NestedLoop::SimpleNestedLoop(int Nx, int *d_Ny, int max_Ny) +template < int i_func > +int +NestedLoop::SimpleNestedLoop( int Nx, int* d_Ny, int max_Ny ) { - if (max_Ny < 1) max_Ny = 1; - dim3 threadsPerBlock(block_dim_x_, block_dim_y_); // block size - dim3 numBlocks((Nx - 1)/threadsPerBlock.x + 1, - (max_Ny - 1)/threadsPerBlock.y + 1); - SimpleNestedLoopKernel <<>>(Nx, d_Ny); - gpuErrchk(cudaPeekAtLastError()); - //gpuErrchk(cudaDeviceSynchronize()); + if ( max_Ny < 1 ) + { + max_Ny = 1; + } + dim3 threadsPerBlock( block_dim_x_, block_dim_y_ ); // block size + dim3 numBlocks( ( Nx - 1 ) / threadsPerBlock.x + 1, ( max_Ny - 1 ) / threadsPerBlock.y + 1 ); + SimpleNestedLoopKernel< i_func > <<< numBlocks, threadsPerBlock>>>( Nx, d_Ny ); + gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk(cudaDeviceSynchronize()); return 0; } ////////////////////////////////////////////////////////////////////// -template -int NestedLoop::ParallelInnerNestedLoop(int Nx, int *d_Ny) +template < int i_func > +int +NestedLoop::ParallelInnerNestedLoop( int Nx, int* d_Ny ) { - int h_Ny[Nx]; - 
gpuErrchk(cudaMemcpy(h_Ny, d_Ny, Nx*sizeof(int), - cudaMemcpyDeviceToHost)); - for (int ix=0; ix<<<(Ny+1023)/1024, 1024>>>(ix, Ny); + int h_Ny[ Nx ]; + gpuErrchk( cudaMemcpy( h_Ny, d_Ny, Nx * sizeof( int ), cudaMemcpyDeviceToHost ) ); + for ( int ix = 0; ix < Nx; ix++ ) + { + int Ny = h_Ny[ ix ]; + ParallelInnerNestedLoopKernel< i_func > <<< ( Ny + 1023 ) / 1024, 1024 >>>( ix, Ny ); // gpuErrchk(cudaPeekAtLastError()); // uncomment only for debugging // gpuErrchk(cudaDeviceSynchronize()); // uncomment only for debugging } - gpuErrchk(cudaPeekAtLastError()); - //gpuErrchk(cudaDeviceSynchronize()); + gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk(cudaDeviceSynchronize()); return 0; } ////////////////////////////////////////////////////////////////////// -template -int NestedLoop::ParallelOuterNestedLoop(int Nx, int *d_Ny) +template < int i_func > +int +NestedLoop::ParallelOuterNestedLoop( int Nx, int* d_Ny ) { - ParallelOuterNestedLoopKernel<<<(Nx+1023)/1024, 1024>>>(Nx, d_Ny); - gpuErrchk(cudaPeekAtLastError()); - //gpuErrchk(cudaDeviceSynchronize()); - + ParallelOuterNestedLoopKernel< i_func > <<< ( Nx + 1023 ) / 1024, 1024 >>>( Nx, d_Ny ); + gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk(cudaDeviceSynchronize()); + return 0; } ////////////////////////////////////////////////////////////////////// -template -int NestedLoop::Frame1DNestedLoop(int Nx, int *d_Ny) +template < int i_func > +int +NestedLoop::Frame1DNestedLoop( int Nx, int* d_Ny ) { - if (Nx <= 0) return 0; + if ( Nx <= 0 ) + { + return 0; + } int dim_x, dim_y; // Run sorting operation - cub::DeviceRadixSort::SortPairs(d_sort_storage_, sort_storage_bytes_, - d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_, - Nx); - + //// + cub::DeviceRadixSort::SortPairs( + d_sort_storage_, sort_storage_bytes_, d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_, Nx ); + //// + int ix0 = Nx; - while(ix0>0) { - gpuErrchk(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int), - cudaMemcpyDeviceToHost)); - if (dim_y < 1) dim_y = 1; 
- dim_x = (frame_area_ - 1) / dim_y + 1; + while ( ix0 > 0 ) + { + gpuErrchk( cudaMemcpy( &dim_y, &d_sorted_Ny_[ ix0 - 1 ], sizeof( int ), cudaMemcpyDeviceToHost ) ); + if ( dim_y < 1 ) + { + dim_y = 1; + } + dim_x = ( frame_area_ - 1 ) / dim_y + 1; ix0 -= dim_x; - if (ix0<0) { + if ( ix0 < 0 ) + { dim_x += ix0; ix0 = 0; - } - Frame1DNestedLoopKernel<<<(dim_x*dim_y+1023)/1024, 1024>>> - (ix0, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_); + } + Frame1DNestedLoopKernel< i_func > <<< ( dim_x * dim_y + 1023 ) / 1024, 1024 >>>( + ix0, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_ ); } - gpuErrchk(cudaPeekAtLastError()); - //gpuErrchk(cudaDeviceSynchronize()); - + gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk(cudaDeviceSynchronize()); + return 0; } ////////////////////////////////////////////////////////////////////// -template -int NestedLoop::Frame2DNestedLoop(int Nx, int *d_Ny) +template < int i_func > +int +NestedLoop::Frame2DNestedLoop( int Nx, int* d_Ny ) { - if (Nx <= 0) return 0; + if ( Nx <= 0 ) + { + return 0; + } // Sort the pairs (ix, Ny) with ix=0,..,Nx-1 in ascending order of Ny. 
// After the sorting operation, d_sorted_idx_ are the reordered indexes ix - // and d_sorted_Ny_ are the sorted values of Ny - cub::DeviceRadixSort::SortPairs(d_sort_storage_, sort_storage_bytes_, - d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_, - Nx); - int ix0 = Nx; // proceeds from right to left - while(ix0>0) { - int dim_x, dim_y; // width and height of the rectangular frame - gpuErrchk(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int), - cudaMemcpyDeviceToHost)); - if (dim_y < 1) dim_y = 1; + // and d_sorted_Ny_ are the sorted values of Ny + //// + cub::DeviceRadixSort::SortPairs( + d_sort_storage_, sort_storage_bytes_, d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_, Nx ); + //// + + int ix0 = Nx; // proceeds from right to left + while ( ix0 > 0 ) + { + int dim_x, dim_y; // width and height of the rectangular frame + gpuErrchk( cudaMemcpy( &dim_y, &d_sorted_Ny_[ ix0 - 1 ], sizeof( int ), cudaMemcpyDeviceToHost ) ); + if ( dim_y < 1 ) + { + dim_y = 1; + } // frame_area_ is the fixed value of the the rectangular frame area - dim_x = (frame_area_ - 1) / dim_y + 1; // width of the rectangular frame - ix0 -= dim_x; // update the index value - if (ix0<0) { - dim_x += ix0; // adjust the width if ix0<0 + dim_x = ( frame_area_ - 1 ) / dim_y + 1; // width of the rectangular frame + ix0 -= dim_x; // update the index value + if ( ix0 < 0 ) + { + dim_x += ix0; // adjust the width if ix0<0 ix0 = 0; - } - dim3 threadsPerBlock(block_dim_x_, block_dim_y_); // block size - dim3 numBlocks((dim_x - 1)/threadsPerBlock.x + 1, - (dim_y - 1)/threadsPerBlock.y + 1); + } + dim3 threadsPerBlock( block_dim_x_, block_dim_y_ ); // block size + dim3 numBlocks( ( dim_x - 1 ) / threadsPerBlock.x + 1, ( dim_y - 1 ) / threadsPerBlock.y + 1 ); // run a nested loop kernel on the rectangular frame - Frame2DNestedLoopKernel <<>> - (ix0, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_); - + Frame2DNestedLoopKernel< i_func > <<< numBlocks, threadsPerBlock>>>( + ix0, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_ ); } 
- gpuErrchk(cudaPeekAtLastError()); - //gpuErrchk(cudaDeviceSynchronize()); + gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk(cudaDeviceSynchronize()); return 0; } ////////////////////////////////////////////////////////////////////// -template -int NestedLoop::Smart1DNestedLoop(int Nx, int *d_Ny) +template < int i_func > +int +NestedLoop::Smart1DNestedLoop( int Nx, int* d_Ny ) { // Find max value of Ny - cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_, d_Ny, - d_max_Ny_, Nx); + //// + cub::DeviceReduce::Max( d_reduce_storage_, reduce_storage_bytes_, d_Ny, d_max_Ny_, Nx ); + //// + int max_Ny; - gpuErrchk(cudaMemcpy(&max_Ny, d_max_Ny_, sizeof(int), - cudaMemcpyDeviceToHost)); - if (Nx <= 0) return 0; - float f_Nx = 2.0*log((float)Nx)-5; - int i_Nx = (int)floor(f_Nx); + gpuErrchk( cudaMemcpy( &max_Ny, d_max_Ny_, sizeof( int ), cudaMemcpyDeviceToHost ) ); + if ( Nx <= 0 ) + { + return 0; + } + float f_Nx = 2.0 * log( ( float ) Nx ) - 5; + int i_Nx = ( int ) floor( f_Nx ); int Ny_th; - if (i_Nx<0) { - Ny_th = Ny_th_arr_[0]; + if ( i_Nx < 0 ) + { + Ny_th = Ny_th_arr_[ 0 ]; + } + else if ( i_Nx >= Ny_arr_size_ - 1 ) + { + Ny_th = Ny_th_arr_[ Ny_arr_size_ - 1 ]; } - else if (i_Nx>=Ny_arr_size_-1) { - Ny_th = Ny_th_arr_[Ny_arr_size_-1]; + else + { + float t = f_Nx - ( float ) i_Nx; + Ny_th = Ny_th_arr_[ i_Nx ] * ( 1.0 - t ) + Ny_th_arr_[ i_Nx + 1 ] * t; } - else { - float t = f_Nx - (float)i_Nx; - Ny_th = Ny_th_arr_[i_Nx]*(1.0 - t) + Ny_th_arr_[i_Nx+1]*t; + if ( max_Ny < Ny_th ) + { + return SimpleNestedLoop< i_func >( Nx, d_Ny, max_Ny ); } - if (max_Ny(Nx, d_Ny, max_Ny); + + if ( max_Ny < 1 ) + { + max_Ny = 1; } - if(max_Ny < 1) max_Ny = 1; - int dim_x, dim_y; // Run sorting operation - cub::DeviceRadixSort::SortPairs(d_sort_storage_, sort_storage_bytes_, - d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_, - Nx); + //// + cub::DeviceRadixSort::SortPairs( + d_sort_storage_, sort_storage_bytes_, d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_, Nx ); + //// + // 
CudaCheckError(); // uncomment only for debugging - - int ix1 = (int)round(x_lim_*Nx); - if (ix1==Nx) ix1 = Nx - 1; + + int ix1 = ( int ) round( x_lim_ * Nx ); + if ( ix1 == Nx ) + { + ix1 = Nx - 1; + } int Ny1; - gpuErrchk(cudaMemcpy(&Ny1, &d_sorted_Ny_[ix1], sizeof(int), - cudaMemcpyDeviceToHost)); - if(Ny1 < 1) Ny1 = 1; - - dim3 threadsPerBlock(block_dim_x_, block_dim_y_); // block size - int nbx = (Nx - 1)/threadsPerBlock.x + 1; - int nby = (Ny1 - 1)/threadsPerBlock.y + 1; - Ny1 = nby*threadsPerBlock.y; - - dim3 numBlocks(nbx, nby); - SimpleNestedLoopKernel <<>>(Nx, d_Ny); - //CudaCheckError(); // uncomment only for debugging - + gpuErrchk( cudaMemcpy( &Ny1, &d_sorted_Ny_[ ix1 ], sizeof( int ), cudaMemcpyDeviceToHost ) ); + if ( Ny1 < 1 ) + { + Ny1 = 1; + } + + dim3 threadsPerBlock( block_dim_x_, block_dim_y_ ); // block size + int nbx = ( Nx - 1 ) / threadsPerBlock.x + 1; + int nby = ( Ny1 - 1 ) / threadsPerBlock.y + 1; + Ny1 = nby * threadsPerBlock.y; + + dim3 numBlocks( nbx, nby ); + SimpleNestedLoopKernel< i_func > <<< numBlocks, threadsPerBlock>>>( Nx, d_Ny ); + // CudaCheckError(); // uncomment only for debugging + int ix0 = Nx; - while(ix0>ix1) { - gpuErrchk(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int), - cudaMemcpyDeviceToHost)); + while ( ix0 > ix1 ) + { + gpuErrchk( cudaMemcpy( &dim_y, &d_sorted_Ny_[ ix0 - 1 ], sizeof( int ), cudaMemcpyDeviceToHost ) ); dim_y -= Ny1; - if (dim_y<=0) break; - dim_x = (frame_area_ - 1) / dim_y + 1; + if ( dim_y <= 0 ) + { + break; + } + dim_x = ( frame_area_ - 1 ) / dim_y + 1; ix0 -= dim_x; - if (ix0<<<(dim_x*dim_y+1023)/1024, 1024>>> - (ix0, Ny1, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_); - //CudaCheckError(); // uncomment only for debugging + } + Smart1DNestedLoopKernel< i_func > <<< ( dim_x * dim_y + 1023 ) / 1024, 1024 >>>( + ix0, Ny1, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_ ); + // CudaCheckError(); // uncomment only for debugging } - gpuErrchk(cudaPeekAtLastError()); - 
//gpuErrchk(cudaDeviceSynchronize()); + gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk(cudaDeviceSynchronize()); return 0; } ////////////////////////////////////////////////////////////////////// -template -int NestedLoop::Smart2DNestedLoop(int Nx, int *d_Ny) +template < int i_func > +int +NestedLoop::Smart2DNestedLoop( int Nx, int* d_Ny ) { // Find max value of Ny - cub::DeviceReduce::Max(d_reduce_storage_, reduce_storage_bytes_, d_Ny, - d_max_Ny_, Nx); + //// + cub::DeviceReduce::Max( d_reduce_storage_, reduce_storage_bytes_, d_Ny, d_max_Ny_, Nx ); + //// + int max_Ny; - gpuErrchk(cudaMemcpy(&max_Ny, d_max_Ny_, sizeof(int), - cudaMemcpyDeviceToHost)); - if (Nx <= 0) return 0; - float f_Nx = 2.0*log((float)Nx)-5; - int i_Nx = (int)floor(f_Nx); + gpuErrchk( cudaMemcpy( &max_Ny, d_max_Ny_, sizeof( int ), cudaMemcpyDeviceToHost ) ); + if ( Nx <= 0 ) + { + return 0; + } + float f_Nx = 2.0 * log( ( float ) Nx ) - 5; + int i_Nx = ( int ) floor( f_Nx ); int Ny_th; - if (i_Nx<0) { - Ny_th = Ny_th_arr_[0]; + if ( i_Nx < 0 ) + { + Ny_th = Ny_th_arr_[ 0 ]; } - else if (i_Nx>=Ny_arr_size_-1) { - Ny_th = Ny_th_arr_[Ny_arr_size_-1]; + else if ( i_Nx >= Ny_arr_size_ - 1 ) + { + Ny_th = Ny_th_arr_[ Ny_arr_size_ - 1 ]; } - else { - float t = f_Nx - (float)i_Nx; - Ny_th = Ny_th_arr_[i_Nx]*(1.0 - t) + Ny_th_arr_[i_Nx+1]*t; + else + { + float t = f_Nx - ( float ) i_Nx; + Ny_th = Ny_th_arr_[ i_Nx ] * ( 1.0 - t ) + Ny_th_arr_[ i_Nx + 1 ] * t; } - if (max_Ny(Nx, d_Ny, max_Ny); + if ( max_Ny < Ny_th ) + { + return SimpleNestedLoop< i_func >( Nx, d_Ny, max_Ny ); } - if(max_Ny < 1) max_Ny = 1; + if ( max_Ny < 1 ) + { + max_Ny = 1; + } int dim_x, dim_y; // Run sorting operation - cub::DeviceRadixSort::SortPairs(d_sort_storage_, sort_storage_bytes_, - d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_, - Nx); + //// + cub::DeviceRadixSort::SortPairs( + d_sort_storage_, sort_storage_bytes_, d_Ny, d_sorted_Ny_, d_idx_, d_sorted_idx_, Nx ); + //// + // CudaCheckError(); // uncomment only for 
debugging - - int ix1 = (int)round(x_lim_*Nx); - if (ix1==Nx) ix1 = Nx - 1; + + int ix1 = ( int ) round( x_lim_ * Nx ); + if ( ix1 == Nx ) + { + ix1 = Nx - 1; + } int Ny1; - gpuErrchk(cudaMemcpy(&Ny1, &d_sorted_Ny_[ix1], sizeof(int), - cudaMemcpyDeviceToHost)); - if(Ny1 < 1) Ny1 = 1; - - dim3 threadsPerBlock(block_dim_x_, block_dim_y_); // block size - int nbx = (Nx - 1)/threadsPerBlock.x + 1; - int nby = (Ny1 - 1)/threadsPerBlock.y + 1; - Ny1 = nby*threadsPerBlock.y; - - dim3 numBlocks(nbx, nby); - SimpleNestedLoopKernel <<>>(Nx, d_Ny); - //CudaCheckError(); // uncomment only for debugging - + gpuErrchk( cudaMemcpy( &Ny1, &d_sorted_Ny_[ ix1 ], sizeof( int ), cudaMemcpyDeviceToHost ) ); + if ( Ny1 < 1 ) + { + Ny1 = 1; + } + + dim3 threadsPerBlock( block_dim_x_, block_dim_y_ ); // block size + int nbx = ( Nx - 1 ) / threadsPerBlock.x + 1; + int nby = ( Ny1 - 1 ) / threadsPerBlock.y + 1; + Ny1 = nby * threadsPerBlock.y; + + dim3 numBlocks( nbx, nby ); + SimpleNestedLoopKernel< i_func > <<< numBlocks, threadsPerBlock>>>( Nx, d_Ny ); + // CudaCheckError(); // uncomment only for debugging + int ix0 = Nx; - while(ix0>ix1) { - gpuErrchk(cudaMemcpy(&dim_y, &d_sorted_Ny_[ix0-1], sizeof(int), - cudaMemcpyDeviceToHost)); + while ( ix0 > ix1 ) + { + gpuErrchk( cudaMemcpy( &dim_y, &d_sorted_Ny_[ ix0 - 1 ], sizeof( int ), cudaMemcpyDeviceToHost ) ); dim_y -= Ny1; - if (dim_y<=0) break; - dim_x = (frame_area_ - 1) / dim_y + 1; + if ( dim_y <= 0 ) + { + break; + } + dim_x = ( frame_area_ - 1 ) / dim_y + 1; ix0 -= dim_x; - if (ix0 <<>> - (ix0, Ny1, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_); - //CudaCheckError(); // uncomment only for debugging + dim3 threadsPerBlock( block_dim_x_, block_dim_y_ ); // block size + dim3 numBlocks( ( dim_x - 1 ) / threadsPerBlock.x + 1, ( dim_y - 1 ) / threadsPerBlock.y + 1 ); + Smart2DNestedLoopKernel< i_func > <<< numBlocks, threadsPerBlock>>>( + ix0, Ny1, dim_x, dim_y, d_sorted_idx_, d_sorted_Ny_ ); + // CudaCheckError(); // uncomment only for 
debugging } - gpuErrchk(cudaPeekAtLastError()); - //gpuErrchk(cudaDeviceSynchronize()); - + gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk(cudaDeviceSynchronize()); + return 0; } ////////////////////////////////////////////////////////////////////// -template -int NestedLoop::CumulSumNestedLoop(int Nx, int *d_Ny) +template < int i_func > +int +NestedLoop::CumulSumNestedLoop( int Nx, int* d_Ny ) { - //TMP - //double time_mark=getRealTime(); + // TMP + // double time_mark=getRealTime(); + // + prefix_scan_.Scan( d_Ny_cumul_sum_, d_Ny, Nx + 1 ); + // TMP + // printf("pst: %lf\n", getRealTime()-time_mark); // - prefix_scan_.Scan(d_Ny_cumul_sum_, d_Ny, Nx+1); - //TMP - //printf("pst: %lf\n", getRealTime()-time_mark); - // int Ny_sum; - gpuErrchk(cudaMemcpy(&Ny_sum, &d_Ny_cumul_sum_[Nx], - sizeof(int), cudaMemcpyDeviceToHost)); + gpuErrchk( cudaMemcpy( &Ny_sum, &d_Ny_cumul_sum_[ Nx ], sizeof( int ), cudaMemcpyDeviceToHost ) ); - //printf("CSNL: %d %d\n", Nx, Ny_sum); - - //printf("Ny_sum %u\n", Ny_sum); - //temporary - remove + // printf("CSNL: %d %d\n", Nx, Ny_sum); + + // printf("Ny_sum %u\n", Ny_sum); + // temporary - remove /* if (Ny_sum==0) { printf("Nx %d\n", Nx); for (int i=0; i0) { + if ( Ny_sum > 0 ) + { int grid_dim_x, grid_dim_y; - if (Ny_sum<65536*1024) { // max grid dim * max block dim - grid_dim_x = (Ny_sum+1023)/1024; + if ( Ny_sum < 65536 * 1024 ) + { // max grid dim * max block dim + grid_dim_x = ( Ny_sum + 1023 ) / 1024; grid_dim_y = 1; } - else { + else + { grid_dim_x = 32; // I think it's not necessary to increase it - if (Ny_sum>grid_dim_x*1024*65535) { - throw ngpu_exception(std::string("Ny sum ") + std::to_string(Ny_sum) + - " larger than threshold " - + std::to_string(grid_dim_x*1024*65535)); + if ( Ny_sum > grid_dim_x * 1024 * 65535 ) + { + throw ngpu_exception( std::string( "Ny sum " ) + std::to_string( Ny_sum ) + " larger than threshold " + + std::to_string( grid_dim_x * 1024 * 65535 ) ); } - grid_dim_y = (Ny_sum + grid_dim_x*1024 -1) / 
(grid_dim_x*1024); + grid_dim_y = ( Ny_sum + grid_dim_x * 1024 - 1 ) / ( grid_dim_x * 1024 ); } - dim3 numBlocks(grid_dim_x, grid_dim_y); - //TMP - //double time_mark=getRealTime(); + dim3 numBlocks( grid_dim_x, grid_dim_y ); + // TMP + // double time_mark=getRealTime(); // - CumulSumNestedLoopKernel<<>> - (Nx, d_Ny_cumul_sum_, Ny_sum); - gpuErrchk(cudaPeekAtLastError()); - //gpuErrchk(cudaDeviceSynchronize()); + CumulSumNestedLoopKernel< i_func > <<< numBlocks, 1024 >>>( Nx, d_Ny_cumul_sum_, Ny_sum ); + gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk(cudaDeviceSynchronize()); - //TMP - //printf("cst: %lf\n", getRealTime()-time_mark); + // TMP + // printf("cst: %lf\n", getRealTime()-time_mark); // } - + return 0; } - #endif diff --git a/src/nestgpu.cu b/src/nestgpu.cu index 3f6011193..0220cc425 100644 --- a/src/nestgpu.cu +++ b/src/nestgpu.cu @@ -20,50 +20,57 @@ * */ -//#define CHECKRC - - - -#include -#include -#include +#include #include +#include +#include #include +#include +#include #include -#include -#include -#include "distribution.h" -#include "syn_model.h" -#include "spike_buffer.h" #include "cuda_error.h" -#include "send_spike.h" +#include "distribution.h" #include "get_spike.h" +#include "send_spike.h" +#include "spike_buffer.h" +#include "syn_model.h" -#include "spike_generator.h" -#include "multimeter.h" +#include "connect.h" #include "getRealTime.h" -#include "random.h" -#include "nestgpu.h" +#include "multimeter.h" #include "nested_loop.h" -#include "rev_spike.h" -#include "remote_spike.h" -#include "connect.h" +#include "nestgpu.h" #include "poiss_gen.h" +#include "random.h" +#include "remote_spike.h" +#include "rev_spike.h" +#include "spike_generator.h" +#include "conn12b.h" +#include "conn16b.h" #include "remote_connect.h" ////////////// TEMPORARY #include "scan.h" ////////////////////// -//#define VERBOSE_TIME +// #define VERBOSE_TIME __constant__ double NESTGPUTime; __constant__ long long NESTGPUTimeIdx; __constant__ float 
NESTGPUTimeResolution; -enum KernelFloatParamIndexes { +namespace cuda_error_ns +{ +std::map< void*, size_t > alloc_map_; +size_t mem_used_; +size_t mem_max_; +int verbose_; +} // namespace cuda_error_ns + +enum KernelFloatParamIndexes +{ i_time_resolution = 0, i_max_spike_num_fact, i_max_spike_per_host_fact, @@ -71,254 +78,125 @@ enum KernelFloatParamIndexes { N_KERNEL_FLOAT_PARAM }; -enum KernelIntParamIndexes { +enum KernelIntParamIndexes +{ i_rnd_seed = 0, i_verbosity_level, i_max_spike_buffer_size, i_max_node_n_bits, i_max_syn_n_bits, + i_max_delay_n_bits, + i_conn_struct_type, N_KERNEL_INT_PARAM }; -enum KernelBoolParamIndexes { +enum KernelBoolParamIndexes +{ i_print_time, i_remove_conn_key, i_remote_spike_height, N_KERNEL_BOOL_PARAM }; -const std::string kernel_float_param_name[N_KERNEL_FLOAT_PARAM] = { - "time_resolution", +enum ConnStructType +{ + i_conn12b, + i_conn16b, + N_CONN_STRUCT_TYPE +}; + +const std::string kernel_float_param_name[ N_KERNEL_FLOAT_PARAM ] = { "time_resolution", "max_spike_num_fact", "max_spike_per_host_fact", - "max_remote_spike_num_fact" -}; + "max_remote_spike_num_fact" }; -const std::string kernel_int_param_name[N_KERNEL_INT_PARAM] = { - "rnd_seed", +const std::string kernel_int_param_name[ N_KERNEL_INT_PARAM ] = { "rnd_seed", "verbosity_level", "max_spike_buffer_size", "max_node_n_bits", - "max_syn_n_bits" -}; + "max_syn_n_bits", + "max_delay_n_bits", + "conn_struct_type" }; -const std::string kernel_bool_param_name[N_KERNEL_BOOL_PARAM] = { - "print_time", +const std::string kernel_bool_param_name[ N_KERNEL_BOOL_PARAM ] = { "print_time", "remove_conn_key", - "remote_spike_height" -}; - -int NESTGPU::FreeConnRandomGenerator() -{ - for (int i_host=0; i_host 1) ? true : false; - gpuErrchk(cudaMemcpyToSymbolAsync(ExternalSpikeFlag, &external_spike_flag_, - sizeof(bool))); + conn_->setNHosts( n_hosts ); + SetRandomSeed( kernel_seed_ ); + n_remote_nodes_.assign( n_hosts_, 0 ); + external_spike_flag_ = ( n_hosts > 1 ) ? 
true : false; + gpuErrchk( cudaMemcpyToSymbolAsync( ExternalSpikeFlag, &external_spike_flag_, sizeof( bool ) ) ); return 0; } -int NESTGPU::setThisHost(int i_host) +int +NESTGPU::setThisHost( int i_host ) { this_host_ = i_host; - SetRandomSeed(kernel_seed_); + conn_->setThisHost( i_host ); + SetRandomSeed( kernel_seed_ ); return 0; } - - - - - NESTGPU::NESTGPU() { n_hosts_ = 1; this_host_ = 0; external_spike_flag_ = false; - InitConnRandomGenerator(); - + + time_resolution_ = 0.1; // time resolution in ms + random_generator_ = new curandGenerator_t; - CURAND_CALL(curandCreateGenerator(random_generator_, - CURAND_RNG_PSEUDO_DEFAULT)); - - //SetRandomSeed(54321ULL); - //SetRandomSeed(54322ULL); - //SetRandomSeed(54323ULL); - //SetRandomSeed(54328ULL); - SetRandomSeed(54328ULL-5-12345); + CURAND_CALL( curandCreateGenerator( random_generator_, CURAND_RNG_PSEUDO_DEFAULT ) ); + kernel_seed_ = 123456789ULL; + CURAND_CALL( curandSetPseudoRandomGeneratorSeed( *random_generator_, kernel_seed_ + this_host_ ) ); + + conn_ = nullptr; + // by default, connection structure type used is the 12-byte type + setConnStructType( i_conn12b ); + // setConnStructType( i_conn16b ); distribution_ = new Distribution; multimeter_ = new Multimeter; calibrate_flag_ = false; create_flag_ = false; - ConnectionSpikeTimeFlag = false; - rev_conn_flag_ = false; - h_NRevConn = 0; - + + cuda_error_ns::mem_used_ = 0; + cuda_error_ns::mem_max_ = 0; + start_real_time_ = getRealTime(); max_spike_buffer_size_ = 20; t_min_ = 0.0; - sim_time_ = 1000.0; //Simulation time in ms - //n_poiss_nodes_ = 0; - n_remote_nodes_.assign(1, 0); - n_image_nodes_ = 0; - SetTimeResolution(0.1); // time resolution in ms + sim_time_ = 1000.0; // Simulation time in ms + // n_poiss_nodes_ = 0; + n_remote_nodes_.assign( 1, 0 ); + max_spike_num_fact_ = 1.0; max_spike_per_host_fact_ = 1.0; max_remote_spike_num_fact_ = 1.0; - setMaxNodeNBits(20); // maximum number of nodes is 2^20 - setMaxSynNBits(6); // maximum number of synapse 
groups is 2^6 - + error_flag_ = false; error_message_ = ""; error_code_ = 0; - + on_exception_ = ON_EXCEPTION_EXIT; verbosity_level_ = 4; + cuda_error_ns::verbose_ = 0; print_time_ = false; remove_conn_key_ = false; - + mpi_flag_ = false; remote_spike_height_ = false; -#ifdef CHECKRC - // TEMPORARY, REMOVE!!!!!!!!!!!!!!!!! - int this_host = 0; - //int this_host = 1; - setHostNum(5); - setThisHost(this_host); - - RemoteConnectionMapInit(n_hosts_); // (uint n_hosts) - - int n_neurons = 30; - int CE = 3; - Create("iaf_psc_exp", n_neurons); - - float mean_delay = 0.5; - float std_delay = 0.25; - float min_delay = 0.1; - float w = 1.0; - - ConnSpec conn_spec1(FIXED_INDEGREE, CE); - SynSpec syn_spec1; - syn_spec1.SetParam("receptor", 0); - syn_spec1.SetParam("weight", w); - syn_spec1.SetParam("delay_distribution", DISTR_TYPE_NORMAL_CLIPPED); - syn_spec1.SetParam("delay_mu", mean_delay); - syn_spec1.SetParam("delay_low", min_delay); - syn_spec1.SetParam("delay_high", mean_delay+3*std_delay); - syn_spec1.SetParam("delay_sigma", std_delay); - - const int n_source = 10; - int h_source_node_index[n_source] = {21, 24, 21, 24, 22, 21, 23, 25, 26, 22}; - int *d_source_node_index; - CUDAMALLOCCTRL("&d_source_node_index",&d_source_node_index, n_source*sizeof(int)); - gpuErrchk(cudaMemcpy(d_source_node_index, h_source_node_index, - n_source*sizeof(int), cudaMemcpyHostToDevice)); - - - _RemoteConnect(this_host, 1, d_source_node_index, 10, 0, 10, 3, - conn_spec1, syn_spec1); - - //_RemoteConnectSource(1, d_source_node_index, 10, 10, 3, - // conn_spec1, syn_spec1); - //_RemoteConnectTarget(0, d_source_node_index, 10, 10, 3, - // conn_spec1, syn_spec1); - - - std::cout << "##################################################\n"; - std::cout << "##################################################\n"; - std::cout << "SECOND CONNECT COMMAND\n"; - std::cout << "##################################################\n"; - std::cout << "##################################################\n"; - 
_RemoteConnect(this_host, 1, 20, 10, 0, 10, 3, conn_spec1, syn_spec1); - //_RemoteConnectSource(1, 20, 10, 10, 3, conn_spec1, syn_spec1); - //_RemoteConnectTarget(0, 20, 10, 10, 3, conn_spec1, syn_spec1); - - ConnSpec conn_spec2(ALL_TO_ALL); - - int n_source2 = 4; - int h_source_node_index2[n_source2] = - {1, 2, 3, 4}; - int *d_source_node_index2; - CUDAMALLOCCTRL("&d_source_node_index2",&d_source_node_index2, n_source2*sizeof(int)); - gpuErrchk(cudaMemcpy(d_source_node_index2, h_source_node_index2, - n_source2*sizeof(int), cudaMemcpyHostToDevice)); - _RemoteConnect(this_host, 1, d_source_node_index2, n_source2, 3, 0, 1, - conn_spec2, syn_spec1); - - int n_source3 = 3; - int h_source_node_index3[n_source3] = - {2, 3, 4}; - int *d_source_node_index3; - CUDAMALLOCCTRL("&d_source_node_index3",&d_source_node_index3, n_source3*sizeof(int)); - gpuErrchk(cudaMemcpy(d_source_node_index3, h_source_node_index3, - n_source3*sizeof(int), cudaMemcpyHostToDevice)); - _RemoteConnect(this_host, 1, d_source_node_index3, n_source3, 2, 0, 1, - conn_spec2, syn_spec1); - - int n_source4 = 2; - int h_source_node_index4[n_source4] = - {3, 4}; - int *d_source_node_index4; - CUDAMALLOCCTRL("&d_source_node_index4",&d_source_node_index4, n_source4*sizeof(int)); - gpuErrchk(cudaMemcpy(d_source_node_index4, h_source_node_index4, - n_source4*sizeof(int), cudaMemcpyHostToDevice)); - _RemoteConnect(this_host, 1, d_source_node_index4, n_source4, 4, 0, 1, - conn_spec2, syn_spec1); - - int n_source5 = 1; - int h_source_node_index5[n_source5] = {4}; - int *d_source_node_index5; - CUDAMALLOCCTRL("&d_source_node_index5",&d_source_node_index5, n_source5*sizeof(int)); - gpuErrchk(cudaMemcpy(d_source_node_index5, h_source_node_index5, - n_source5*sizeof(int), cudaMemcpyHostToDevice)); - _RemoteConnect(this_host, 1, d_source_node_index5, n_source5, 0, 0, 1, - conn_spec2, syn_spec1); - - - //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
- //RemoteConnectionMapCalibrate(this_host, 5); - Calibrate(); - -#endif - - // NestedLoop::Init(); moved to calibrate nested_loop_algo_ = CumulSumNestedLoopAlgo; SpikeBufferUpdate_time_ = 0; @@ -342,317 +220,280 @@ NESTGPU::~NESTGPU() gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - if (calibrate_flag_) { + if ( calibrate_flag_ ) + { FreeNodeGroupMap(); FreeGetSpikeArrays(); } - for (unsigned int i=0; isetRandomSeed( seed ); + return 0; } -int NESTGPU::SetTimeResolution(float time_res) +int +NESTGPU::SetTimeResolution( float time_res ) { time_resolution_ = time_res; - + conn_->setTimeResolution( time_resolution_ ); + return 0; } -int NESTGPU::SetNestedLoopAlgo(int nested_loop_algo) +int +NESTGPU::SetNestedLoopAlgo( int nested_loop_algo ) { nested_loop_algo_ = nested_loop_algo; - + return 0; } -int NESTGPU::SetMaxSpikeBufferSize(int max_size) +int +NESTGPU::SetMaxSpikeBufferSize( int max_size ) { max_spike_buffer_size_ = max_size; - + return 0; } -int NESTGPU::GetMaxSpikeBufferSize() +int +NESTGPU::GetMaxSpikeBufferSize() { return max_spike_buffer_size_; } -uint NESTGPU::GetNLocalNodes() +uint +NESTGPU::GetNLocalNodes() { return node_group_map_.size(); } -int NESTGPU::CheckImageNodes(int n_nodes) +uint +NESTGPU::GetNTotalNodes() +{ + return GetNLocalNodes() + conn_->getNImageNodes(); +} + +int +NESTGPU::CheckImageNodes( int n_nodes ) { int i_node_0 = GetNLocalNodes(); - int max_n_nodes = IntPow(2,h_MaxNodeNBits); - - if ((i_node_0 + n_nodes) > max_n_nodes) { - throw ngpu_exception(std::string("Local plus Image nodes exceed maximum" - " number of nodes ") - + std::to_string(max_n_nodes)); - } - + int max_n_nodes = ( int ) ( IntPow( 2, conn_->getMaxNodeNBits() ) - 1 ); + + if ( ( i_node_0 + n_nodes ) > max_n_nodes ) + { + throw ngpu_exception( std::string( "Local plus Image nodes exceed maximum" + " number of nodes " ) + + std::to_string( max_n_nodes ) ); + } + return i_node_0; } -int NESTGPU::CreateNodeGroup(int n_nodes, int n_ports) +// 
method for changing connection structure type +int +NESTGPU::setConnStructType( int conn_struct_type ) +{ + // std::cout << "In setConnStructType " << conn_struct_type << "\n"; + // Check if conn_ pointer has a nonzero value. + // In this case connection object must be deallocated + if ( conn_ != nullptr ) + { + delete conn_; + } + // set new connection structure type + conn_struct_type_ = conn_struct_type; + // create connection object from the proper derived class + // Note that conn_ is of the type pointer-to-the(abstract)-base class + // while the object is in instance of a derived class + // defined using templates + switch ( conn_struct_type ) + { + case i_conn12b: + conn_ = new ConnectionTemplate< conn12b_key, conn12b_struct >; + break; + case i_conn16b: + conn_ = new ConnectionTemplate< conn16b_key, conn16b_struct >; + break; + default: + throw ngpu_exception( "Unrecognized connection structure type index" ); + } + conn_->setRandomSeed( kernel_seed_ ); + + // set time resolution in connection object + conn_->setTimeResolution( time_resolution_ ); + + return 0; +} + +int +NESTGPU::CreateNodeGroup( int n_nodes, int n_ports ) { int i_node_0 = GetNLocalNodes(); - int max_n_nodes = IntPow(2,h_MaxNodeNBits); - int max_n_ports = IntPow(2, (h_MaxPortSynNBits-h_MaxSynNBits-1)); - - if ((i_node_0 + n_nodes) > max_n_nodes) { - throw ngpu_exception(std::string("Maximum number of nodes ") - + std::to_string(max_n_nodes) + " exceeded"); + int max_node_nbits = conn_->getMaxNodeNBits(); + int max_n_nodes = ( int ) ( IntPow( 2, max_node_nbits ) - 1 ); + int max_n_ports = ( int ) ( IntPow( 2, conn_->getMaxPortNBits() ) - 1 ); + // std::cout << "max_node_nbits " << max_node_nbits << "\n"; + + if ( ( i_node_0 + n_nodes ) > max_n_nodes ) + { + throw ngpu_exception( + std::string( "Maximum number of local nodes " ) + std::to_string( max_n_nodes ) + " exceeded" ); } - if (n_ports > max_n_ports) { - throw ngpu_exception(std::string("Maximum number of ports ") - + 
std::to_string(max_n_ports) + " exceeded"); + if ( n_ports > max_n_ports ) + { + throw ngpu_exception( std::string( "Maximum number of ports " ) + std::to_string( max_n_ports ) + " exceeded" ); } int i_group = node_vect_.size() - 1; - node_group_map_.insert(node_group_map_.end(), n_nodes, i_group); + node_group_map_.insert( node_group_map_.end(), n_nodes, i_group ); + + node_vect_[ i_group ]->random_generator_ = random_generator_; + node_vect_[ i_group ]->Init( i_node_0, n_nodes, n_ports, i_group ); + node_vect_[ i_group ]->get_spike_array_ = InitGetSpikeArray( n_nodes, n_ports ); - node_vect_[i_group]->random_generator_ = random_generator_; - node_vect_[i_group]->Init(i_node_0, n_nodes, n_ports, i_group); - node_vect_[i_group]->get_spike_array_ = InitGetSpikeArray(n_nodes, n_ports); - return i_node_0; } -int NESTGPU::CheckUncalibrated(std::string message) +int +NESTGPU::CheckUncalibrated( std::string message ) { - if (calibrate_flag_ == true) { - throw ngpu_exception(message); + if ( calibrate_flag_ == true ) + { + throw ngpu_exception( message ); } - + return 0; } -int NESTGPU::Calibrate() +int +NESTGPU::Calibrate() { - CheckUncalibrated("Calibration can be made only once"); - - if (verbosity_level_>=1) { + CheckUncalibrated( "Calibration can be made only once" ); + + if ( verbosity_level_ >= 1 ) + { std::cout << HostIdStr() << "Calibrating ...\n"; } - - gpuErrchk(cudaMemcpyToSymbol(NESTGPUTimeResolution, &time_resolution_, - sizeof(float))); - gpuErrchk(cudaMemcpyToSymbol(have_remote_spike_height, &remote_spike_height_, - sizeof(bool))); -/////////////////////////////////// + gpuErrchk( cudaMemcpyToSymbol( NESTGPUTimeResolution, &time_resolution_, sizeof( float ) ) ); + + gpuErrchk( cudaMemcpyToSymbol( have_remote_spike_height, &remote_spike_height_, sizeof( bool ) ) ); + /////////////////////////////////// int n_nodes = GetNLocalNodes(); - gpuErrchk(cudaMemcpyToSymbol(n_local_nodes, &n_nodes, - sizeof(int))); + gpuErrchk( cudaMemcpyToSymbol( n_local_nodes, 
&n_nodes, sizeof( int ) ) ); - // std::cout << "n_local_nodes: " << n_nodes << " n_image_nodes_: " - // << n_image_nodes_ << "\n"; - if (n_image_nodes_ > 0) { - CheckImageNodes(n_image_nodes_); - addOffsetToExternalNodeIds(); + int n_image_nodes = conn_->getNImageNodes(); + // std::cout << "n_local_nodes: " << n_nodes << " n_image_nodes: " + // << n_image_nodes << "\n"; + if ( n_image_nodes > 0 ) + { + CheckImageNodes( n_image_nodes ); + conn_->addOffsetToExternalNodeIds( GetNLocalNodes() ); } - + calibrate_flag_ = true; - - organizeConnections(time_resolution_, GetTotalNNodes(), - NConn, h_ConnBlockSize, - KeySubarray, ConnectionSubarray); - - ConnectInit(); - - poiss_conn::OrganizeDirectConnections(); - for (unsigned int i=0; ihas_dir_conn_) { - node_vect_[i]->buildDirectConnections(); + + conn_->organizeConnections( GetNTotalNodes() ); + + conn_->calibrate(); + + poiss_conn::organizeDirectConnections( conn_ ); + for ( unsigned int i = 0; i < node_vect_.size(); i++ ) + { + if ( node_vect_[ i ]->has_dir_conn_ ) + { + node_vect_[ i ]->buildDirectConnections(); } } - - if (remove_conn_key_) { - freeConnectionKey(KeySubarray); + + if ( remove_conn_key_ ) + { + conn_->freeConnectionKey(); } - - int max_delay_num = h_MaxDelayNum; - - unsigned int n_spike_buffers = GetTotalNNodes(); - NestedLoop::Init(n_spike_buffers); - + + int max_delay_num = conn_->getMaxDelayNum(); + + unsigned int n_spike_buffers = GetNTotalNodes(); + NestedLoop::Init( n_spike_buffers ); + // temporary gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - - + neural_time_ = t_min_; - + NodeGroupArrayInit(); - - max_spike_num_ = (int)round(max_spike_num_fact_ - * GetTotalNNodes() - * max_delay_num); - max_spike_num_ = (max_spike_num_>1) ? max_spike_num_ : 1; - - max_spike_per_host_ = (int)round(max_spike_per_host_fact_ - * GetNLocalNodes() - * max_delay_num); - max_spike_per_host_ = (max_spike_per_host_>1) ? 
max_spike_per_host_ : 1; - - max_remote_spike_num_ = max_spike_per_host_ * n_hosts_ - * max_remote_spike_num_fact_; - max_remote_spike_num_ = (max_remote_spike_num_>1) - ? max_remote_spike_num_ : 1; - - SpikeInit(max_spike_num_); - SpikeBufferInit(GetTotalNNodes(), max_spike_buffer_size_); - - //#ifndef CHECKRC - if (n_hosts_ > 1) { - RemoteConnectionMapCalibrate(this_host_, n_hosts_); - addOffsetToSpikeBufferMap(); -#ifdef CHECKRC - // TEMPORARY, FOR TESTING - std::cout << "////////////////////////////////////////\n"; - std::cout << "After addOffsetToSpikeBufferMap\n"; - std::cout << "MAP\n"; - - int tmp_n_hosts = 2; - int tmp_tg_host = 0; - int tmp_src_host = 1; - - int **tmp_pt2[tmp_n_hosts]; - int tmp_n[tmp_n_hosts]; - int tmp_map[h_node_map_block_size]; - int n_map; - int n_blocks; - - gpuErrchk(cudaMemcpy(tmp_n, d_n_local_source_node_map, - tmp_n_hosts*sizeof(int), cudaMemcpyDeviceToHost)); - n_map = tmp_n[tmp_tg_host]; - if (n_map>0) { - std::cout << "////////////////////////////////////////\n"; - std::cout << "Local Source Node Map\n"; - std::cout << "target host: " << tmp_tg_host << "\n"; - std::cout << "n_local_source_node_map: " << n_map << "\n"; - gpuErrchk(cudaMemcpy(tmp_pt2, d_local_source_node_map, - tmp_n_hosts*sizeof(int**), cudaMemcpyDeviceToHost)); - - n_blocks = (n_map - 1) / h_node_map_block_size + 1; - std::cout << "n_blocks: " << n_blocks << "\n"; - int *tmp_pt1[n_blocks]; - gpuErrchk(cudaMemcpy(tmp_pt1, tmp_pt2[tmp_tg_host], - n_blocks*sizeof(int*), cudaMemcpyDeviceToHost)); - - for (int ib=0; ib0) { - std::cout << "////////////////////////////////////////\n"; - std::cout << "Local Spike Buffer Map\n"; - std::cout << "source host: " << tmp_src_host << "\n"; - std::cout << "n_local_spike_buffer_map: " << n_map << "\n"; - gpuErrchk(cudaMemcpy(tmp_pt2, d_local_spike_buffer_map, - tmp_n_hosts*sizeof(int**), cudaMemcpyDeviceToHost)); - - n_blocks = (n_map - 1) / h_node_map_block_size + 1; - std::cout << "n_blocks: " << n_blocks << "\n"; - int 
*tmp_pt1[n_blocks]; - gpuErrchk(cudaMemcpy(tmp_pt1, tmp_pt2[tmp_src_host], - n_blocks*sizeof(int*), cudaMemcpyDeviceToHost)); - - for (int ib=0; ib 1 ) ? max_spike_num_ : 1; + + max_spike_per_host_ = ( int ) round( max_spike_per_host_fact_ * GetNLocalNodes() * max_delay_num ); + max_spike_per_host_ = ( max_spike_per_host_ > 1 ) ? max_spike_per_host_ : 1; - //////////////////////////////////////// -#endif + max_remote_spike_num_ = max_spike_per_host_ * n_hosts_ * max_remote_spike_num_fact_; + max_remote_spike_num_ = ( max_remote_spike_num_ > 1 ) ? max_remote_spike_num_ : 1; + + SpikeInit( max_spike_num_ ); + spikeBufferInit( GetNTotalNodes(), max_spike_buffer_size_, conn_->getMaxDelayNum() ); + + if ( n_hosts_ > 1 ) + { + conn_->remoteConnectionMapCalibrate( GetNLocalNodes() ); ExternalSpikeInit(); } - //#endif - if (rev_conn_flag_) { - RevSpikeInit(GetNLocalNodes()); + if ( conn_->getRevConnFlag() ) + { + conn_->revSpikeInit( GetNLocalNodes() ); } - + multimeter_->OpenFiles(); - - for (unsigned int i=0; iCalibrate(t_min_, time_resolution_); + + for ( unsigned int i = 0; i < node_vect_.size(); i++ ) + { + node_vect_[ i ]->Calibrate( t_min_, time_resolution_ ); } - + SynGroupCalibrate(); - - gpuErrchk(cudaMemcpyToSymbolAsync(NESTGPUTimeResolution, &time_resolution_, - sizeof(float))); + + gpuErrchk( cudaMemcpyToSymbolAsync( NESTGPUTimeResolution, &time_resolution_, sizeof( float ) ) ); return 0; } -int NESTGPU::Simulate(float sim_time) { +int +NESTGPU::Simulate( float sim_time ) +{ sim_time_ = sim_time; return Simulate(); } -int NESTGPU::Simulate() +int +NESTGPU::Simulate() { StartSimulation(); - - for (long long it=0; it=2 && print_time_==true) { - printf("\r[%.2lf %%] Model time: %.3lf ms", 100.0*(neural_time_-neur_t0_)/sim_time_, neural_time_); + + for ( long long it = 0; it < Nt_; it++ ) + { + if ( it % 100 == 0 && verbosity_level_ >= 2 && print_time_ == true ) + { + printf( "\r[%.2lf %%] Model time: %.3lf ms", 100.0 * ( neural_time_ - neur_t0_ ) / sim_time_, 
neural_time_ ); } SimulationStep(); } @@ -661,104 +502,98 @@ int NESTGPU::Simulate() return 0; } -int NESTGPU::StartSimulation() +int +NESTGPU::StartSimulation() { - if (!calibrate_flag_) { + if ( !calibrate_flag_ ) + { Calibrate(); } - if (first_simulation_flag_) { - gpuErrchk(cudaMemcpyToSymbolAsync(NESTGPUTime, &neural_time_, sizeof(double))); - multimeter_->WriteRecords(neural_time_); + if ( first_simulation_flag_ ) + { + gpuErrchk( cudaMemcpyToSymbolAsync( NESTGPUTime, &neural_time_, sizeof( double ) ) ); + multimeter_->WriteRecords( neural_time_ ); build_real_time_ = getRealTime(); first_simulation_flag_ = false; } - if (verbosity_level_>=1) { + if ( verbosity_level_ >= 1 ) + { std::cout << HostIdStr() << "Simulating ...\n"; - printf("Neural activity simulation time: %.3lf ms\n", sim_time_); + printf( "Neural activity simulation time: %.3lf ms\n", sim_time_ ); } - + neur_t0_ = neural_time_; it_ = 0; - Nt_ = (long long)round(sim_time_/time_resolution_); - + Nt_ = ( long long ) round( sim_time_ / time_resolution_ ); + return 0; } -int NESTGPU::EndSimulation() +int +NESTGPU::EndSimulation() { - if (verbosity_level_>=2 && print_time_==true) { - printf("\r[%.2lf %%] Model time: %.3lf ms", 100.0*(neural_time_-neur_t0_)/sim_time_, neural_time_); + if ( verbosity_level_ >= 2 && print_time_ == true ) + { + printf( "\r[%.2lf %%] Model time: %.3lf ms", 100.0 * ( neural_time_ - neur_t0_ ) / sim_time_, neural_time_ ); } end_real_time_ = getRealTime(); - //multimeter_->CloseFiles(); - //neuron.rk5.Free(); + // multimeter_->CloseFiles(); + // neuron.rk5.Free(); - if (verbosity_level_>=3) { + if ( verbosity_level_ >= 3 ) + { std::cout << "\n"; - std::cout << HostIdStr() << " SpikeBufferUpdate_time: " << - SpikeBufferUpdate_time_ << "\n"; - std::cout << HostIdStr() << " poisson_generator_time: " << - poisson_generator_time_ << "\n"; - std::cout << HostIdStr() << " neuron_Update_time: " << - neuron_Update_time_ << "\n"; - std::cout << HostIdStr() << " copy_ext_spike_time: " << 
- copy_ext_spike_time_ << "\n"; - std::cout << HostIdStr() << " organizeExternalSpike_time: " << - organizeExternalSpike_time_ << "\n"; - std::cout << HostIdStr() << " SendSpikeToRemote_time: " << - SendSpikeToRemote_time_ << "\n"; - std::cout << HostIdStr() << " RecvSpikeFromRemote_time: " << - RecvSpikeFromRemote_time_ << "\n"; - std::cout << HostIdStr() << " NestedLoop_time: " << - NestedLoop_time_ << "\n"; - std::cout << HostIdStr() << " GetSpike_time: " << - GetSpike_time_ << "\n"; - std::cout << HostIdStr() << " SpikeReset_time: " << - SpikeReset_time_ << "\n"; - std::cout << HostIdStr() << " ExternalSpikeReset_time: " << - ExternalSpikeReset_time_ << "\n"; - } - - if (n_hosts_>1 && verbosity_level_>=4) { - std::cout << HostIdStr() << " SendSpikeToRemote_comm_time: " << - SendSpikeToRemote_comm_time_ << "\n"; - std::cout << HostIdStr() << " RecvSpikeFromRemote_comm_time: " << - RecvSpikeFromRemote_comm_time_ << "\n"; - std::cout << HostIdStr() << " SendSpikeToRemote_CUDAcp_time: " << - SendSpikeToRemote_CUDAcp_time_ << "\n"; - std::cout << HostIdStr() << " RecvSpikeFromRemote_CUDAcp_time: " << - RecvSpikeFromRemote_CUDAcp_time_ << "\n"; - } - - if (verbosity_level_>=1) { - std::cout << HostIdStr() << "Building time: " << - (build_real_time_ - start_real_time_) << "\n"; - std::cout << HostIdStr() << "Simulation time: " << - (end_real_time_ - build_real_time_) << "\n"; + std::cout << HostIdStr() << " SpikeBufferUpdate_time: " << SpikeBufferUpdate_time_ << "\n"; + std::cout << HostIdStr() << " poisson_generator_time: " << poisson_generator_time_ << "\n"; + std::cout << HostIdStr() << " neuron_Update_time: " << neuron_Update_time_ << "\n"; + std::cout << HostIdStr() << " copy_ext_spike_time: " << copy_ext_spike_time_ << "\n"; + std::cout << HostIdStr() << " organizeExternalSpike_time: " << organizeExternalSpike_time_ << "\n"; + std::cout << HostIdStr() << " SendSpikeToRemote_time: " << SendSpikeToRemote_time_ << "\n"; + std::cout << HostIdStr() << " 
RecvSpikeFromRemote_time: " << RecvSpikeFromRemote_time_ << "\n"; + std::cout << HostIdStr() << " NestedLoop_time: " << NestedLoop_time_ << "\n"; + std::cout << HostIdStr() << " GetSpike_time: " << GetSpike_time_ << "\n"; + std::cout << HostIdStr() << " SpikeReset_time: " << SpikeReset_time_ << "\n"; + std::cout << HostIdStr() << " ExternalSpikeReset_time: " << ExternalSpikeReset_time_ << "\n"; + } + + if ( n_hosts_ > 1 && verbosity_level_ >= 4 ) + { + std::cout << HostIdStr() << " SendSpikeToRemote_comm_time: " << SendSpikeToRemote_comm_time_ << "\n"; + std::cout << HostIdStr() << " RecvSpikeFromRemote_comm_time: " << RecvSpikeFromRemote_comm_time_ << "\n"; + std::cout << HostIdStr() << " SendSpikeToRemote_CUDAcp_time: " << SendSpikeToRemote_CUDAcp_time_ << "\n"; + std::cout << HostIdStr() << " RecvSpikeFromRemote_CUDAcp_time: " << RecvSpikeFromRemote_CUDAcp_time_ << "\n"; + } + + if ( verbosity_level_ >= 1 ) + { + std::cout << HostIdStr() << "Building time: " << ( build_real_time_ - start_real_time_ ) << "\n"; + std::cout << HostIdStr() << "Simulation time: " << ( end_real_time_ - build_real_time_ ) << "\n"; } return 0; } - -int NESTGPU::SimulationStep() +int +NESTGPU::SimulationStep() { - if (first_simulation_flag_) { + if ( first_simulation_flag_ ) + { StartSimulation(); } double time_mark; time_mark = getRealTime(); - SpikeBufferUpdate<<<(GetTotalNNodes()+1023)/1024, 1024>>>(); + SpikeBufferUpdate<<< ( GetNTotalNodes() + 1023 ) / 1024, 1024 >>>(); gpuErrchk( cudaPeekAtLastError() ); - SpikeBufferUpdate_time_ += (getRealTime() - time_mark); + SpikeBufferUpdate_time_ += ( getRealTime() - time_mark ); time_mark = getRealTime(); - neural_time_ = neur_t0_ + (double)time_resolution_*(it_+1); - gpuErrchk(cudaMemcpyToSymbolAsync(NESTGPUTime, &neural_time_, sizeof(double))); - long long time_idx = (int)round(neur_t0_/time_resolution_) + it_ + 1; - gpuErrchk(cudaMemcpyToSymbolAsync(NESTGPUTimeIdx, &time_idx, sizeof(long long))); + neural_time_ = neur_t0_ + ( double ) 
time_resolution_ * ( it_ + 1 ); + // std::cout << "neural_time_: " << neural_time_ << "\n"; + gpuErrchk( cudaMemcpyToSymbolAsync( NESTGPUTime, &neural_time_, sizeof( double ) ) ); + long long time_idx = ( int ) round( neur_t0_ / time_resolution_ ) + it_ + 1; + gpuErrchk( cudaMemcpyToSymbolAsync( NESTGPUTimeIdx, &time_idx, sizeof( long long ) ) ); /* if (ConnectionSpikeTimeFlag) { @@ -770,1054 +605,1290 @@ int NESTGPU::SimulationStep() } } */ - - for (unsigned int i=0; iUpdate(it_, neural_time_); + + for ( unsigned int i = 0; i < node_vect_.size(); i++ ) + { + node_vect_[ i ]->Update( it_, neural_time_ ); } gpuErrchk( cudaPeekAtLastError() ); - - neuron_Update_time_ += (getRealTime() - time_mark); - multimeter_->WriteRecords(neural_time_); - if (n_hosts_>1) { + neuron_Update_time_ += ( getRealTime() - time_mark ); + multimeter_->WriteRecords( neural_time_ ); + + if ( n_hosts_ > 1 ) + { int n_ext_spikes; time_mark = getRealTime(); - gpuErrchk(cudaMemcpy(&n_ext_spikes, d_ExternalSpikeNum, sizeof(int), - cudaMemcpyDeviceToHost)); - copy_ext_spike_time_ += (getRealTime() - time_mark); + gpuErrchk( cudaMemcpy( &n_ext_spikes, d_ExternalSpikeNum, sizeof( int ), cudaMemcpyDeviceToHost ) ); + copy_ext_spike_time_ += ( getRealTime() - time_mark ); - if (n_ext_spikes != 0) { + if ( n_ext_spikes != 0 ) + { time_mark = getRealTime(); - organizeExternalSpikes(n_ext_spikes); - organizeExternalSpike_time_ += (getRealTime() - time_mark); + organizeExternalSpikes( n_ext_spikes ); + organizeExternalSpike_time_ += ( getRealTime() - time_mark ); } time_mark = getRealTime(); - SendSpikeToRemote(n_ext_spikes); - - SendSpikeToRemote_time_ += (getRealTime() - time_mark); + SendSpikeToRemote( n_ext_spikes ); + + SendSpikeToRemote_time_ += ( getRealTime() - time_mark ); time_mark = getRealTime(); RecvSpikeFromRemote(); - RecvSpikeFromRemote_time_ += (getRealTime() - time_mark); + RecvSpikeFromRemote_time_ += ( getRealTime() - time_mark ); CopySpikeFromRemote(); } - + int n_spikes; - time_mark 
= getRealTime(); + // Call will get delayed until ClearGetSpikesArrays() // afterwards the value of n_spikes will be available - gpuErrchk(cudaMemcpyAsync(&n_spikes, d_SpikeNum, sizeof(int), - cudaMemcpyDeviceToHost)); + gpuErrchk( cudaMemcpyAsync( &n_spikes, d_SpikeNum, sizeof( int ), cudaMemcpyDeviceToHost ) ); ClearGetSpikeArrays(); gpuErrchk( cudaDeviceSynchronize() ); - if (n_spikes > 0) { + if ( n_spikes > 0 ) + { time_mark = getRealTime(); - NestedLoop::Run<0>(nested_loop_algo_, n_spikes, d_SpikeTargetNum); - NestedLoop_time_ += (getRealTime() - time_mark); + switch ( conn_struct_type_ ) + { + case i_conn12b: + NestedLoop::Run< 0 >( nested_loop_algo_, n_spikes, d_SpikeTargetNum ); + break; + case i_conn16b: + NestedLoop::Run< 2 >( nested_loop_algo_, n_spikes, d_SpikeTargetNum ); + break; + default: + throw ngpu_exception( "Unrecognized connection structure type index" ); + } + NestedLoop_time_ += ( getRealTime() - time_mark ); } time_mark = getRealTime(); - for (unsigned int i=0; ihas_dir_conn_) { - node_vect_[i]->SendDirectSpikes(time_idx); + for ( unsigned int i = 0; i < node_vect_.size(); i++ ) + { + if ( node_vect_[ i ]->has_dir_conn_ ) + { + node_vect_[ i ]->SendDirectSpikes( time_idx ); } } - poisson_generator_time_ += (getRealTime() - time_mark); + poisson_generator_time_ += ( getRealTime() - time_mark ); time_mark = getRealTime(); - for (unsigned int i=0; in_port_>0) { - - int grid_dim_x = (node_vect_[i]->n_node_+1023)/1024; - int grid_dim_y = node_vect_[i]->n_port_; - dim3 grid_dim(grid_dim_x, grid_dim_y); - //dim3 block_dim(1024,1); - - GetSpikes<<>> //block_dim>>> - (node_vect_[i]->get_spike_array_, node_vect_[i]->n_node_, - node_vect_[i]->n_port_, - node_vect_[i]->n_var_, - node_vect_[i]->port_weight_arr_, - node_vect_[i]->port_weight_arr_step_, - node_vect_[i]->port_weight_port_step_, - node_vect_[i]->port_input_arr_, - node_vect_[i]->port_input_arr_step_, - node_vect_[i]->port_input_port_step_); + for ( unsigned int i = 0; i < 
node_vect_.size(); i++ ) + { + if ( node_vect_[ i ]->n_port_ > 0 ) + { + + int grid_dim_x = ( node_vect_[ i ]->n_node_ + 1023 ) / 1024; + int grid_dim_y = node_vect_[ i ]->n_port_; + dim3 grid_dim( grid_dim_x, grid_dim_y ); + // dim3 block_dim(1024,1); + + GetSpikes<<< grid_dim, 1024 >>> // block_dim>>> + ( node_vect_[ i ]->get_spike_array_, + node_vect_[ i ]->n_node_, + node_vect_[ i ]->n_port_, + node_vect_[ i ]->n_var_, + node_vect_[ i ]->port_weight_arr_, + node_vect_[ i ]->port_weight_arr_step_, + node_vect_[ i ]->port_weight_port_step_, + node_vect_[ i ]->port_input_arr_, + node_vect_[ i ]->port_input_arr_step_, + node_vect_[ i ]->port_input_port_step_ ); } } gpuErrchk( cudaPeekAtLastError() ); - GetSpike_time_ += (getRealTime() - time_mark); + GetSpike_time_ += ( getRealTime() - time_mark ); time_mark = getRealTime(); - SpikeReset<<<1, 1>>>(); + SpikeReset<<< 1, 1 >>>(); gpuErrchk( cudaPeekAtLastError() ); - SpikeReset_time_ += (getRealTime() - time_mark); + SpikeReset_time_ += ( getRealTime() - time_mark ); - if (n_hosts_>1) { + if ( n_hosts_ > 1 ) + { time_mark = getRealTime(); ExternalSpikeReset(); - ExternalSpikeReset_time_ += (getRealTime() - time_mark); + ExternalSpikeReset_time_ += ( getRealTime() - time_mark ); } - if (h_NRevConn > 0) { - //time_mark = getRealTime(); - RevSpikeReset<<<1, 1>>>(); + if ( conn_->getNRevConn() > 0 ) + { + // time_mark = getRealTime(); + revSpikeReset<<< 1, 1 >>>(); gpuErrchk( cudaPeekAtLastError() ); - RevSpikeBufferUpdate<<<(GetNLocalNodes()+1023)/1024, 1024>>> - (GetNLocalNodes()); + revSpikeBufferUpdate<<< ( GetNLocalNodes() + 1023 ) / 1024, 1024 >>>( GetNLocalNodes() ); gpuErrchk( cudaPeekAtLastError() ); unsigned int n_rev_spikes; - gpuErrchk(cudaMemcpy(&n_rev_spikes, d_RevSpikeNum, sizeof(unsigned int), - cudaMemcpyDeviceToHost)); - if (n_rev_spikes > 0) { - NestedLoop::Run<1>(nested_loop_algo_, n_rev_spikes, d_RevSpikeNConn); - } - //RevSpikeBufferUpdate_time_ += (getRealTime() - time_mark); + gpuErrchk( + 
cudaMemcpy( &n_rev_spikes, conn_->getDevRevSpikeNumPt(), sizeof( unsigned int ), cudaMemcpyDeviceToHost ) ); + if ( n_rev_spikes > 0 ) + { + switch ( conn_struct_type_ ) + { + case i_conn12b: + NestedLoop::Run< 1 >( nested_loop_algo_, n_rev_spikes, conn_->getDevRevSpikeNConnPt() ); + break; + case i_conn16b: + NestedLoop::Run< 3 >( nested_loop_algo_, n_rev_spikes, conn_->getDevRevSpikeNConnPt() ); + break; + default: + throw ngpu_exception( "Unrecognized connection structure type index" ); + } + } + // RevSpikeBufferUpdate_time_ += (getRealTime() - time_mark); } - for (unsigned int i=0; imax_n_rec_spike_times_>0) { + if ( node_vect_[ i ]->max_n_rec_spike_times_ > 0 ) + { // and if buffering is activated every n_step time steps... - int n_step = node_vect_[i]->rec_spike_times_step_; - if (n_step>0 && (time_idx%n_step == n_step-1)) { - // extract recorded spike times and put them in buffers - node_vect_[i]->BufferRecSpikeTimes(); + int n_step = node_vect_[ i ]->rec_spike_times_step_; + if ( n_step > 0 && ( time_idx % n_step == n_step - 1 ) ) + { + // extract recorded spike times and put them in buffers + node_vect_[ i ]->BufferRecSpikeTimes(); } } } it_++; - + return 0; } -int NESTGPU::CreateRecord(std::string file_name, std::string *var_name_arr, - int *i_node_arr, int *port_arr, - int n_node) +int +NESTGPU::CreateRecord( std::string file_name, std::string* var_name_arr, int* i_node_arr, int* port_arr, int n_node ) { - std::vector neur_vect; - std::vector i_neur_vect; - std::vector port_vect; - std::vector var_name_vect; - for (int i=0; ii_node_0_); - port_vect.push_back(port_arr[i]); - neur_vect.push_back(node_vect_[i_group]); + std::vector< BaseNeuron* > neur_vect; + std::vector< int > i_neur_vect; + std::vector< int > port_vect; + std::vector< std::string > var_name_vect; + for ( int i = 0; i < n_node; i++ ) + { + var_name_vect.push_back( var_name_arr[ i ] ); + int i_group = node_group_map_[ i_node_arr[ i ] ]; + i_neur_vect.push_back( i_node_arr[ i ] - 
node_vect_[ i_group ]->i_node_0_ ); + port_vect.push_back( port_arr[ i ] ); + neur_vect.push_back( node_vect_[ i_group ] ); } - return multimeter_->CreateRecord(neur_vect, file_name, var_name_vect, - i_neur_vect, port_vect); - + return multimeter_->CreateRecord( neur_vect, file_name, var_name_vect, i_neur_vect, port_vect ); } -int NESTGPU::CreateRecord(std::string file_name, std::string *var_name_arr, - int *i_node_arr, int n_node) +int +NESTGPU::CreateRecord( std::string file_name, std::string* var_name_arr, int* i_node_arr, int n_node ) { - std::vector port_vect(n_node, 0); - return CreateRecord(file_name, var_name_arr, i_node_arr, - port_vect.data(), n_node); + std::vector< int > port_vect( n_node, 0 ); + return CreateRecord( file_name, var_name_arr, i_node_arr, port_vect.data(), n_node ); } -std::vector > *NESTGPU::GetRecordData(int i_record) +std::vector< std::vector< float > >* +NESTGPU::GetRecordData( int i_record ) { - return multimeter_->GetRecordData(i_record); + return multimeter_->GetRecordData( i_record ); } -int NESTGPU::GetNodeSequenceOffset(int i_node, int n_node, int &i_group) +int +NESTGPU::GetNodeSequenceOffset( int i_node, int n_node, int& i_group ) { - if (i_node<0 || (i_node+n_node > (int)node_group_map_.size())) { - throw ngpu_exception("Unrecognized node in getting node sequence offset"); + if ( i_node < 0 || ( i_node + n_node > ( int ) node_group_map_.size() ) ) + { + throw ngpu_exception( "Unrecognized node in getting node sequence offset" ); } - i_group = node_group_map_[i_node]; - if (node_group_map_[i_node+n_node-1] != i_group) { - throw ngpu_exception("Nodes belong to different node groups " - "in setting parameter"); + i_group = node_group_map_[ i_node ]; + if ( node_group_map_[ i_node + n_node - 1 ] != i_group ) + { + throw ngpu_exception( + "Nodes belong to different node groups " + "in setting parameter" ); } - return node_vect_[i_group]->i_node_0_; + return node_vect_[ i_group ]->i_node_0_; } - -std::vector 
NESTGPU::GetNodeArrayWithOffset(int *i_node, int n_node, - int &i_group) + +std::vector< int > +NESTGPU::GetNodeArrayWithOffset( int* i_node, int n_node, int& i_group ) { - int in0 = i_node[0]; - if (in0<0 || in0>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in setting parameter"); + int in0 = i_node[ 0 ]; + if ( in0 < 0 || in0 > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( "Unrecognized node in setting parameter" ); } - i_group = node_group_map_[in0]; - int i0 = node_vect_[i_group]->i_node_0_; - std::vector nodes; - nodes.assign(i_node, i_node+n_node); - for(int i=0; i=(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in setting parameter"); + i_group = node_group_map_[ in0 ]; + int i0 = node_vect_[ i_group ]->i_node_0_; + std::vector< int > nodes; + nodes.assign( i_node, i_node + n_node ); + for ( int i = 0; i < n_node; i++ ) + { + int in = nodes[ i ]; + if ( in < 0 || in >= ( int ) node_group_map_.size() ) + { + throw ngpu_exception( "Unrecognized node in setting parameter" ); } - if (node_group_map_[in] != i_group) { - throw ngpu_exception("Nodes belong to different node groups " - "in setting parameter"); + if ( node_group_map_[ in ] != i_group ) + { + throw ngpu_exception( + "Nodes belong to different node groups " + "in setting parameter" ); } - nodes[i] -= i0; + nodes[ i ] -= i0; } return nodes; } -int NESTGPU::SetNeuronParam(int i_node, int n_node, - std::string param_name, float val) +int +NESTGPU::SetNeuronParam( int i_node, int n_node, std::string param_name, float val ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - - return node_vect_[i_group]->SetScalParam(i_neuron, n_node, param_name, val); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + + return node_vect_[ i_group ]->SetScalParam( i_neuron, n_node, param_name, val ); } -int NESTGPU::SetNeuronParam(int *i_node, int n_node, - std::string param_name, float val) 
+int +NESTGPU::SetNeuronParam( int* i_node, int n_node, std::string param_name, float val ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - return node_vect_[i_group]->SetScalParam(nodes.data(), n_node, - param_name, val); + std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + return node_vect_[ i_group ]->SetScalParam( nodes.data(), n_node, param_name, val ); } -int NESTGPU::SetNeuronParam(int i_node, int n_node, std::string param_name, - float *param, int array_size) +int +NESTGPU::SetNeuronParam( int i_node, int n_node, std::string param_name, float* param, int array_size ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - if (node_vect_[i_group]->IsPortParam(param_name)) { - return node_vect_[i_group]->SetPortParam(i_neuron, n_node, param_name, - param, array_size); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + if ( node_vect_[ i_group ]->IsPortParam( param_name ) ) + { + return node_vect_[ i_group ]->SetPortParam( i_neuron, n_node, param_name, param, array_size ); } - else { - return node_vect_[i_group]->SetArrayParam(i_neuron, n_node, param_name, - param, array_size); + else + { + return node_vect_[ i_group ]->SetArrayParam( i_neuron, n_node, param_name, param, array_size ); } } -int NESTGPU::SetNeuronParam( int *i_node, int n_node, - std::string param_name, float *param, - int array_size) +int +NESTGPU::SetNeuronParam( int* i_node, int n_node, std::string param_name, float* param, int array_size ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - if (node_vect_[i_group]->IsPortParam(param_name)) { - return node_vect_[i_group]->SetPortParam(nodes.data(), n_node, - param_name, param, array_size); + std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + if ( node_vect_[ i_group ]->IsPortParam( param_name ) ) + { + return node_vect_[ i_group ]->SetPortParam( 
nodes.data(), n_node, param_name, param, array_size ); + } + else + { + return node_vect_[ i_group ]->SetArrayParam( nodes.data(), n_node, param_name, param, array_size ); } - else { - return node_vect_[i_group]->SetArrayParam(nodes.data(), n_node, - param_name, param, array_size); - } } //////////////////////////////////////////////////////////////////////// -int NESTGPU::SetNeuronScalParamDistr(int i_node, int n_node, - std::string param_name) +int +NESTGPU::SetNeuronScalParamDistr( int i_node, int n_node, std::string param_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - - return node_vect_[i_group]->SetScalParamDistr(i_neuron, n_node, param_name, - distribution_); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + + return node_vect_[ i_group ]->SetScalParamDistr( i_neuron, n_node, param_name, distribution_ ); } -int NESTGPU::SetNeuronScalVarDistr(int i_node, int n_node, - std::string var_name) +int +NESTGPU::SetNeuronScalVarDistr( int i_node, int n_node, std::string var_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - - return node_vect_[i_group]->SetScalVarDistr(i_neuron, n_node, var_name, - distribution_); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + + return node_vect_[ i_group ]->SetScalVarDistr( i_neuron, n_node, var_name, distribution_ ); } -int NESTGPU::SetNeuronPortParamDistr(int i_node, int n_node, - std::string param_name) +int +NESTGPU::SetNeuronPortParamDistr( int i_node, int n_node, std::string param_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - - return node_vect_[i_group]->SetPortParamDistr(i_neuron, n_node, param_name, - distribution_); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + + return node_vect_[ i_group ]->SetPortParamDistr( i_neuron, n_node, param_name, distribution_ ); } -int 
NESTGPU::SetNeuronPortVarDistr(int i_node, int n_node, - std::string var_name) +int +NESTGPU::SetNeuronPortVarDistr( int i_node, int n_node, std::string var_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - - return node_vect_[i_group]->SetPortVarDistr(i_neuron, n_node, var_name, - distribution_); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + + return node_vect_[ i_group ]->SetPortVarDistr( i_neuron, n_node, var_name, distribution_ ); } -int NESTGPU::SetNeuronPtScalParamDistr(int *i_node, int n_node, - std::string param_name) +int +NESTGPU::SetNeuronPtScalParamDistr( int* i_node, int n_node, std::string param_name ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - return node_vect_[i_group]->SetScalParamDistr(nodes.data(), n_node, - param_name, distribution_); + std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + return node_vect_[ i_group ]->SetScalParamDistr( nodes.data(), n_node, param_name, distribution_ ); } -int NESTGPU::SetNeuronPtScalVarDistr(int *i_node, int n_node, - std::string var_name) +int +NESTGPU::SetNeuronPtScalVarDistr( int* i_node, int n_node, std::string var_name ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - return node_vect_[i_group]->SetScalVarDistr(nodes.data(), n_node, - var_name, distribution_); + std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + return node_vect_[ i_group ]->SetScalVarDistr( nodes.data(), n_node, var_name, distribution_ ); } -int NESTGPU::SetNeuronPtPortParamDistr(int *i_node, int n_node, - std::string param_name) +int +NESTGPU::SetNeuronPtPortParamDistr( int* i_node, int n_node, std::string param_name ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - return node_vect_[i_group]->SetPortParamDistr(nodes.data(), n_node, - param_name, distribution_); + 
std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + return node_vect_[ i_group ]->SetPortParamDistr( nodes.data(), n_node, param_name, distribution_ ); } -int NESTGPU::SetNeuronPtPortVarDistr(int *i_node, int n_node, - std::string var_name) +int +NESTGPU::SetNeuronPtPortVarDistr( int* i_node, int n_node, std::string var_name ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - return node_vect_[i_group]->SetPortVarDistr(nodes.data(), n_node, - var_name, distribution_); + std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + return node_vect_[ i_group ]->SetPortVarDistr( nodes.data(), n_node, var_name, distribution_ ); } -int NESTGPU::SetDistributionIntParam(std::string param_name, int val) +int +NESTGPU::SetDistributionIntParam( std::string param_name, int val ) { - return distribution_->SetIntParam(param_name, val); + return distribution_->SetIntParam( param_name, val ); } -int NESTGPU::SetDistributionScalParam(std::string param_name, float val) +int +NESTGPU::SetDistributionScalParam( std::string param_name, float val ) { - return distribution_->SetScalParam(param_name, val); + return distribution_->SetScalParam( param_name, val ); } -int NESTGPU::SetDistributionVectParam(std::string param_name, float val, int i) +int +NESTGPU::SetDistributionVectParam( std::string param_name, float val, int i ) { - return distribution_->SetVectParam(param_name, val, i); + return distribution_->SetVectParam( param_name, val, i ); } -int NESTGPU::SetDistributionFloatPtParam(std::string param_name, - float *array_pt) +int +NESTGPU::SetDistributionFloatPtParam( std::string param_name, float* array_pt ) { - return distribution_->SetFloatPtParam(param_name, array_pt); + return distribution_->SetFloatPtParam( param_name, array_pt ); } -int NESTGPU::IsDistributionFloatParam(std::string param_name) +int +NESTGPU::IsDistributionFloatParam( std::string param_name ) { - return 
distribution_->IsFloatParam(param_name); + return distribution_->IsFloatParam( param_name ); } //////////////////////////////////////////////////////////////////////// -int NESTGPU::IsNeuronScalParam(int i_node, std::string param_name) +int +NESTGPU::IsNeuronScalParam( int i_node, std::string param_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); - - return node_vect_[i_group]->IsScalParam(param_name); + GetNodeSequenceOffset( i_node, 1, i_group ); + + return node_vect_[ i_group ]->IsScalParam( param_name ); } -int NESTGPU::IsNeuronPortParam(int i_node, std::string param_name) +int +NESTGPU::IsNeuronPortParam( int i_node, std::string param_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); - - return node_vect_[i_group]->IsPortParam(param_name); + GetNodeSequenceOffset( i_node, 1, i_group ); + + return node_vect_[ i_group ]->IsPortParam( param_name ); } -int NESTGPU::IsNeuronArrayParam(int i_node, std::string param_name) +int +NESTGPU::IsNeuronArrayParam( int i_node, std::string param_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); - - return node_vect_[i_group]->IsArrayParam(param_name); + GetNodeSequenceOffset( i_node, 1, i_group ); + + return node_vect_[ i_group ]->IsArrayParam( param_name ); } -int NESTGPU::SetNeuronIntVar(int i_node, int n_node, - std::string var_name, int val) +int +NESTGPU::SetNeuronIntVar( int i_node, int n_node, std::string var_name, int val ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - - return node_vect_[i_group]->SetIntVar(i_neuron, n_node, var_name, val); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + + return node_vect_[ i_group ]->SetIntVar( i_neuron, n_node, var_name, val ); } -int NESTGPU::SetNeuronIntVar(int *i_node, int n_node, - std::string var_name, int val) +int +NESTGPU::SetNeuronIntVar( int* i_node, int n_node, std::string 
var_name, int val ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - return node_vect_[i_group]->SetIntVar(nodes.data(), n_node, - var_name, val); + std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + return node_vect_[ i_group ]->SetIntVar( nodes.data(), n_node, var_name, val ); } -int NESTGPU::SetNeuronVar(int i_node, int n_node, - std::string var_name, float val) +int +NESTGPU::SetNeuronVar( int i_node, int n_node, std::string var_name, float val ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - - return node_vect_[i_group]->SetScalVar(i_neuron, n_node, var_name, val); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + + return node_vect_[ i_group ]->SetScalVar( i_neuron, n_node, var_name, val ); } -int NESTGPU::SetNeuronVar(int *i_node, int n_node, - std::string var_name, float val) +int +NESTGPU::SetNeuronVar( int* i_node, int n_node, std::string var_name, float val ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - return node_vect_[i_group]->SetScalVar(nodes.data(), n_node, - var_name, val); + std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + return node_vect_[ i_group ]->SetScalVar( nodes.data(), n_node, var_name, val ); } -int NESTGPU::SetNeuronVar(int i_node, int n_node, std::string var_name, - float *var, int array_size) +int +NESTGPU::SetNeuronVar( int i_node, int n_node, std::string var_name, float* var, int array_size ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - if (node_vect_[i_group]->IsPortVar(var_name)) { - return node_vect_[i_group]->SetPortVar(i_neuron, n_node, var_name, - var, array_size); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + if ( node_vect_[ i_group ]->IsPortVar( var_name ) ) + { + return node_vect_[ i_group ]->SetPortVar( i_neuron, n_node, var_name, 
var, array_size ); } - else { - return node_vect_[i_group]->SetArrayVar(i_neuron, n_node, var_name, - var, array_size); + else + { + return node_vect_[ i_group ]->SetArrayVar( i_neuron, n_node, var_name, var, array_size ); } } -int NESTGPU::SetNeuronVar( int *i_node, int n_node, - std::string var_name, float *var, - int array_size) +int +NESTGPU::SetNeuronVar( int* i_node, int n_node, std::string var_name, float* var, int array_size ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - if (node_vect_[i_group]->IsPortVar(var_name)) { - return node_vect_[i_group]->SetPortVar(nodes.data(), n_node, - var_name, var, array_size); + std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + if ( node_vect_[ i_group ]->IsPortVar( var_name ) ) + { + return node_vect_[ i_group ]->SetPortVar( nodes.data(), n_node, var_name, var, array_size ); + } + else + { + return node_vect_[ i_group ]->SetArrayVar( nodes.data(), n_node, var_name, var, array_size ); } - else { - return node_vect_[i_group]->SetArrayVar(nodes.data(), n_node, - var_name, var, array_size); - } } -int NESTGPU::IsNeuronIntVar(int i_node, std::string var_name) +int +NESTGPU::IsNeuronIntVar( int i_node, std::string var_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); - - return node_vect_[i_group]->IsIntVar(var_name); + GetNodeSequenceOffset( i_node, 1, i_group ); + + return node_vect_[ i_group ]->IsIntVar( var_name ); } -int NESTGPU::IsNeuronScalVar(int i_node, std::string var_name) +int +NESTGPU::IsNeuronScalVar( int i_node, std::string var_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); - - return node_vect_[i_group]->IsScalVar(var_name); + GetNodeSequenceOffset( i_node, 1, i_group ); + + return node_vect_[ i_group ]->IsScalVar( var_name ); } -int NESTGPU::IsNeuronPortVar(int i_node, std::string var_name) +int +NESTGPU::IsNeuronPortVar( int i_node, std::string var_name ) 
{ int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); - - return node_vect_[i_group]->IsPortVar(var_name); + GetNodeSequenceOffset( i_node, 1, i_group ); + + return node_vect_[ i_group ]->IsPortVar( var_name ); } -int NESTGPU::IsNeuronArrayVar(int i_node, std::string var_name) +int +NESTGPU::IsNeuronArrayVar( int i_node, std::string var_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); - - return node_vect_[i_group]->IsArrayVar(var_name); -} + GetNodeSequenceOffset( i_node, 1, i_group ); + return node_vect_[ i_group ]->IsArrayVar( var_name ); +} -int NESTGPU::GetNeuronParamSize(int i_node, std::string param_name) +int +NESTGPU::GetNeuronParamSize( int i_node, std::string param_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); - if (node_vect_[i_group]->IsArrayParam(param_name)!=0) { - return node_vect_[i_group]->GetArrayParamSize(i_neuron, param_name); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, 1, i_group ); + if ( node_vect_[ i_group ]->IsArrayParam( param_name ) != 0 ) + { + return node_vect_[ i_group ]->GetArrayParamSize( i_neuron, param_name ); } - else { - return node_vect_[i_group]->GetParamSize(param_name); + else + { + return node_vect_[ i_group ]->GetParamSize( param_name ); } } -int NESTGPU::GetNeuronVarSize(int i_node, std::string var_name) +int +NESTGPU::GetNeuronVarSize( int i_node, std::string var_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); - if (node_vect_[i_group]->IsArrayVar(var_name)!=0) { - return node_vect_[i_group]->GetArrayVarSize(i_neuron, var_name); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, 1, i_group ); + if ( node_vect_[ i_group ]->IsArrayVar( var_name ) != 0 ) + { + return node_vect_[ i_group ]->GetArrayVarSize( i_neuron, var_name ); } - else { - return node_vect_[i_group]->GetVarSize(var_name); + else + { + return node_vect_[ i_group ]->GetVarSize( 
var_name ); } } - -float *NESTGPU::GetNeuronParam(int i_node, int n_node, - std::string param_name) +float* +NESTGPU::GetNeuronParam( int i_node, int n_node, std::string param_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - if (node_vect_[i_group]->IsScalParam(param_name)) { - return node_vect_[i_group]->GetScalParam(i_neuron, n_node, param_name); - } - else if (node_vect_[i_group]->IsPortParam(param_name)) { - return node_vect_[i_group]->GetPortParam(i_neuron, n_node, param_name); - } - else if (node_vect_[i_group]->IsArrayParam(param_name)) { - if (n_node != 1) { - throw ngpu_exception("Cannot get array parameters for more than one node" - "at a time"); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + if ( node_vect_[ i_group ]->IsScalParam( param_name ) ) + { + return node_vect_[ i_group ]->GetScalParam( i_neuron, n_node, param_name ); + } + else if ( node_vect_[ i_group ]->IsPortParam( param_name ) ) + { + return node_vect_[ i_group ]->GetPortParam( i_neuron, n_node, param_name ); + } + else if ( node_vect_[ i_group ]->IsArrayParam( param_name ) ) + { + if ( n_node != 1 ) + { + throw ngpu_exception( + "Cannot get array parameters for more than one node" + "at a time" ); } - return node_vect_[i_group]->GetArrayParam(i_neuron, param_name); + return node_vect_[ i_group ]->GetArrayParam( i_neuron, param_name ); } - else { - throw ngpu_exception(std::string("Unrecognized parameter ") - + param_name); + else + { + throw ngpu_exception( std::string( "Unrecognized parameter " ) + param_name ); } } -float *NESTGPU::GetNeuronParam( int *i_node, int n_node, - std::string param_name) +float* +NESTGPU::GetNeuronParam( int* i_node, int n_node, std::string param_name ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - if (node_vect_[i_group]->IsScalParam(param_name)) { - return node_vect_[i_group]->GetScalParam(nodes.data(), n_node, - param_name); - } - else 
if (node_vect_[i_group]->IsPortParam(param_name)) { - return node_vect_[i_group]->GetPortParam(nodes.data(), n_node, - param_name); - } - else if (node_vect_[i_group]->IsArrayParam(param_name)) { - if (n_node != 1) { - throw ngpu_exception("Cannot get array parameters for more than one node" - "at a time"); + std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + if ( node_vect_[ i_group ]->IsScalParam( param_name ) ) + { + return node_vect_[ i_group ]->GetScalParam( nodes.data(), n_node, param_name ); + } + else if ( node_vect_[ i_group ]->IsPortParam( param_name ) ) + { + return node_vect_[ i_group ]->GetPortParam( nodes.data(), n_node, param_name ); + } + else if ( node_vect_[ i_group ]->IsArrayParam( param_name ) ) + { + if ( n_node != 1 ) + { + throw ngpu_exception( + "Cannot get array parameters for more than one node" + "at a time" ); } - return node_vect_[i_group]->GetArrayParam(nodes[0], param_name); + return node_vect_[ i_group ]->GetArrayParam( nodes[ 0 ], param_name ); } - else { - throw ngpu_exception(std::string("Unrecognized parameter ") - + param_name); + else + { + throw ngpu_exception( std::string( "Unrecognized parameter " ) + param_name ); } } -float *NESTGPU::GetArrayParam(int i_node, std::string param_name) +float* +NESTGPU::GetArrayParam( int i_node, std::string param_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, 1, i_group ); - return node_vect_[i_group]->GetArrayParam(i_neuron, param_name); + return node_vect_[ i_group ]->GetArrayParam( i_neuron, param_name ); } -int *NESTGPU::GetNeuronIntVar(int i_node, int n_node, - std::string var_name) +int* +NESTGPU::GetNeuronIntVar( int i_node, int n_node, std::string var_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - if (node_vect_[i_group]->IsIntVar(var_name)) { - return node_vect_[i_group]->GetIntVar(i_neuron, n_node, 
var_name); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + if ( node_vect_[ i_group ]->IsIntVar( var_name ) ) + { + return node_vect_[ i_group ]->GetIntVar( i_neuron, n_node, var_name ); } - else { - throw ngpu_exception(std::string("Unrecognized integer variable ") - + var_name); + else + { + throw ngpu_exception( std::string( "Unrecognized integer variable " ) + var_name ); } } -int *NESTGPU::GetNeuronIntVar(int *i_node, int n_node, - std::string var_name) +int* +NESTGPU::GetNeuronIntVar( int* i_node, int n_node, std::string var_name ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - if (node_vect_[i_group]->IsIntVar(var_name)) { - return node_vect_[i_group]->GetIntVar(nodes.data(), n_node, - var_name); + std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + if ( node_vect_[ i_group ]->IsIntVar( var_name ) ) + { + return node_vect_[ i_group ]->GetIntVar( nodes.data(), n_node, var_name ); } - else { - throw ngpu_exception(std::string("Unrecognized variable ") - + var_name); + else + { + throw ngpu_exception( std::string( "Unrecognized variable " ) + var_name ); } } -float *NESTGPU::GetNeuronVar(int i_node, int n_node, - std::string var_name) +float* +NESTGPU::GetNeuronVar( int i_node, int n_node, std::string var_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, n_node, i_group); - if (node_vect_[i_group]->IsScalVar(var_name)) { - return node_vect_[i_group]->GetScalVar(i_neuron, n_node, var_name); - } - else if (node_vect_[i_group]->IsPortVar(var_name)) { - return node_vect_[i_group]->GetPortVar(i_neuron, n_node, var_name); - } - else if (node_vect_[i_group]->IsArrayVar(var_name)) { - if (n_node != 1) { - throw ngpu_exception("Cannot get array variables for more than one node" - "at a time"); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, n_node, i_group ); + if ( node_vect_[ i_group ]->IsScalVar( var_name ) ) + { + return 
node_vect_[ i_group ]->GetScalVar( i_neuron, n_node, var_name ); + } + else if ( node_vect_[ i_group ]->IsPortVar( var_name ) ) + { + return node_vect_[ i_group ]->GetPortVar( i_neuron, n_node, var_name ); + } + else if ( node_vect_[ i_group ]->IsArrayVar( var_name ) ) + { + if ( n_node != 1 ) + { + throw ngpu_exception( + "Cannot get array variables for more than one node" + "at a time" ); } - return node_vect_[i_group]->GetArrayVar(i_neuron, var_name); + return node_vect_[ i_group ]->GetArrayVar( i_neuron, var_name ); } - else { - throw ngpu_exception(std::string("Unrecognized variable ") - + var_name); + else + { + throw ngpu_exception( std::string( "Unrecognized variable " ) + var_name ); } } -float *NESTGPU::GetNeuronVar(int *i_node, int n_node, - std::string var_name) +float* +NESTGPU::GetNeuronVar( int* i_node, int n_node, std::string var_name ) { int i_group; - std::vector nodes = GetNodeArrayWithOffset(i_node, n_node, - i_group); - if (node_vect_[i_group]->IsScalVar(var_name)) { - return node_vect_[i_group]->GetScalVar(nodes.data(), n_node, - var_name); - } - else if (node_vect_[i_group]->IsPortVar(var_name)) { - return node_vect_[i_group]->GetPortVar(nodes.data(), n_node, - var_name); - } - else if (node_vect_[i_group]->IsArrayVar(var_name)) { - if (n_node != 1) { - throw ngpu_exception("Cannot get array variables for more than one node" - "at a time"); + std::vector< int > nodes = GetNodeArrayWithOffset( i_node, n_node, i_group ); + if ( node_vect_[ i_group ]->IsScalVar( var_name ) ) + { + return node_vect_[ i_group ]->GetScalVar( nodes.data(), n_node, var_name ); + } + else if ( node_vect_[ i_group ]->IsPortVar( var_name ) ) + { + return node_vect_[ i_group ]->GetPortVar( nodes.data(), n_node, var_name ); + } + else if ( node_vect_[ i_group ]->IsArrayVar( var_name ) ) + { + if ( n_node != 1 ) + { + throw ngpu_exception( + "Cannot get array variables for more than one node" + "at a time" ); } - return node_vect_[i_group]->GetArrayVar(nodes[0], var_name); 
+ return node_vect_[ i_group ]->GetArrayVar( nodes[ 0 ], var_name ); } - else { - throw ngpu_exception(std::string("Unrecognized variable ") - + var_name); + else + { + throw ngpu_exception( std::string( "Unrecognized variable " ) + var_name ); } } -float *NESTGPU::GetArrayVar(int i_node, std::string var_name) +float* +NESTGPU::GetArrayVar( int i_node, std::string var_name ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, 1, i_group ); - return node_vect_[i_group]->GetArrayVar(i_neuron, var_name); + return node_vect_[ i_group ]->GetArrayVar( i_neuron, var_name ); } -std::string NESTGPU::HostIdStr() +std::string +NESTGPU::HostIdStr() { - if (n_hosts_ > 1) { - return std::string("Host ") + std::to_string(this_host_) - + " : "; + if ( n_hosts_ > 1 ) + { + return std::string( "Host " ) + std::to_string( this_host_ ) + " : "; } - else { + else + { return ""; } } -unsigned int *NESTGPU::RandomInt(size_t n) +size_t +NESTGPU::getCUDAMemHostUsed() +{ + return cuda_error_ns::mem_used_; +} + +size_t +NESTGPU::getCUDAMemHostPeak() +{ + return cuda_error_ns::mem_max_; +} + +size_t +NESTGPU::getCUDAMemTotal() +{ + size_t mem_free; + size_t mem_total; + cudaError_t cuda_status = cudaMemGetInfo( &mem_free, &mem_total ); + if ( cuda_status != cudaSuccess ) + { + throw ngpu_exception( std::string( "CUDA error in getCUDAMemTotal: " ) + cudaGetErrorString( cuda_status ) ); + } + + return mem_total; +} + +size_t +NESTGPU::getCUDAMemFree() { - return curand_int(*random_generator_, n); + size_t mem_free; + size_t mem_total; + cudaError_t cuda_status = cudaMemGetInfo( &mem_free, &mem_total ); + if ( cuda_status != cudaSuccess ) + { + throw ngpu_exception( std::string( "CUDA error in getCUDAMemFree: " ) + cudaGetErrorString( cuda_status ) ); + } + + return mem_free; } -float *NESTGPU::RandomUniform(size_t n) +unsigned int* +NESTGPU::RandomInt( size_t n ) { - return 
curand_uniform(*random_generator_, n); + return curand_int( *random_generator_, n ); } -float *NESTGPU::RandomNormal(size_t n, float mean, float stddev) +float* +NESTGPU::RandomUniform( size_t n ) { - return curand_normal(*random_generator_, n, mean, stddev); + return curand_uniform( *random_generator_, n ); } -float *NESTGPU::RandomNormalClipped(size_t n, float mean, float stddev, - float vmin, float vmax, float vstep) +float* +NESTGPU::RandomNormal( size_t n, float mean, float stddev ) +{ + return curand_normal( *random_generator_, n, mean, stddev ); +} + +float* +NESTGPU::RandomNormalClipped( size_t n, float mean, float stddev, float vmin, float vmax, float vstep ) { const float epsi = 1.0e-6; - - n = (n/4 + 1)*4; - int n_extra = n/10; - n_extra = (n_extra/4 + 1)*4; - if (n_extra<1024) { - n_extra=1024; + + n = ( n / 4 + 1 ) * 4; + int n_extra = n / 10; + n_extra = ( n_extra / 4 + 1 ) * 4; + if ( n_extra < 1024 ) + { + n_extra = 1024; } int i_extra = 0; - float *arr = curand_normal(*random_generator_, n, mean, stddev); - float *arr_extra = NULL; - for (size_t i=0; ivmax) { - if (i_extra==0) { - arr_extra = curand_normal(*random_generator_, n_extra, mean, stddev); + float* arr = curand_normal( *random_generator_, n, mean, stddev ); + float* arr_extra = nullptr; + for ( size_t i = 0; i < n; i++ ) + { + while ( arr[ i ] < vmin || arr[ i ] > vmax ) + { + if ( i_extra == 0 ) + { + arr_extra = curand_normal( *random_generator_, n_extra, mean, stddev ); } - arr[i] = arr_extra[i_extra]; + arr[ i ] = arr_extra[ i_extra ]; i_extra++; - if (i_extra==n_extra) { - i_extra = 0; - delete[](arr_extra); - arr_extra = NULL; + if ( i_extra == n_extra ) + { + i_extra = 0; + delete[] ( arr_extra ); + arr_extra = nullptr; } } } - if (arr_extra != NULL) { - delete[](arr_extra); + if ( arr_extra != nullptr ) + { + delete[] ( arr_extra ); } - if (vstep>stddev*epsi) { - for (size_t i=0; i stddev * epsi ) + { + for ( size_t i = 0; i < n; i++ ) + { + arr[ i ] = vmin + vstep * round( ( arr[ 
i ] - vmin ) / vstep ); } } - return arr; + return arr; } -std::vector NESTGPU::GetIntVarNames(int i_node) +std::vector< std::string > +NESTGPU::GetIntVarNames( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading variable names"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( "Unrecognized node in reading variable names" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetIntVarNames(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetIntVarNames(); } -std::vector NESTGPU::GetScalVarNames(int i_node) +std::vector< std::string > +NESTGPU::GetScalVarNames( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading variable names"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( "Unrecognized node in reading variable names" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetScalVarNames(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetScalVarNames(); } -int NESTGPU::GetNIntVar(int i_node) +int +NESTGPU::GetNIntVar( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading number of " - "variables"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( + "Unrecognized node in reading number of " + "variables" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetNIntVar(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetNIntVar(); } -int NESTGPU::GetNScalVar(int i_node) +int +NESTGPU::GetNScalVar( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading number of " - "variables"); + if ( i_node < 0 
|| i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( + "Unrecognized node in reading number of " + "variables" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetNScalVar(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetNScalVar(); } -std::vector NESTGPU::GetPortVarNames(int i_node) +std::vector< std::string > +NESTGPU::GetPortVarNames( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading variable names"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( "Unrecognized node in reading variable names" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetPortVarNames(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetPortVarNames(); } -int NESTGPU::GetNPortVar(int i_node) +int +NESTGPU::GetNPortVar( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading number of " - "variables"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( + "Unrecognized node in reading number of " + "variables" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetNPortVar(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetNPortVar(); } +std::vector< std::string > +NESTGPU::GetScalParamNames( int i_node ) +{ + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( "Unrecognized node in reading parameter names" ); + } + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetScalParamNames(); +} -std::vector NESTGPU::GetScalParamNames(int i_node) +int +NESTGPU::GetNScalParam( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading parameter names"); + if 
( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( + "Unrecognized node in reading number of " + "parameters" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetScalParamNames(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetNScalParam(); } -int NESTGPU::GetNScalParam(int i_node) +std::vector< std::string > +NESTGPU::GetPortParamNames( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading number of " - "parameters"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( "Unrecognized node in reading parameter names" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetNScalParam(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetPortParamNames(); } -std::vector NESTGPU::GetPortParamNames(int i_node) +int +NESTGPU::GetNPortParam( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading parameter names"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( + "Unrecognized node in reading number of " + "parameters" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetPortParamNames(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetNPortParam(); } -int NESTGPU::GetNPortParam(int i_node) +std::vector< std::string > +NESTGPU::GetArrayParamNames( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading number of " - "parameters"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( "Unrecognized node in reading array parameter names" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetNPortParam(); + int i_group = 
node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetArrayParamNames(); } +int +NESTGPU::GetNArrayParam( int i_node ) +{ + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( + "Unrecognized node in reading number of array " + "parameters" ); + } + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetNArrayParam(); +} -std::vector NESTGPU::GetArrayParamNames(int i_node) +std::vector< std::string > +NESTGPU::GetArrayVarNames( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading array parameter names"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( "Unrecognized node in reading array variable names" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetArrayParamNames(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetArrayVarNames(); } -int NESTGPU::GetNArrayParam(int i_node) +int +NESTGPU::GetNArrayVar( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading number of array " - "parameters"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( + "Unrecognized node in reading number of array " + "variables" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetNArrayParam(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetNArrayVar(); } +int +NESTGPU::GetConnectionFloatParamIndex( std::string param_name ) +{ + return conn_->getConnectionFloatParamIndex( param_name ); +} -std::vector NESTGPU::GetArrayVarNames(int i_node) +int +NESTGPU::GetConnectionIntParamIndex( std::string param_name ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading array variable names"); - } - int i_group = 
node_group_map_[i_node]; - - return node_vect_[i_group]->GetArrayVarNames(); + return conn_->getConnectionIntParamIndex( param_name ); } -int NESTGPU::GetNArrayVar(int i_node) +int +NESTGPU::IsConnectionFloatParam( std::string param_name ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading number of array " - "variables"); - } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetNArrayVar(); + return conn_->isConnectionFloatParam( param_name ); +} + +int +NESTGPU::IsConnectionIntParam( std::string param_name ) +{ + return conn_->isConnectionIntParam( param_name ); +} + +int +NESTGPU::GetConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float* h_param_arr, std::string param_name ) +{ + return conn_->getConnectionFloatParam( conn_ids, n_conn, h_param_arr, param_name ); +} + +int +NESTGPU::GetConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int* h_param_arr, std::string param_name ) +{ + return conn_->getConnectionIntParam( conn_ids, n_conn, h_param_arr, param_name ); +} + +int +NESTGPU::SetConnectionFloatParamDistr( int64_t* conn_ids, int64_t n_conn, std::string param_name ) +{ + return conn_->setConnectionFloatParamDistr( conn_ids, n_conn, param_name ); +} + +int +NESTGPU::SetConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float val, std::string param_name ) +{ + return conn_->setConnectionFloatParam( conn_ids, n_conn, val, param_name ); } -int64_t *NESTGPU::GetConnections(int i_source, int n_source, - int i_target, int n_target, - int syn_group, int64_t *n_conn) +int +NESTGPU::SetConnectionIntParamArr( int64_t* conn_ids, int64_t n_conn, int* h_param_arr, std::string param_name ) { - if (n_source<=0) { + return conn_->setConnectionIntParamArr( conn_ids, n_conn, h_param_arr, param_name ); +} + +int +NESTGPU::SetConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int val, std::string param_name ) +{ + return conn_->setConnectionIntParam( conn_ids, n_conn, val, 
param_name ); +} + +int +NESTGPU::GetConnectionStatus( int64_t* conn_ids, + int64_t n_conn, + inode_t* source, + inode_t* target, + int* port, + int* syn_group, + float* delay, + float* weight ) +{ + return conn_->getConnectionStatus( conn_ids, n_conn, source, target, port, syn_group, delay, weight ); +} + +int64_t* +NESTGPU::GetConnections( inode_t i_source, + inode_t n_source, + inode_t i_target, + inode_t n_target, + int syn_group, + int64_t* n_conn ) +{ + if ( n_source <= 0 ) + { i_source = 0; // gets also connections from image neurons - n_source = GetTotalNNodes(); + n_source = GetNTotalNodes(); } - if (n_target<=0) { + if ( n_target <= 0 ) + { i_target = 0; n_target = GetNLocalNodes(); } - int *i_source_pt = new int[n_source]; - for (int i=0; igetConnections( i_source_pt, n_source, i_target_pt, n_target, syn_group, n_conn ); + delete[] i_source_pt; delete[] i_target_pt; return conn_ids; } -int64_t *NESTGPU::GetConnections(int *i_source_pt, int n_source, - int i_target, int n_target, - int syn_group, int64_t *n_conn) +int64_t* +NESTGPU::GetConnections( inode_t* i_source_pt, + inode_t n_source, + inode_t i_target, + inode_t n_target, + int syn_group, + int64_t* n_conn ) { - if (n_target<=0) { + if ( n_target <= 0 ) + { i_target = 0; n_target = GetNLocalNodes(); } - int *i_target_pt = new int[n_target]; - for (int i=0; igetConnections( i_source_pt, n_source, i_target_pt, n_target, syn_group, n_conn ); + delete[] i_target_pt; return conn_ids; } - -int64_t *NESTGPU::GetConnections(int i_source, int n_source, - int *i_target_pt, int n_target, - int syn_group, int64_t *n_conn) +int64_t* +NESTGPU::GetConnections( inode_t i_source, + inode_t n_source, + inode_t* i_target_pt, + inode_t n_target, + int syn_group, + int64_t* n_conn ) { - if (n_source<=0) { + if ( n_source <= 0 ) + { i_source = 0; // gets also connections from image neurons - n_source = GetTotalNNodes(); + n_source = GetNTotalNodes(); } - int *i_source_pt = new int[n_source]; - for (int i=0; 
igetConnections( i_source_pt, n_source, i_target_pt, n_target, syn_group, n_conn ); + delete[] i_source_pt; return conn_ids; } -int64_t *NESTGPU::GetConnections(NodeSeq source, NodeSeq target, - int syn_group, int64_t *n_conn) +int64_t* +NESTGPU::GetConnections( inode_t* i_source_pt, + inode_t n_source, + inode_t* i_target_pt, + inode_t n_target, + int syn_group, + int64_t* n_conn ) { - return GetConnections(source.i0, source.n, target.i0, target.n, syn_group, - n_conn); + int64_t* conn_ids = conn_->getConnections( i_source_pt, n_source, i_target_pt, n_target, syn_group, n_conn ); + + return conn_ids; } -int64_t *NESTGPU::GetConnections(std::vector source, NodeSeq target, - int syn_group, int64_t *n_conn) +int64_t* +NESTGPU::GetConnections( NodeSeq source, NodeSeq target, int syn_group, int64_t* n_conn ) { - return GetConnections(source.data(), source.size(), target.i0, target.n, - syn_group, n_conn); + return GetConnections( source.i0, source.n, target.i0, target.n, syn_group, n_conn ); } - -int64_t *NESTGPU::GetConnections(NodeSeq source, std::vector target, - int syn_group, int64_t *n_conn) +int64_t* +NESTGPU::GetConnections( std::vector< inode_t > source, NodeSeq target, int syn_group, int64_t* n_conn ) { - return GetConnections(source.i0, source.n, target.data(), target.size(), - syn_group, n_conn); + return GetConnections( source.data(), source.size(), target.i0, target.n, syn_group, n_conn ); } -int64_t *NESTGPU::GetConnections(std::vector source, - std::vector target, - int syn_group, int64_t *n_conn) +int64_t* +NESTGPU::GetConnections( NodeSeq source, std::vector< inode_t > target, int syn_group, int64_t* n_conn ) { - return GetConnections(source.data(), source.size(), - target.data(), target.size(), - syn_group, n_conn); + return GetConnections( source.i0, source.n, target.data(), target.size(), syn_group, n_conn ); } +int64_t* +NESTGPU::GetConnections( std::vector< inode_t > source, std::vector< inode_t > target, int syn_group, int64_t* n_conn ) +{ + 
return conn_->getConnections( source.data(), source.size(), target.data(), target.size(), syn_group, n_conn ); +} -int NESTGPU::ActivateSpikeCount(int i_node, int n_node) +int +NESTGPU::ActivateSpikeCount( int i_node, int n_node ) { - CheckUncalibrated("Spike count must be activated before calibration"); + CheckUncalibrated( "Spike count must be activated before calibration" ); int i_group; - int i_node_0 = GetNodeSequenceOffset(i_node, n_node, i_group); - if (i_node_0!=i_node || node_vect_[i_group]->n_node_!=n_node) { - throw ngpu_exception("Spike count must be activated for all and only " - " the nodes of the same group"); + int i_node_0 = GetNodeSequenceOffset( i_node, n_node, i_group ); + if ( i_node_0 != i_node || node_vect_[ i_group ]->n_node_ != n_node ) + { + throw ngpu_exception( + "Spike count must be activated for all and only " + " the nodes of the same group" ); } - node_vect_[i_group]->ActivateSpikeCount(); + node_vect_[ i_group ]->ActivateSpikeCount(); return 0; } -int NESTGPU::ActivateRecSpikeTimes(int i_node, int n_node, - int max_n_rec_spike_times) +int +NESTGPU::ActivateRecSpikeTimes( int i_node, int n_node, int max_n_rec_spike_times ) { - CheckUncalibrated("Spike time recording must be activated " - "before calibration"); + CheckUncalibrated( + "Spike time recording must be activated " + "before calibration" ); int i_group; - int i_node_0 = GetNodeSequenceOffset(i_node, n_node, i_group); - if (i_node_0!=i_node || node_vect_[i_group]->n_node_!=n_node) { - throw ngpu_exception("Spike count must be activated for all and only " - " the nodes of the same group"); + int i_node_0 = GetNodeSequenceOffset( i_node, n_node, i_group ); + if ( i_node_0 != i_node || node_vect_[ i_group ]->n_node_ != n_node ) + { + throw ngpu_exception( + "Spike count must be activated for all and only " + " the nodes of the same group" ); } - node_vect_[i_group]->ActivateRecSpikeTimes(max_n_rec_spike_times); + node_vect_[ i_group ]->ActivateRecSpikeTimes( 
max_n_rec_spike_times ); return 0; } -int NESTGPU::SetRecSpikeTimesStep(int i_node, int n_node, - int rec_spike_times_step) +int +NESTGPU::SetRecSpikeTimesStep( int i_node, int n_node, int rec_spike_times_step ) { int i_group; - int i_node_0 = GetNodeSequenceOffset(i_node, n_node, i_group); - if (i_node_0!=i_node || node_vect_[i_group]->n_node_!=n_node) { - throw ngpu_exception("Time step for buffering spike time recording " - "must be set for all and only " - "the nodes of the same group"); + int i_node_0 = GetNodeSequenceOffset( i_node, n_node, i_group ); + if ( i_node_0 != i_node || node_vect_[ i_group ]->n_node_ != n_node ) + { + throw ngpu_exception( + "Time step for buffering spike time recording " + "must be set for all and only " + "the nodes of the same group" ); } - node_vect_[i_group]->SetRecSpikeTimesStep(rec_spike_times_step); + node_vect_[ i_group ]->SetRecSpikeTimesStep( rec_spike_times_step ); return 0; } // get number of recorded spike times for a node -int NESTGPU::GetNRecSpikeTimes(int i_node) +int +NESTGPU::GetNRecSpikeTimes( int i_node ) { int i_group; - int i_neuron = i_node - GetNodeSequenceOffset(i_node, 1, i_group); - return node_vect_[i_group]->GetNRecSpikeTimes(i_neuron); + int i_neuron = i_node - GetNodeSequenceOffset( i_node, 1, i_group ); + return node_vect_[ i_group ]->GetNRecSpikeTimes( i_neuron ); } // get recorded spike times for node group -int NESTGPU::GetRecSpikeTimes(int i_node, int n_node, int **n_spike_times_pt, - float ***spike_times_pt) +int +NESTGPU::GetRecSpikeTimes( int i_node, int n_node, int** n_spike_times_pt, float*** spike_times_pt ) { int i_group; - int i_node_0 = GetNodeSequenceOffset(i_node, n_node, i_group); - if (i_node_0!=i_node || node_vect_[i_group]->n_node_!=n_node) { - throw ngpu_exception("Spike times must be extracted for all and only " - " the nodes of the same group"); + int i_node_0 = GetNodeSequenceOffset( i_node, n_node, i_group ); + if ( i_node_0 != i_node || node_vect_[ i_group ]->n_node_ != 
n_node ) + { + throw ngpu_exception( + "Spike times must be extracted for all and only " + " the nodes of the same group" ); } - - return node_vect_[i_group]->GetRecSpikeTimes(n_spike_times_pt, - spike_times_pt); - + + return node_vect_[ i_group ]->GetRecSpikeTimes( n_spike_times_pt, spike_times_pt ); } -int NESTGPU::PushSpikesToNodes(int n_spikes, int *node_id, - float *spike_height) +int +NESTGPU::PushSpikesToNodes( int n_spikes, int* node_id, float* spike_height ) { /* int *d_node_id; @@ -1826,21 +1897,22 @@ int NESTGPU::PushSpikesToNodes(int n_spikes, int *node_id, CUDAMALLOCCTRL("&d_spike_height",&d_spike_height, n_spikes*sizeof(float)); // Memcpy are synchronized by PushSpikeFromRemote kernel gpuErrchk(cudaMemcpyAsync(d_node_id, node_id, n_spikes*sizeof(int), - cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpyAsync(d_spike_height, spike_height, n_spikes*sizeof(float), - cudaMemcpyHostToDevice)); + cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpyAsync(d_spike_height, spike_height, + n_spikes*sizeof(float), cudaMemcpyHostToDevice)); PushSpikeFromRemote<<<(n_spikes+1023)/1024, 1024>>>(n_spikes, d_node_id, - d_spike_height); + d_spike_height); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); CUDAFREECTRL("d_node_id",d_node_id); CUDAFREECTRL("d_spike_height",d_spike_height); */ - + return 0; } -int NESTGPU::PushSpikesToNodes(int n_spikes, int *node_id) +int +NESTGPU::PushSpikesToNodes( int n_spikes, int* node_id ) { /* //std::cout << "n_spikes: " << n_spikes << "\n"; @@ -1853,141 +1925,169 @@ int NESTGPU::PushSpikesToNodes(int n_spikes, int *node_id) CUDAMALLOCCTRL("&d_node_id",&d_node_id, n_spikes*sizeof(int)); // memcopy data transfer is overlapped with PushSpikeFromRemote kernel gpuErrchk(cudaMemcpyAsync(d_node_id, node_id, n_spikes*sizeof(int), - cudaMemcpyHostToDevice)); + cudaMemcpyHostToDevice)); PushSpikeFromRemote<<<(n_spikes+1023)/1024, 1024>>>(n_spikes, d_node_id); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( 
cudaDeviceSynchronize() ); CUDAFREECTRL("d_node_id",d_node_id); */ - + return 0; } -int NESTGPU::GetExtNeuronInputSpikes(int *n_spikes, int **node, int **port, - float **spike_height, bool include_zeros) +int +NESTGPU::GetExtNeuronInputSpikes( int* n_spikes, int** node, int** port, float** spike_height, bool include_zeros ) { ext_neuron_input_spike_node_.clear(); ext_neuron_input_spike_port_.clear(); ext_neuron_input_spike_height_.clear(); - - for (unsigned int i=0; iIsExtNeuron()) { + + for ( unsigned int i = 0; i < node_vect_.size(); i++ ) + { + if ( node_vect_[ i ]->IsExtNeuron() ) + { int n_node; int n_port; - float *sh = node_vect_[i]->GetExtNeuronInputSpikes(&n_node, &n_port); - for (int i_neur=0; i_neuri_node_0_; - for (int i_port=0; i_portGetExtNeuronInputSpikes( &n_node, &n_port ); + for ( int i_neur = 0; i_neur < n_node; i_neur++ ) + { + int i_node = i_neur + node_vect_[ i ]->i_node_0_; + for ( int i_port = 0; i_port < n_port; i_port++ ) + { + int j = i_neur * n_port + i_port; + if ( sh[ j ] != 0.0 || include_zeros ) + { + ext_neuron_input_spike_node_.push_back( i_node ); + ext_neuron_input_spike_port_.push_back( i_port ); + ext_neuron_input_spike_height_.push_back( sh[ j ] ); + } + } + } } } *n_spikes = ext_neuron_input_spike_node_.size(); *node = ext_neuron_input_spike_node_.data(); *port = ext_neuron_input_spike_port_.data(); *spike_height = ext_neuron_input_spike_height_.data(); - + return 0; } -int NESTGPU::SetNeuronGroupParam(int i_node, int n_node, - std::string param_name, float val) +int +NESTGPU::SetNeuronGroupParam( int i_node, int n_node, std::string param_name, float val ) { int i_group; - int i_node_0 = GetNodeSequenceOffset(i_node, n_node, i_group); - if (i_node_0!=i_node || node_vect_[i_group]->n_node_!=n_node) { - throw ngpu_exception(std::string("Group parameter ") + param_name - + " can only be set for all and only " - " the nodes of the same group"); + int i_node_0 = GetNodeSequenceOffset( i_node, n_node, i_group ); + if ( i_node_0 != 
i_node || node_vect_[ i_group ]->n_node_ != n_node ) + { + throw ngpu_exception(std::string("Group parameter ") + param_name + + " can only be set for all and only " + " the nodes of the same group"); } - return node_vect_[i_group]->SetGroupParam(param_name, val); + return node_vect_[ i_group ]->SetGroupParam( param_name, val ); } -int NESTGPU::IsNeuronGroupParam(int i_node, std::string param_name) +int +NESTGPU::IsNeuronGroupParam( int i_node, std::string param_name ) { int i_group; - int i_node_0 = GetNodeSequenceOffset(i_node, 1, i_group); + GetNodeSequenceOffset( i_node, 1, i_group ); - return node_vect_[i_group]->IsGroupParam(param_name); + return node_vect_[ i_group ]->IsGroupParam( param_name ); } -float NESTGPU::GetNeuronGroupParam(int i_node, std::string param_name) +float +NESTGPU::GetNeuronGroupParam( int i_node, std::string param_name ) { int i_group; - int i_node_0 = GetNodeSequenceOffset(i_node, 1, i_group); + GetNodeSequenceOffset( i_node, 1, i_group ); - return node_vect_[i_group]->GetGroupParam(param_name); + return node_vect_[ i_group ]->GetGroupParam( param_name ); } -std::vector NESTGPU::GetGroupParamNames(int i_node) +std::vector< std::string > +NESTGPU::GetGroupParamNames( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading group parameter names"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() ) + { + throw ngpu_exception( "Unrecognized node in reading group parameter names" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetGroupParamNames(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetGroupParamNames(); } -int NESTGPU::GetNGroupParam(int i_node) +int +NESTGPU::GetNGroupParam( int i_node ) { - if (i_node<0 || i_node>(int)node_group_map_.size()) { - throw ngpu_exception("Unrecognized node in reading number of " - "group parameters"); + if ( i_node < 0 || i_node > ( int ) node_group_map_.size() 
) + { + throw ngpu_exception( + "Unrecognized node in reading number of " + "group parameters" ); } - int i_group = node_group_map_[i_node]; - - return node_vect_[i_group]->GetNGroupParam(); + int i_group = node_group_map_[ i_node ]; + + return node_vect_[ i_group ]->GetNGroupParam(); } -int NESTGPU::GetNBoolParam() +int +NESTGPU::GetNBoolParam() { return N_KERNEL_BOOL_PARAM; } -std::vector NESTGPU::GetBoolParamNames() +std::vector< std::string > +NESTGPU::GetBoolParamNames() { - std::vector param_name_vect; - for (int i=0; i param_name_vect; + for ( int i = 0; i < N_KERNEL_BOOL_PARAM; i++ ) + { + param_name_vect.push_back( kernel_bool_param_name[ i ] ); } - + return param_name_vect; } -bool NESTGPU::IsBoolParam(std::string param_name) +bool +NESTGPU::IsBoolParam( std::string param_name ) { int i_param; - for (i_param=0; i_param NESTGPU::GetFloatParamNames() +std::vector< std::string > +NESTGPU::GetFloatParamNames() { - std::vector param_name_vect; - for (int i=0; i param_name_vect; + for ( int i = 0; i < N_KERNEL_FLOAT_PARAM; i++ ) + { + param_name_vect.push_back( kernel_float_param_name[ i ] ); } - + return param_name_vect; } -bool NESTGPU::IsFloatParam(std::string param_name) +bool +NESTGPU::IsFloatParam( std::string param_name ) { int i_param; - for (i_param=0; i_paramsetTimeResolution( time_resolution_ ); break; case i_max_spike_num_fact: max_spike_num_fact_ = val; @@ -2097,55 +2213,70 @@ int NESTGPU::SetFloatParam(std::string param_name, float val) max_remote_spike_num_fact_ = val; break; default: - throw ngpu_exception(std::string("Unrecognized kernel float parameter ") - + param_name); + throw ngpu_exception( std::string( "Unrecognized kernel float parameter " ) + param_name ); } - + return 0; } -int NESTGPU::GetNIntParam() +int +NESTGPU::GetNIntParam() { return N_KERNEL_INT_PARAM; } -std::vector NESTGPU::GetIntParamNames() +std::vector< std::string > +NESTGPU::GetIntParamNames() { - std::vector param_name_vect; - for (int i=0; i param_name_vect; + for ( 
int i = 0; i < N_KERNEL_INT_PARAM; i++ ) + { + param_name_vect.push_back( kernel_int_param_name[ i ] ); } - + return param_name_vect; } -bool NESTGPU::IsIntParam(std::string param_name) +bool +NESTGPU::IsIntParam( std::string param_name ) { int i_param; - for (i_param=0; i_paramgetMaxNodeNBits(); case i_max_syn_n_bits: - return h_MaxSynNBits; + return conn_->getMaxSynNBits(); + case i_max_delay_n_bits: + return conn_->getMaxDelayNBits(); + case i_conn_struct_type: + return conn_struct_type_; default: - throw ngpu_exception(std::string("Unrecognized kernel int parameter ") - + param_name); + throw ngpu_exception( std::string( "Unrecognized kernel int parameter " ) + param_name ); } } -int NESTGPU::SetIntParam(std::string param_name, int val) +int +NESTGPU::SetIntParam( std::string param_name, int val ) { - int i_param = GetIntParamIdx(param_name); - switch (i_param) { + int i_param = GetIntParamIdx( param_name ); + switch ( i_param ) + { case i_rnd_seed: - SetRandomSeed(val); + SetRandomSeed( val ); break; case i_verbosity_level: - SetVerbosityLevel(val); + SetVerbosityLevel( val ); + if ( val >= 5 ) + { + cuda_error_ns::verbose_ = 1; + } + else + { + cuda_error_ns::verbose_ = 0; + } break; case i_max_spike_buffer_size: - SetMaxSpikeBufferSize(val); + SetMaxSpikeBufferSize( val ); break; case i_max_node_n_bits: - setMaxNodeNBits(val); + conn_->setMaxNodeNBits( val ); break; case i_max_syn_n_bits: - setMaxSynNBits(val); + conn_->setMaxSynNBits( val ); + break; + case i_max_delay_n_bits: + conn_->setMaxDelayNBits( val ); + break; + case i_conn_struct_type: + if ( conn_struct_type_ != val ) + { + setConnStructType( val ); + } break; default: - throw ngpu_exception(std::string("Unrecognized kernel int parameter ") - + param_name); + throw ngpu_exception( std::string( "Unrecognized kernel int parameter " ) + param_name ); } - + return 0; } -RemoteNodeSeq NESTGPU::RemoteCreate(int i_host, std::string model_name, - int n_nodes /*=1*/, int n_ports /*=1*/) +RemoteNodeSeq 
+NESTGPU::RemoteCreate( int i_host, std::string model_name, inode_t n_nodes /*=1*/, int n_ports /*=1*/ ) { - if (!create_flag_) { + if ( !create_flag_ ) + { create_flag_ = true; start_real_time_ = getRealTime(); } - if (n_hosts_ > 1) { - if (i_host<0 || i_host>=n_hosts_) { - throw ngpu_exception("Invalid host index in RemoteCreate"); + if ( n_hosts_ > 1 ) + { + if ( i_host >= n_hosts_ ) + { + throw ngpu_exception( "Invalid host index in RemoteCreate" ); } - NodeSeq node_seq(n_remote_nodes_[i_host], n_nodes); - n_remote_nodes_[i_host] += n_nodes; - if (i_host == this_host_) { - NodeSeq check_node_seq = _Create(model_name, n_nodes, n_ports); - if (check_node_seq.i0 != node_seq.i0) { - throw ngpu_exception("Inconsistency in number of nodes in local" - " and remote representation of the host."); + NodeSeq node_seq( n_remote_nodes_[ i_host ], n_nodes ); + n_remote_nodes_[ i_host ] += n_nodes; + if ( i_host == this_host_ ) + { + NodeSeq check_node_seq = _Create( model_name, n_nodes, n_ports ); + if ( check_node_seq.i0 != node_seq.i0 ) + { + throw ngpu_exception( + "Inconsistency in number of nodes in local" + " and remote representation of the host." 
); } } - return RemoteNodeSeq(i_host, node_seq); + return RemoteNodeSeq( i_host, node_seq ); } - else { - throw ngpu_exception("RemoteCreate requires at least two hosts"); + else + { + throw ngpu_exception( "RemoteCreate requires at least two hosts" ); } } diff --git a/src/nestgpu.h b/src/nestgpu.h index 0e2f5a317..37af33ab6 100644 --- a/src/nestgpu.h +++ b/src/nestgpu.h @@ -20,68 +20,88 @@ * */ - - - - #ifndef NESTGPU_H #define NESTGPU_H -#include -#include -#include #include +#include #include +#include +#include -#include "ngpu_exception.h" -#include "node_group.h" #include "base_neuron.h" #include "connect_spec.h" -//#include "connect.h" -//#include "syn_model.h" -//#include "distribution.h" +#include "ngpu_exception.h" +#include "node_group.h" +// #include "connect.h" +// #include "syn_model.h" +// #include "distribution.h" class Multimeter; -class NetConnection; + struct curandGenerator_st; + typedef struct curandGenerator_st* curandGenerator_t; + class ConnSpec; + class SynSpec; + class SynModel; +class Connection; + +typedef uint inode_t; + +typedef uint iconngroup_t; + class Sequence { - public: +public: int i0; int n; - - Sequence(int i0=0, int n=0) : i0(i0), n(n) {} - - inline int operator[](int i) { - if (i<0) { - throw ngpu_exception("Sequence index cannot be negative"); + + Sequence( int i0 = 0, int n = 0 ) + : i0( i0 ) + , n( n ) + { + } + + inline int + operator[]( int i ) + { + if ( i < 0 ) + { + throw ngpu_exception( "Sequence index cannot be negative" ); } - if (i>=n) { - throw ngpu_exception("Sequence index out of range"); + if ( i >= n ) + { + throw ngpu_exception( "Sequence index out of range" ); } return i0 + i; } - inline Sequence Subseq(int first, int last) { - if (first<0 || first>last) { - throw ngpu_exception("Sequence subset range error"); + inline Sequence + Subseq( int first, int last ) + { + if ( first < 0 || first > last ) + { + throw ngpu_exception( "Sequence subset range error" ); } - if (last>=n) { - throw 
ngpu_exception("Sequence subset out of range"); + if ( last >= n ) + { + throw ngpu_exception( "Sequence subset out of range" ); } - return Sequence(i0 + first, last - first + 1); + return Sequence( i0 + first, last - first + 1 ); } // https://stackoverflow.com/questions/18625223 - inline std::vector ToVector() { + inline std::vector< int > + ToVector() + { int start = i0; - std::vector v(n); - std::iota(v.begin(), v.end(), start); + std::vector< int > v( n ); + std::iota( v.begin(), v.end(), start ); return v; } }; @@ -90,721 +110,816 @@ typedef Sequence NodeSeq; class RemoteNodeSeq { - public: +public: int i_host; NodeSeq node_seq; - - RemoteNodeSeq(int i_host=0, NodeSeq node_seq=NodeSeq(0,0)) : - i_host(i_host), node_seq(node_seq) {} + + RemoteNodeSeq( int i_host = 0, NodeSeq node_seq = NodeSeq( 0, 0 ) ) + : i_host( i_host ) + , node_seq( node_seq ) + { + } }; -enum {ON_EXCEPTION_EXIT=0, ON_EXCEPTION_HANDLE}; +enum +{ + ON_EXCEPTION_EXIT = 0, + ON_EXCEPTION_HANDLE +}; class NESTGPU { - static const int conn_seed_offset_ = 12345; float time_resolution_; // time resolution in ms - curandGenerator_t *random_generator_; - //std::vector < std::vector > conn_random_generator_; - std::vector < std::vector > conn_random_generator_; + + curandGenerator_t* random_generator_; + unsigned long long kernel_seed_; + bool calibrate_flag_; // becomes true after calibration + bool create_flag_; // becomes true just before creation of the first node - bool rev_conn_flag_; // flag for reverse connections - - Distribution *distribution_; - Multimeter *multimeter_; - std::vector node_vect_; // -> node_group_vect - std::vector syn_group_vect_; - - NetConnection *net_connection_; + // Pointer to the connection object. 
Note that conn_ is of the type + // pointer-to-the(abstract)-base class + // while the object it will point to should be an instance of a derived class + Connection* conn_; + + Distribution* distribution_; + + Multimeter* multimeter_; + + int conn_struct_type_; + + std::vector< BaseNeuron* > node_vect_; // -> node_group_vect + + std::vector< SynModel* > syn_group_vect_; int this_host_; + int n_hosts_; - + // if true it is possible to send spikes across different hosts bool external_spike_flag_; - + bool mpi_flag_; // true if MPI is initialized + bool remote_spike_height_; - - std::vector node_group_map_; - int16_t *d_node_group_map_; + std::vector< int16_t > node_group_map_; + + int16_t* d_node_group_map_; int max_spike_buffer_size_; + int max_spike_num_; + int max_spike_per_host_; + int max_remote_spike_num_; - + double max_spike_num_fact_; + double max_spike_per_host_fact_; + double max_remote_spike_num_fact_; double t_min_; + double neural_time_; // Neural activity time + double sim_time_; // Simulation time in ms + double neur_t0_; // Neural activity simulation time origin + long long it_; // simulation time index + long long Nt_; // number of simulation time steps - //int n_poiss_nodes_; - std::vector n_remote_nodes_; - //int n_ext_nodes_; - //int i_ext_node_0_; - //int i_remote_node_0_; - int n_image_nodes_; + + // int n_poiss_nodes_; + + std::vector< int > n_remote_nodes_; + + // int n_ext_nodes_; + + // int i_ext_node_0_; + + // int i_remote_node_0_; double start_real_time_; + double build_real_time_; + double end_real_time_; bool error_flag_; + std::string error_message_; + unsigned char error_code_; + int on_exception_; int verbosity_level_; + bool print_time_; + bool remove_conn_key_; - + int nested_loop_algo_; - //std::vector remote_connection_vect_; - std::vector ext_neuron_input_spike_node_; - std::vector ext_neuron_input_spike_port_; - std::vector ext_neuron_input_spike_height_; + std::vector< int > ext_neuron_input_spike_node_; + + std::vector< int 
> ext_neuron_input_spike_port_; + + std::vector< float > ext_neuron_input_spike_height_; + + int setNHosts( int n_hosts ); + + int setThisHost( int i_host ); - int setHostNum(int n_hosts); - int setThisHost(int i_host); - - int InitConnRandomGenerator(); - int FreeConnRandomGenerator(); + int CreateNodeGroup( int n_nodes, int n_ports ); + + int CheckUncalibrated( std::string message ); + + double* InitGetSpikeArray( int n_nodes, int n_ports ); - int CreateNodeGroup(int n_nodes, int n_ports); - int CheckUncalibrated(std::string message); - double *InitGetSpikeArray(int n_nodes, int n_ports); int NodeGroupArrayInit(); + int ClearGetSpikeArrays(); + int FreeGetSpikeArrays(); + int FreeNodeGroupMap(); - int CheckImageNodes(int n_nodes); - - NodeSeq _Create(std::string model_name, int n_nodes, int n_ports); - - template - int _Connect(T1 source, int n_source, T2 target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - - template - int _Connect(curandGenerator_t &gen, T1 source, int n_source, - T2 target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - - template - int _ConnectOneToOne(curandGenerator_t &gen, T1 source, T2 target, - int n_node, SynSpec &syn_spec); - - template - int _ConnectAllToAll(curandGenerator_t &gen, T1 source, int n_source, - T2 target, int n_target, SynSpec &syn_spec); - - template - int _ConnectFixedTotalNumber(curandGenerator_t &gen, T1 source, - int n_source, T2 target, int n_target, - int total_num, SynSpec &syn_spec); - - template - int _ConnectFixedIndegree - (curandGenerator_t &gen, T1 source, int n_source, T2 target, int n_target, - int indegree, SynSpec &syn_spec); - - template - int _ConnectFixedOutdegree - (curandGenerator_t &gen, T1 source, int n_source, T2 target, int n_target, - int outdegree, SynSpec &syn_spec); - - template - int _RemoteConnect(int this_host, int source_host, T1 source, int n_source, - int target_host, T2 target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - - template - int 
_RemoteConnect(int source_host, T1 source, int n_source, - int target_host, T2 target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - - template - int _RemoteConnectSource(int source_host, T1 source, int n_source, - T2 target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - - template - int _RemoteConnectTarget(int target_host, T1 source, int n_source, - T2 target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - - int addOffsetToExternalNodeIds(); - - int addOffsetToSpikeBufferMap(); + + int CheckImageNodes( int n_nodes ); + + NodeSeq _Create( std::string model_name, int n_nodes, int n_ports ); double SpikeBufferUpdate_time_; + double poisson_generator_time_; + double neuron_Update_time_; + double copy_ext_spike_time_; + double organizeExternalSpike_time_; + double SendSpikeToRemote_time_; + double RecvSpikeFromRemote_time_; + double NestedLoop_time_; + double GetSpike_time_; + double SpikeReset_time_; + double ExternalSpikeReset_time_; double SendSpikeToRemote_comm_time_; + double RecvSpikeFromRemote_comm_time_; + double SendSpikeToRemote_CUDAcp_time_; + double RecvSpikeFromRemote_CUDAcp_time_; - + bool first_simulation_flag_; - public: +public: NESTGPU(); ~NESTGPU(); - int SetRandomSeed(unsigned long long seed); + int SetRandomSeed( unsigned long long seed ); - int SetTimeResolution(float time_res); - - inline float GetTimeResolution() { + int SetTimeResolution( float time_res ); + + inline float + GetTimeResolution() + { return time_resolution_; } - inline int SetSimTime(float sim_time) { + inline int + SetSimTime( float sim_time ) + { sim_time_ = sim_time; return 0; } - inline float GetSimTime() { + inline float + GetSimTime() + { return sim_time_; } - inline int SetVerbosityLevel(int verbosity_level) { + inline int + SetVerbosityLevel( int verbosity_level ) + { verbosity_level_ = verbosity_level; return 0; } - int SetNestedLoopAlgo(int nested_loop_algo); + int SetNestedLoopAlgo( int nested_loop_algo ); - inline int 
SetPrintTime(bool print_time) { + inline int + SetPrintTime( bool print_time ) + { print_time_ = print_time; return 0; } - int SetMaxSpikeBufferSize(int max_size); + int SetMaxSpikeBufferSize( int max_size ); int GetMaxSpikeBufferSize(); - + uint GetNLocalNodes(); - - uint GetTotalNNodes() { return GetNLocalNodes() + n_image_nodes_; } - int HostNum() { + uint GetNTotalNodes(); + + int setConnStructType( int conn_struct_type ); + + int + HostNum() + { return n_hosts_; } - int HostId() { + int + HostId() + { return this_host_; } std::string HostIdStr(); + size_t getCUDAMemHostUsed(); + + size_t getCUDAMemHostPeak(); + + size_t getCUDAMemTotal(); + + size_t getCUDAMemFree(); + int GetNBoolParam(); - std::vector GetBoolParamNames(); - bool IsBoolParam(std::string param_name); - int GetBoolParamIdx(std::string param_name); - bool GetBoolParam(std::string param_name); - int SetBoolParam(std::string param_name, bool val); + std::vector< std::string > GetBoolParamNames(); + bool IsBoolParam( std::string param_name ); + int GetBoolParamIdx( std::string param_name ); + bool GetBoolParam( std::string param_name ); + int SetBoolParam( std::string param_name, bool val ); int GetNFloatParam(); - std::vector GetFloatParamNames(); - bool IsFloatParam(std::string param_name); - int GetFloatParamIdx(std::string param_name); - float GetFloatParam(std::string param_name); - int SetFloatParam(std::string param_name, float val); + std::vector< std::string > GetFloatParamNames(); + bool IsFloatParam( std::string param_name ); + int GetFloatParamIdx( std::string param_name ); + float GetFloatParam( std::string param_name ); + int SetFloatParam( std::string param_name, float val ); int GetNIntParam(); - std::vector GetIntParamNames(); - bool IsIntParam(std::string param_name); - int GetIntParamIdx(std::string param_name); - int GetIntParam(std::string param_name); - int SetIntParam(std::string param_name, int val); + std::vector< std::string > GetIntParamNames(); + bool IsIntParam( 
std::string param_name ); + int GetIntParamIdx( std::string param_name ); + int GetIntParam( std::string param_name ); + int SetIntParam( std::string param_name, int val ); - NodeSeq Create(std::string model_name, int n_nodes=1, int n_ports=1); + NodeSeq Create( std::string model_name, int n_nodes = 1, int n_ports = 1 ); - RemoteNodeSeq RemoteCreate(int i_host, std::string model_name, - int n_nodes=1, int n_ports=1); + RemoteNodeSeq RemoteCreate( int i_host, std::string model_name, inode_t n_nodes = 1, int n_ports = 1 ); - int CreateRecord(std::string file_name, std::string *var_name_arr, - int *i_node_arr, int n_node); - int CreateRecord(std::string file_name, std::string *var_name_arr, - int *i_node_arr, int *port_arr, int n_node); - std::vector > *GetRecordData(int i_record); + int CreateRecord( std::string file_name, std::string* var_name_arr, int* i_node_arr, int n_node ); + int CreateRecord( std::string file_name, std::string* var_name_arr, int* i_node_arr, int* port_arr, int n_node ); + std::vector< std::vector< float > >* GetRecordData( int i_record ); - int SetNeuronParam(int i_node, int n_neuron, std::string param_name, - float val); + int SetNeuronParam( int i_node, int n_neuron, std::string param_name, float val ); - int SetNeuronParam(int *i_node, int n_neuron, std::string param_name, - float val); + int SetNeuronParam( int* i_node, int n_neuron, std::string param_name, float val ); - int SetNeuronParam(int i_node, int n_neuron, std::string param_name, - float *param, int array_size); + int SetNeuronParam( int i_node, int n_neuron, std::string param_name, float* param, int array_size ); - int SetNeuronParam(int *i_node, int n_neuron, std::string param_name, - float *param, int array_size); + int SetNeuronParam( int* i_node, int n_neuron, std::string param_name, float* param, int array_size ); - int SetNeuronParam(NodeSeq nodes, std::string param_name, float val) { - return SetNeuronParam(nodes.i0, nodes.n, param_name, val); + int + SetNeuronParam( 
NodeSeq nodes, std::string param_name, float val ) + { + return SetNeuronParam( nodes.i0, nodes.n, param_name, val ); } - int SetNeuronParam(NodeSeq nodes, std::string param_name, float *param, - int array_size) { - return SetNeuronParam(nodes.i0, nodes.n, param_name, param, array_size); + int + SetNeuronParam( NodeSeq nodes, std::string param_name, float* param, int array_size ) + { + return SetNeuronParam( nodes.i0, nodes.n, param_name, param, array_size ); } - - int SetNeuronParam(std::vector nodes, std::string param_name, - float val) { - return SetNeuronParam(nodes.data(), nodes.size(), param_name, val); + + int + SetNeuronParam( std::vector< int > nodes, std::string param_name, float val ) + { + return SetNeuronParam( nodes.data(), nodes.size(), param_name, val ); } - int SetNeuronParam(std::vector nodes, std::string param_name, - float *param, int array_size) { - return SetNeuronParam(nodes.data(), nodes.size(), param_name, param, - array_size); + int + SetNeuronParam( std::vector< int > nodes, std::string param_name, float* param, int array_size ) + { + return SetNeuronParam( nodes.data(), nodes.size(), param_name, param, array_size ); } - int SetNeuronIntVar(int i_node, int n_neuron, std::string var_name, - int val); + int SetNeuronIntVar( int i_node, int n_neuron, std::string var_name, int val ); - int SetNeuronIntVar(int *i_node, int n_neuron, std::string var_name, - int val); + int SetNeuronIntVar( int* i_node, int n_neuron, std::string var_name, int val ); - int SetNeuronIntVar(NodeSeq nodes, std::string var_name, int val) { - return SetNeuronIntVar(nodes.i0, nodes.n, var_name, val); + int + SetNeuronIntVar( NodeSeq nodes, std::string var_name, int val ) + { + return SetNeuronIntVar( nodes.i0, nodes.n, var_name, val ); } - int SetNeuronIntVar(std::vector nodes, std::string var_name, - int val) { - return SetNeuronIntVar(nodes.data(), nodes.size(), var_name, val); + int + SetNeuronIntVar( std::vector< int > nodes, std::string var_name, int val ) + { + 
return SetNeuronIntVar( nodes.data(), nodes.size(), var_name, val ); } - int SetNeuronVar(int i_node, int n_neuron, std::string var_name, - float val); + int SetNeuronVar( int i_node, int n_neuron, std::string var_name, float val ); - int SetNeuronVar(int *i_node, int n_neuron, std::string var_name, - float val); + int SetNeuronVar( int* i_node, int n_neuron, std::string var_name, float val ); - int SetNeuronVar(int i_node, int n_neuron, std::string var_name, - float *var, int array_size); + int SetNeuronVar( int i_node, int n_neuron, std::string var_name, float* var, int array_size ); - int SetNeuronVar(int *i_node, int n_neuron, std::string var_name, - float *var, int array_size); + int SetNeuronVar( int* i_node, int n_neuron, std::string var_name, float* var, int array_size ); - int SetNeuronVar(NodeSeq nodes, std::string var_name, float val) { - return SetNeuronVar(nodes.i0, nodes.n, var_name, val); + int + SetNeuronVar( NodeSeq nodes, std::string var_name, float val ) + { + return SetNeuronVar( nodes.i0, nodes.n, var_name, val ); } - int SetNeuronVar(NodeSeq nodes, std::string var_name, float *var, - int array_size) { - return SetNeuronVar(nodes.i0, nodes.n, var_name, var, array_size); + int + SetNeuronVar( NodeSeq nodes, std::string var_name, float* var, int array_size ) + { + return SetNeuronVar( nodes.i0, nodes.n, var_name, var, array_size ); } - - int SetNeuronVar(std::vector nodes, std::string var_name, - float val) { - return SetNeuronVar(nodes.data(), nodes.size(), var_name, val); + + int + SetNeuronVar( std::vector< int > nodes, std::string var_name, float val ) + { + return SetNeuronVar( nodes.data(), nodes.size(), var_name, val ); } - int SetNeuronVar(std::vector nodes, std::string var_name, - float *var, int array_size) { - return SetNeuronVar(nodes.data(), nodes.size(), var_name, var, - array_size); + int + SetNeuronVar( std::vector< int > nodes, std::string var_name, float* var, int array_size ) + { + return SetNeuronVar( nodes.data(), 
nodes.size(), var_name, var, array_size ); } -//////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + int SetNeuronScalParamDistr( int i_node, int n_node, std::string param_name ); + + int SetNeuronScalVarDistr( int i_node, int n_node, std::string var_name ); + + int SetNeuronPortParamDistr( int i_node, int n_node, std::string param_name ); + + int SetNeuronPortVarDistr( int i_node, int n_node, std::string var_name ); + + int SetNeuronPtScalParamDistr( int* i_node, int n_node, std::string param_name ); - int SetNeuronScalParamDistr(int i_node, int n_node, - std::string param_name); - - int SetNeuronScalVarDistr(int i_node, int n_node, - std::string var_name); - - int SetNeuronPortParamDistr(int i_node, int n_node, - std::string param_name); - - int SetNeuronPortVarDistr(int i_node, int n_node, - std::string var_name); - - int SetNeuronPtScalParamDistr(int *i_node, int n_node, - std::string param_name); - - int SetNeuronPtScalVarDistr(int *i_node, int n_node, - std::string var_name); - - int SetNeuronPtPortParamDistr(int *i_node, int n_node, - std::string param_name); - - int SetNeuronPtPortVarDistr(int *i_node, int n_node, - std::string var_name); - - int SetDistributionIntParam(std::string param_name, int val); - - int SetDistributionScalParam(std::string param_name, float val); + int SetNeuronPtScalVarDistr( int* i_node, int n_node, std::string var_name ); - int SetDistributionVectParam(std::string param_name, float val, int i); + int SetNeuronPtPortParamDistr( int* i_node, int n_node, std::string param_name ); - int SetDistributionFloatPtParam(std::string param_name, - float *array_pt); + int SetNeuronPtPortVarDistr( int* i_node, int n_node, std::string var_name ); - int IsDistributionFloatParam(std::string param_name); - -//////////////////////////////////////////////////////////////////////// - - int GetNeuronParamSize(int i_node, std::string param_name); + int 
SetDistributionIntParam( std::string param_name, int val ); - int GetNeuronVarSize(int i_node, std::string var_name); + int SetDistributionScalParam( std::string param_name, float val ); - float *GetNeuronParam(int i_node, int n_neuron, std::string param_name); + int SetDistributionVectParam( std::string param_name, float val, int i ); - float *GetNeuronParam(int *i_node, int n_neuron, std::string param_name); + int SetDistributionFloatPtParam( std::string param_name, float* array_pt ); - float *GetNeuronParam(NodeSeq nodes, std::string param_name) { - return GetNeuronParam(nodes.i0, nodes.n, param_name); + int IsDistributionFloatParam( std::string param_name ); + + //////////////////////////////////////////////////////////////////////// + + int GetNeuronParamSize( int i_node, std::string param_name ); + + int GetNeuronVarSize( int i_node, std::string var_name ); + + float* GetNeuronParam( int i_node, int n_neuron, std::string param_name ); + + float* GetNeuronParam( int* i_node, int n_neuron, std::string param_name ); + + float* + GetNeuronParam( NodeSeq nodes, std::string param_name ) + { + return GetNeuronParam( nodes.i0, nodes.n, param_name ); } - - float *GetNeuronParam(std::vector nodes, std::string param_name) { - return GetNeuronParam(nodes.data(), nodes.size(), param_name); + + float* + GetNeuronParam( std::vector< int > nodes, std::string param_name ) + { + return GetNeuronParam( nodes.data(), nodes.size(), param_name ); } - float *GetArrayParam(int i_node, std::string param_name); - - int *GetNeuronIntVar(int i_node, int n_neuron, std::string var_name); + float* GetArrayParam( int i_node, std::string param_name ); - int *GetNeuronIntVar(int *i_node, int n_neuron, std::string var_name); + int* GetNeuronIntVar( int i_node, int n_neuron, std::string var_name ); - int *GetNeuronIntVar(NodeSeq nodes, std::string var_name) { - return GetNeuronIntVar(nodes.i0, nodes.n, var_name); + int* GetNeuronIntVar( int* i_node, int n_neuron, std::string var_name ); + + 
int* + GetNeuronIntVar( NodeSeq nodes, std::string var_name ) + { + return GetNeuronIntVar( nodes.i0, nodes.n, var_name ); } - - int *GetNeuronIntVar(std::vector nodes, std::string var_name) { - return GetNeuronIntVar(nodes.data(), nodes.size(), var_name); + + int* + GetNeuronIntVar( std::vector< int > nodes, std::string var_name ) + { + return GetNeuronIntVar( nodes.data(), nodes.size(), var_name ); } - - float *GetNeuronVar(int i_node, int n_neuron, std::string var_name); - float *GetNeuronVar(int *i_node, int n_neuron, std::string var_name); + float* GetNeuronVar( int i_node, int n_neuron, std::string var_name ); - float *GetNeuronVar(NodeSeq nodes, std::string var_name) { - return GetNeuronVar(nodes.i0, nodes.n, var_name); + float* GetNeuronVar( int* i_node, int n_neuron, std::string var_name ); + + float* + GetNeuronVar( NodeSeq nodes, std::string var_name ) + { + return GetNeuronVar( nodes.i0, nodes.n, var_name ); } - - float *GetNeuronVar(std::vector nodes, std::string var_name) { - return GetNeuronVar(nodes.data(), nodes.size(), var_name); + + float* + GetNeuronVar( std::vector< int > nodes, std::string var_name ) + { + return GetNeuronVar( nodes.data(), nodes.size(), var_name ); } - float *GetArrayVar(int i_node, std::string param_name); - - int GetNodeSequenceOffset(int i_node, int n_node, int &i_group); + float* GetArrayVar( int i_node, std::string param_name ); + + int GetNodeSequenceOffset( int i_node, int n_node, int& i_group ); + + std::vector< int > GetNodeArrayWithOffset( int* i_node, int n_node, int& i_group ); + + int IsNeuronScalParam( int i_node, std::string param_name ); - std::vector GetNodeArrayWithOffset(int *i_node, int n_node, - int &i_group); + int IsNeuronPortParam( int i_node, std::string param_name ); - int IsNeuronScalParam(int i_node, std::string param_name); + int IsNeuronArrayParam( int i_node, std::string param_name ); - int IsNeuronPortParam(int i_node, std::string param_name); + int IsNeuronIntVar( int i_node, std::string 
var_name ); - int IsNeuronArrayParam(int i_node, std::string param_name); + int IsNeuronScalVar( int i_node, std::string var_name ); - int IsNeuronIntVar(int i_node, std::string var_name); - - int IsNeuronScalVar(int i_node, std::string var_name); + int IsNeuronPortVar( int i_node, std::string var_name ); - int IsNeuronPortVar(int i_node, std::string var_name); + int IsNeuronArrayVar( int i_node, std::string var_name ); - int IsNeuronArrayVar(int i_node, std::string var_name); - - int SetSpikeGenerator(int i_node, int n_spikes, float *spike_time, - float *spike_height); + int SetSpikeGenerator( int i_node, int n_spikes, float* spike_time, float* spike_height ); int Calibrate(); - + int Simulate(); - int Simulate(float sim_time); + int Simulate( float sim_time ); int StartSimulation(); int SimulationStep(); int EndSimulation(); - - int ConnectMpiInit(int argc, char *argv[]); + + int ConnectMpiInit( int argc, char* argv[] ); int MpiFinalize(); - void SetErrorFlag(bool error_flag) {error_flag_ = error_flag;} - - void SetErrorMessage(std::string error_message) { error_message_ - = error_message; } + void + SetErrorFlag( bool error_flag ) + { + error_flag_ = error_flag; + } - void SetErrorCode(unsigned char error_code) {error_code_ = error_code;} + void + SetErrorMessage( std::string error_message ) + { + error_message_ = error_message; + } - void SetOnException(int on_exception) {on_exception_ = on_exception;} + void + SetErrorCode( unsigned char error_code ) + { + error_code_ = error_code; + } - bool GetErrorFlag() {return error_flag_;} + void + SetOnException( int on_exception ) + { + on_exception_ = on_exception; + } - char *GetErrorMessage() {return &error_message_[0];} + bool + GetErrorFlag() + { + return error_flag_; + } + + char* + GetErrorMessage() + { + return &error_message_[ 0 ]; + } + + unsigned char + GetErrorCode() + { + return error_code_; + } + + int + OnException() + { + return on_exception_; + } - unsigned char GetErrorCode() {return error_code_;} + 
unsigned int* RandomInt( size_t n ); + + float* RandomUniform( size_t n ); + + float* RandomNormal( size_t n, float mean, float stddev ); + + float* RandomNormalClipped( size_t n, float mean, float stddev, float vmin, float vmax, float vstep ); + + int Connect( inode_t i_source, + inode_t n_source, + inode_t i_target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); + + int Connect( inode_t i_source, + inode_t n_source, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); + + int Connect( inode_t* source, + inode_t n_source, + inode_t i_target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); + + int Connect( inode_t* source, + inode_t n_source, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); - int OnException() {return on_exception_;} + int Connect( NodeSeq source, NodeSeq target, ConnSpec& conn_spec, SynSpec& syn_spec ); - unsigned int *RandomInt(size_t n); - - float *RandomUniform(size_t n); + int Connect( NodeSeq source, std::vector< inode_t > target, ConnSpec& conn_spec, SynSpec& syn_spec ); - float *RandomNormal(size_t n, float mean, float stddev); + int Connect( std::vector< inode_t > source, NodeSeq target, ConnSpec& conn_spec, SynSpec& syn_spec ); - float *RandomNormalClipped(size_t n, float mean, float stddev, float vmin, - float vmax, float vstep); + int Connect( std::vector< inode_t > source, std::vector< inode_t > target, ConnSpec& conn_spec, SynSpec& syn_spec ); - int Connect(int i_source_node, int i_target_node, int port, - unsigned char syn_group, float weight, float delay); + int RemoteConnect( int i_source_host, + inode_t i_source, + inode_t n_source, + int i_target_host, + inode_t i_target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); - int Connect(int i_source, int n_source, int i_target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); + int RemoteConnect( int i_source_host, + inode_t i_source, + inode_t n_source, 
+ int i_target_host, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); - int Connect(int i_source, int n_source, int* target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); + int RemoteConnect( int i_source_host, + inode_t* source, + inode_t n_source, + int i_target_host, + inode_t i_target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); - int Connect(int* source, int n_source, int i_target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); + int RemoteConnect( int i_source_host, + inode_t* source, + inode_t n_source, + int i_target_host, + inode_t* target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); - int Connect(int* source, int n_source, int* target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); + int RemoteConnect( int i_source_host, + NodeSeq source, + int i_target_host, + NodeSeq target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); - int Connect(NodeSeq source, NodeSeq target, - ConnSpec &conn_spec, SynSpec &syn_spec); + int RemoteConnect( int i_source_host, + NodeSeq source, + int i_target_host, + std::vector< inode_t > target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); - int Connect(NodeSeq source, std::vector target, - ConnSpec &conn_spec, SynSpec &syn_spec); + int RemoteConnect( int i_source_host, + std::vector< inode_t > source, + int i_target_host, + NodeSeq target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); - int Connect(std::vector source, NodeSeq target, - ConnSpec &conn_spec, SynSpec &syn_spec); + int RemoteConnect( int i_source_host, + std::vector< inode_t > source, + int i_target_host, + std::vector< inode_t > target, + ConnSpec& conn_spec, + SynSpec& syn_spec ); - int Connect(std::vector source, std::vector target, - ConnSpec &conn_spec, SynSpec &syn_spec); + std::vector< std::string > GetScalVarNames( int i_node ); - int RemoteConnect(int i_source_host, int i_source, int n_source, - int i_target_host, int i_target, int n_target, - 
ConnSpec &conn_spec, SynSpec &syn_spec); + int GetNIntVar( int i_node ); - int RemoteConnect(int i_source_host, int i_source, int n_source, - int i_target_host, int* target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); + std::vector< std::string > GetIntVarNames( int i_node ); - int RemoteConnect(int i_source_host, int* source, int n_source, - int i_target_host, int i_target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); + int GetNScalVar( int i_node ); - int RemoteConnect(int i_source_host, int* source, int n_source, - int i_target_host, int* target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); + std::vector< std::string > GetPortVarNames( int i_node ); - int RemoteConnect(int i_source_host, NodeSeq source, - int i_target_host, NodeSeq target, - ConnSpec &conn_spec, SynSpec &syn_spec); + int GetNPortVar( int i_node ); - int RemoteConnect(int i_source_host, NodeSeq source, - int i_target_host, std::vector target, - ConnSpec &conn_spec, SynSpec &syn_spec); + std::vector< std::string > GetScalParamNames( int i_node ); - int RemoteConnect(int i_source_host, std::vector source, - int i_target_host, NodeSeq target, - ConnSpec &conn_spec, SynSpec &syn_spec); + int GetNScalParam( int i_node ); - int RemoteConnect(int i_source_host, std::vector source, - int i_target_host, std::vector target, - ConnSpec &conn_spec, SynSpec &syn_spec); + std::vector< std::string > GetPortParamNames( int i_node ); + int GetNPortParam( int i_node ); - std::vector GetScalVarNames(int i_node); + std::vector< std::string > GetArrayParamNames( int i_node ); - int GetNIntVar(int i_node); - - std::vector GetIntVarNames(int i_node); + int GetNArrayParam( int i_node ); - int GetNScalVar(int i_node); - - std::vector GetPortVarNames(int i_node); + std::vector< std::string > GetArrayVarNames( int i_node ); - int GetNPortVar(int i_node); - - std::vector GetScalParamNames(int i_node); + std::vector< std::string > GetGroupParamNames( int i_node ); - int GetNScalParam(int 
i_node); - - std::vector GetPortParamNames(int i_node); + int GetNGroupParam( int i_node ); - int GetNPortParam(int i_node); - - std::vector GetArrayParamNames(int i_node); + int GetNArrayVar( int i_node ); - int GetNArrayParam(int i_node); + int GetConnectionFloatParamIndex( std::string param_name ); - std::vector GetArrayVarNames(int i_node); + int GetConnectionIntParamIndex( std::string param_name ); - std::vector GetGroupParamNames(int i_node); + int IsConnectionFloatParam( std::string param_name ); - int GetNGroupParam(int i_node); - - int GetNArrayVar(int i_node); + int IsConnectionIntParam( std::string param_name ); - int GetConnectionFloatParamIndex(std::string param_name); - - int GetConnectionIntParamIndex(std::string param_name); - - int IsConnectionFloatParam(std::string param_name); - - int IsConnectionIntParam(std::string param_name); - - int GetConnectionFloatParam(int64_t *conn_ids, int64_t n_conn, - float *h_param_arr, - std::string param_name); - - int GetConnectionIntParam(int64_t *conn_ids, int64_t n_conn, - int *h_param_arr, - std::string param_name); + int GetConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float* h_param_arr, std::string param_name ); - int SetConnectionFloatParamDistr(int64_t *conn_ids, int64_t n_conn, - std::string param_name); + int GetConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int* h_param_arr, std::string param_name ); - int SetConnectionFloatParam(int64_t *conn_ids, int64_t n_conn, - float val, std::string param_name); + int SetConnectionFloatParamDistr( int64_t* conn_ids, int64_t n_conn, std::string param_name ); - int SetConnectionIntParamArr(int64_t *conn_ids, int64_t n_conn, - int *h_param_arr, - std::string param_name); + int SetConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float val, std::string param_name ); - int SetConnectionIntParam(int64_t *conn_ids, int64_t n_conn, - int val, std::string param_name); + int SetConnectionIntParamArr( int64_t* conn_ids, int64_t n_conn, int* 
h_param_arr, std::string param_name ); - int GetConnectionStatus(int64_t *conn_ids, int64_t n_conn, - int *i_source, int *i_target, int *port, - unsigned char *syn_group, float *delay, - float *weight); + int SetConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int val, std::string param_name ); - int64_t *GetConnections(int i_source, int n_source, - int i_target, int n_target, - int syn_group, int64_t *n_conn); + int GetConnectionStatus( int64_t* conn_ids, + int64_t n_conn, + inode_t* source, + inode_t* target, + int* port, + int* syn_group, + float* delay, + float* weight ); - int64_t *GetConnections(int *i_source_pt, int n_source, - int i_target, int n_target, - int syn_group, int64_t *n_conn); + int64_t* GetConnections( inode_t i_source, + inode_t n_source, + inode_t i_target, + inode_t n_target, + int syn_group, + int64_t* n_conn ); - int64_t *GetConnections(int i_source, int n_source, - int *i_target_pt, int n_target, - int syn_group, int64_t *n_conn); + int64_t* GetConnections( inode_t* i_source_pt, + inode_t n_source, + inode_t i_target, + inode_t n_target, + int syn_group, + int64_t* n_conn ); - int64_t *GetConnections(int *i_source_pt, int n_source, - int *i_target_pt, int n_target, - int syn_group, int64_t *n_conn); - - int64_t *GetConnections(NodeSeq source, NodeSeq target, - int syn_group, int64_t *n_conn); + int64_t* GetConnections( inode_t i_source, + inode_t n_source, + inode_t* i_target_pt, + inode_t n_target, + int syn_group, + int64_t* n_conn ); - int64_t *GetConnections(std::vector source, NodeSeq target, - int syn_group, int64_t *n_conn); + int64_t* GetConnections( inode_t* i_source_pt, + inode_t n_source, + inode_t* i_target_pt, + inode_t n_target, + int syn_group, + int64_t* n_conn ); - int64_t *GetConnections(NodeSeq source, std::vector target, - int syn_group, int64_t *n_conn); + int64_t* GetConnections( NodeSeq source, NodeSeq target, int syn_group, int64_t* n_conn ); - int64_t *GetConnections(std::vector source, std::vector target, - 
int syn_group, int64_t *n_conn); + int64_t* GetConnections( std::vector< inode_t > source, NodeSeq target, int syn_group, int64_t* n_conn ); - int CreateSynGroup(std::string model_name); + int64_t* GetConnections( NodeSeq source, std::vector< inode_t > target, int syn_group, int64_t* n_conn ); - int GetSynGroupNParam(int syn_group); + int64_t* + GetConnections( std::vector< inode_t > source, std::vector< inode_t > target, int syn_group, int64_t* n_conn ); - std::vector GetSynGroupParamNames(int syn_group); + int CreateSynGroup( std::string model_name ); - bool IsSynGroupParam(int syn_group, std::string param_name); + int GetSynGroupNParam( int syn_group ); - int GetSynGroupParamIdx(int syn_group, std::string param_name); + std::vector< std::string > GetSynGroupParamNames( int syn_group ); - float GetSynGroupParam(int syn_group, std::string param_name); + bool IsSynGroupParam( int syn_group, std::string param_name ); - int SetSynGroupParam(int syn_group, std::string param_name, float val); + int GetSynGroupParamIdx( int syn_group, std::string param_name ); + + float GetSynGroupParam( int syn_group, std::string param_name ); + + int SetSynGroupParam( int syn_group, std::string param_name, float val ); int SynGroupCalibrate(); - int ActivateSpikeCount(int i_node, int n_node); - - int ActivateSpikeCount(NodeSeq nodes) { - return ActivateSpikeCount(nodes.i0, nodes.n); + int ActivateSpikeCount( int i_node, int n_node ); + + int + ActivateSpikeCount( NodeSeq nodes ) + { + return ActivateSpikeCount( nodes.i0, nodes.n ); } - int ActivateRecSpikeTimes(int i_node, int n_node, int max_n_rec_spike_times); - - int ActivateRecSpikeTimes(NodeSeq nodes, int max_n_rec_spike_times) { - return ActivateRecSpikeTimes(nodes.i0, nodes.n, max_n_rec_spike_times); + int ActivateRecSpikeTimes( int i_node, int n_node, int max_n_rec_spike_times ); + + int + ActivateRecSpikeTimes( NodeSeq nodes, int max_n_rec_spike_times ) + { + return ActivateRecSpikeTimes( nodes.i0, nodes.n, 
max_n_rec_spike_times ); } - int SetRecSpikeTimesStep(int i_node, int n_node, int rec_spike_times_step); + int SetRecSpikeTimesStep( int i_node, int n_node, int rec_spike_times_step ); - int SetRecSpikeTimesStep(NodeSeq nodes, int rec_spike_times_step) { - return SetRecSpikeTimesStep(nodes.i0, nodes.n, rec_spike_times_step); + int + SetRecSpikeTimesStep( NodeSeq nodes, int rec_spike_times_step ) + { + return SetRecSpikeTimesStep( nodes.i0, nodes.n, rec_spike_times_step ); } - int GetNRecSpikeTimes(int i_node); + int GetNRecSpikeTimes( int i_node ); - int GetRecSpikeTimes(int i_node, int n_node, int **n_spike_times_pt, - float ***spike_times_pt); + int GetRecSpikeTimes( int i_node, int n_node, int** n_spike_times_pt, float*** spike_times_pt ); - int GetRecSpikeTimes(NodeSeq nodes, int **n_spike_times_pt, - float ***spike_times_pt) { - return GetRecSpikeTimes(nodes.i0, nodes.n, n_spike_times_pt, - spike_times_pt); + int + GetRecSpikeTimes( NodeSeq nodes, int** n_spike_times_pt, float*** spike_times_pt ) + { + return GetRecSpikeTimes( nodes.i0, nodes.n, n_spike_times_pt, spike_times_pt ); } - int PushSpikesToNodes(int n_spikes, int *node_id, float *spike_height); - - int PushSpikesToNodes(int n_spikes, int *node_id); + int PushSpikesToNodes( int n_spikes, int* node_id, float* spike_height ); - int GetExtNeuronInputSpikes(int *n_spikes, int **node, int **port, - float **spike_height, bool include_zeros); + int PushSpikesToNodes( int n_spikes, int* node_id ); - int SetNeuronGroupParam(int i_node, int n_node, - std::string param_name, float val); - - int IsNeuronGroupParam(int i_node, std::string param_name); + int GetExtNeuronInputSpikes( int* n_spikes, int** node, int** port, float** spike_height, bool include_zeros ); - float GetNeuronGroupParam(int i_node, std::string param_name); + int SetNeuronGroupParam( int i_node, int n_node, std::string param_name, float val ); + + int IsNeuronGroupParam( int i_node, std::string param_name ); + + float GetNeuronGroupParam( int 
i_node, std::string param_name ); - // Calibrate remote connection maps - int RemoteConnectionMapCalibrate(int i_host, int n_hosts); - int ExternalSpikeInit(); int ExternalSpikeReset(); int CopySpikeFromRemote(); - int SendSpikeToRemote(int n_ext_spikes); + int SendSpikeToRemote( int n_ext_spikes ); int RecvSpikeFromRemote(); - int organizeExternalSpikes(int n_ext_spikes); - + int organizeExternalSpikes( int n_ext_spikes ); }; - #endif diff --git a/src/nestgpu_C.cpp b/src/nestgpu_C.cpp index c867ea3a8..7791eb3d8 100644 --- a/src/nestgpu_C.cpp +++ b/src/nestgpu_C.cpp @@ -20,1288 +20,2053 @@ * */ - - - - #include -#include #include -#include #include +#include +#include #include "nestgpu.h" #include "nestgpu_C.h" #include "propagate_error.h" -extern "C" { - static NESTGPU *NESTGPU_instance = NULL; +extern "C" +{ + static NESTGPU* NESTGPU_instance = nullptr; ConnSpec ConnSpec_instance; SynSpec SynSpec_instance; - void checkNESTGPUInstance() { - if (NESTGPU_instance == NULL) { + void + checkNESTGPUInstance() + { + if ( NESTGPU_instance == nullptr ) + { NESTGPU_instance = new NESTGPU(); } } - - char *NESTGPU_GetErrorMessage() + + char* + NESTGPU_GetErrorMessage() { checkNESTGPUInstance(); - char *cstr = NESTGPU_instance->GetErrorMessage(); + char* cstr = NESTGPU_instance->GetErrorMessage(); return cstr; } - unsigned char NESTGPU_GetErrorCode() + unsigned char + NESTGPU_GetErrorCode() { checkNESTGPUInstance(); return NESTGPU_instance->GetErrorCode(); } - void NESTGPU_SetOnException(int on_exception) + void + NESTGPU_SetOnException( int on_exception ) { checkNESTGPUInstance(); - NESTGPU_instance->SetOnException(on_exception); - } - - unsigned int *RandomInt(size_t n); - - int NESTGPU_SetRandomSeed(unsigned long long seed) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->SetRandomSeed(seed); - } END_ERR_PROP return ret; } - - int NESTGPU_SetTimeResolution(float time_res) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->SetTimeResolution(time_res); - } 
END_ERR_PROP return ret; } - - float NESTGPU_GetTimeResolution() - { float ret = 0.0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetTimeResolution(); - } END_ERR_PROP return ret; } - - int NESTGPU_SetMaxSpikeBufferSize(int max_size) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->SetMaxSpikeBufferSize(max_size); - } END_ERR_PROP return ret; } - - int NESTGPU_GetMaxSpikeBufferSize() - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetMaxSpikeBufferSize(); - } END_ERR_PROP return ret; } - - int NESTGPU_SetSimTime(float sim_time) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->SetSimTime(sim_time); - } END_ERR_PROP return ret; } - - int NESTGPU_SetVerbosityLevel(int verbosity_level) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->SetVerbosityLevel(verbosity_level); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNestedLoopAlgo(int nested_loop_algo) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->SetNestedLoopAlgo(nested_loop_algo); - } END_ERR_PROP return ret; } - - - int NESTGPU_Create(char *model_name, int n_neuron, int n_port) - { int ret = 0; BEGIN_ERR_PROP { - std::string model_name_str = std::string(model_name); - NodeSeq neur = NESTGPU_instance->Create(model_name_str, n_neuron, - n_port); - ret = neur[0]; - } END_ERR_PROP return ret; } - - int NESTGPU_CreateRecord(char *file_name, char *var_name_arr[], - int *i_node_arr, int *port_arr, - int n_node) - { int ret = 0; BEGIN_ERR_PROP { - std::string file_name_str = std::string(file_name); - std::vector var_name_vect; - for (int i=0; iCreateRecord - (file_name_str, var_name_vect.data(), i_node_arr, port_arr, - n_node); - } END_ERR_PROP return ret; } - - int NESTGPU_GetRecordDataRows(int i_record) - { int ret = 0; BEGIN_ERR_PROP { - std::vector > *data_vect_pt - = NESTGPU_instance->GetRecordData(i_record); - - ret = data_vect_pt->size(); - } END_ERR_PROP return ret; } - - int NESTGPU_GetRecordDataColumns(int i_record) - { int ret = 0; BEGIN_ERR_PROP { - 
std::vector > *data_vect_pt - = NESTGPU_instance->GetRecordData(i_record); - - ret = data_vect_pt->at(0).size(); - } END_ERR_PROP return ret; } - - float **NESTGPU_GetRecordData(int i_record) - { float **ret = NULL; BEGIN_ERR_PROP { - std::vector > *data_vect_pt - = NESTGPU_instance->GetRecordData(i_record); - int nr = data_vect_pt->size(); - ret = new float*[nr]; - for (int i=0; iat(i).data(); - } - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronScalParam(int i_node, int n_neuron, char *param_name, - float val) - { int ret = 0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetNeuronParam(i_node, n_neuron, - param_name_str, val); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronArrayParam(int i_node, int n_neuron, - char *param_name, float *param, - int array_size) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetNeuronParam(i_node, n_neuron, - param_name_str, param, - array_size); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronPtScalParam(int *i_node, int n_neuron, - char *param_name,float val) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetNeuronParam(i_node, n_neuron, - param_name_str, val); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronPtArrayParam(int *i_node, int n_neuron, - char *param_name, float *param, - int array_size) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetNeuronParam(i_node, n_neuron, - param_name_str, param, - array_size); - } END_ERR_PROP return ret; } - - int NESTGPU_IsNeuronScalParam(int i_node, char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->IsNeuronScalParam(i_node, param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_IsNeuronPortParam(int 
i_node, char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->IsNeuronPortParam(i_node, param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_IsNeuronArrayParam(int i_node, char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->IsNeuronArrayParam(i_node, param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_SetNeuronIntVar(int i_node, int n_neuron, char *var_name, - int val) - { int ret = 0; BEGIN_ERR_PROP { - - std::string var_name_str = std::string(var_name); - ret = NESTGPU_instance->SetNeuronIntVar(i_node, n_neuron, - var_name_str, val); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronScalVar(int i_node, int n_neuron, char *var_name, - float val) - { int ret = 0; BEGIN_ERR_PROP { - - std::string var_name_str = std::string(var_name); - ret = NESTGPU_instance->SetNeuronVar(i_node, n_neuron, - var_name_str, val); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronArrayVar(int i_node, int n_neuron, - char *var_name, float *var, - int array_size) - { int ret = 0; BEGIN_ERR_PROP { - std::string var_name_str = std::string(var_name); - ret = NESTGPU_instance->SetNeuronVar(i_node, n_neuron, - var_name_str, var, - array_size); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronPtIntVar(int *i_node, int n_neuron, - char *var_name, int val) - { int ret = 0; BEGIN_ERR_PROP { - std::string var_name_str = std::string(var_name); - ret = NESTGPU_instance->SetNeuronIntVar(i_node, n_neuron, - var_name_str, val); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronPtScalVar(int *i_node, int n_neuron, - char *var_name, float val) - { int ret = 0; BEGIN_ERR_PROP { - std::string var_name_str = std::string(var_name); - ret = NESTGPU_instance->SetNeuronVar(i_node, n_neuron, - var_name_str, val); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronPtArrayVar(int *i_node, int 
n_neuron, - char *var_name, float *var, - int array_size) - { int ret = 0; BEGIN_ERR_PROP { - std::string var_name_str = std::string(var_name); - ret = NESTGPU_instance->SetNeuronVar(i_node, n_neuron, - var_name_str, var, - array_size); - } END_ERR_PROP return ret; } - - - - int NESTGPU_SetNeuronScalParamDistr(int i_node, int n_neuron, - char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetNeuronScalParamDistr(i_node, n_neuron, - param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronScalVarDistr(int i_node, int n_neuron, - char *var_name) - { int ret = 0; BEGIN_ERR_PROP { - - std::string var_name_str = std::string(var_name); - ret = NESTGPU_instance->SetNeuronScalVarDistr(i_node, n_neuron, - var_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_SetNeuronPortParamDistr(int i_node, int n_neuron, - char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetNeuronPortParamDistr(i_node, n_neuron, - param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronPortVarDistr(int i_node, int n_neuron, - char *var_name) - { int ret = 0; BEGIN_ERR_PROP { - - std::string var_name_str = std::string(var_name); - ret = NESTGPU_instance->SetNeuronPortVarDistr(i_node, n_neuron, - var_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronPtScalParamDistr(int *i_node, int n_neuron, - char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetNeuronPtScalParamDistr(i_node, n_neuron, - param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronPtScalVarDistr(int *i_node, int n_neuron, - char *var_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string var_name_str = std::string(var_name); - ret = NESTGPU_instance->SetNeuronPtScalVarDistr(i_node, n_neuron, - var_name_str); - } 
END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronPtPortParamDistr(int *i_node, int n_neuron, - char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetNeuronPtPortParamDistr(i_node, n_neuron, - param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronPtPortVarDistr(int *i_node, int n_neuron, - char *var_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string var_name_str = std::string(var_name); - ret = NESTGPU_instance->SetNeuronPtPortVarDistr(i_node, n_neuron, - var_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_SetDistributionIntParam(char *param_name, int val) - { int ret = 0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetDistributionIntParam(param_name_str, val); - } END_ERR_PROP return ret; } - - int NESTGPU_SetDistributionScalParam(char *param_name, float val) - { int ret = 0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetDistributionScalParam(param_name_str, val); - } END_ERR_PROP return ret; } - - int NESTGPU_SetDistributionVectParam(char *param_name, float val, int i) - { int ret = 0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetDistributionVectParam(param_name_str, val, i); - } END_ERR_PROP return ret; } - - int NESTGPU_SetDistributionFloatPtParam(char *param_name, float *array_pt) - { int ret = 0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetDistributionFloatPtParam(param_name_str, - array_pt); - } END_ERR_PROP return ret; } - - int NESTGPU_IsDistributionFloatParam(char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->IsDistributionFloatParam(param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_IsNeuronIntVar(int i_node, 
char *var_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string var_name_str = std::string(var_name); - - ret = NESTGPU_instance->IsNeuronIntVar(i_node, var_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_IsNeuronScalVar(int i_node, char *var_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string var_name_str = std::string(var_name); - - ret = NESTGPU_instance->IsNeuronScalVar(i_node, var_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_IsNeuronPortVar(int i_node, char *var_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string var_name_str = std::string(var_name); - - ret = NESTGPU_instance->IsNeuronPortVar(i_node, var_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_IsNeuronArrayVar(int i_node, char *var_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string var_name_str = std::string(var_name); - - ret = NESTGPU_instance->IsNeuronArrayVar(i_node, var_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_GetNeuronParamSize(int i_node, char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->GetNeuronParamSize(i_node, param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_GetNeuronVarSize(int i_node, char *var_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string var_name_str = std::string(var_name); - - ret = NESTGPU_instance->GetNeuronVarSize(i_node, var_name_str); - } END_ERR_PROP return ret; } - - - float *NESTGPU_GetNeuronParam(int i_node, int n_neuron, - char *param_name) - { float *ret = NULL; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->GetNeuronParam(i_node, n_neuron, - param_name_str); - } END_ERR_PROP return ret; } - - - float *NESTGPU_GetNeuronPtParam(int *i_node, int n_neuron, - char *param_name) - { float *ret = NULL; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->GetNeuronParam(i_node, n_neuron, - param_name_str); - } 
END_ERR_PROP return ret; } - - - float *NESTGPU_GetArrayParam(int i_node, char *param_name) - { float *ret = NULL; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->GetArrayParam(i_node, param_name_str); - } END_ERR_PROP return ret; } - - - int *NESTGPU_GetNeuronIntVar(int i_node, int n_neuron, - char *param_name) - { int *ret = NULL; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->GetNeuronIntVar(i_node, n_neuron, - param_name_str); - } END_ERR_PROP return ret; } - - - int *NESTGPU_GetNeuronPtIntVar(int *i_node, int n_neuron, - char *param_name) - { int *ret = NULL; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->GetNeuronIntVar(i_node, n_neuron, - param_name_str); - } END_ERR_PROP return ret; } - - float *NESTGPU_GetNeuronVar(int i_node, int n_neuron, - char *param_name) - { float *ret = NULL; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->GetNeuronVar(i_node, n_neuron, - param_name_str); - } END_ERR_PROP return ret; } - - - float *NESTGPU_GetNeuronPtVar(int *i_node, int n_neuron, - char *param_name) - { float *ret = NULL; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->GetNeuronVar(i_node, n_neuron, - param_name_str); - } END_ERR_PROP return ret; } - - float *NESTGPU_GetArrayVar(int i_node, char *var_name) - { float *ret = NULL; BEGIN_ERR_PROP { - - std::string var_name_str = std::string(var_name); - ret = NESTGPU_instance->GetArrayVar(i_node, var_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_Calibrate() - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->Calibrate(); - } END_ERR_PROP return ret; } - - int NESTGPU_Simulate() - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->Simulate(); - } END_ERR_PROP return ret; } - - int NESTGPU_StartSimulation() - { int ret = 0; 
BEGIN_ERR_PROP { - ret = NESTGPU_instance->StartSimulation(); - } END_ERR_PROP return ret; } - - int NESTGPU_SimulationStep() - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->SimulationStep(); - } END_ERR_PROP return ret; } - - int NESTGPU_EndSimulation() - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->EndSimulation(); - } END_ERR_PROP return ret; } - - int NESTGPU_ConnectMpiInit(int argc, char *argv[]) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->ConnectMpiInit(argc, argv); - } END_ERR_PROP return ret; } - - int NESTGPU_MpiFinalize() - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->MpiFinalize(); - } END_ERR_PROP return ret; } - - int NESTGPU_HostId() - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->HostId(); - } END_ERR_PROP return ret; } - - int NESTGPU_HostNum() - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->HostNum(); - } END_ERR_PROP return ret; } - - unsigned int *NESTGPU_RandomInt(size_t n) - { unsigned int *ret = NULL; BEGIN_ERR_PROP { - ret = NESTGPU_instance->RandomInt(n); - } END_ERR_PROP return ret; } - - float *NESTGPU_RandomUniform(size_t n) - { float* ret = NULL; BEGIN_ERR_PROP { - ret = NESTGPU_instance->RandomUniform(n); - } END_ERR_PROP return ret; } - - float *NESTGPU_RandomNormal(size_t n, float mean, float stddev) - { float *ret = NULL; BEGIN_ERR_PROP { - ret = NESTGPU_instance->RandomNormal(n, mean, stddev); - } END_ERR_PROP return ret; } - - float *NESTGPU_RandomNormalClipped(size_t n, float mean, float stddev, - float vmin, float vmax, float vstep) - { float *ret = NULL; BEGIN_ERR_PROP { - ret = NESTGPU_instance->RandomNormalClipped(n, mean, stddev, vmin, - vmax, vstep); - } END_ERR_PROP return ret; } - - int NESTGPU_ConnSpecInit() - { int ret = 0; BEGIN_ERR_PROP { - ret = ConnSpec_instance.Init(); - } END_ERR_PROP return ret; } - - int NESTGPU_SetConnSpecParam(char *param_name, int value) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = 
std::string(param_name); - ret = ConnSpec_instance.SetParam(param_name_str, value); - } END_ERR_PROP return ret; } - - int NESTGPU_ConnSpecIsParam(char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = ConnSpec::IsParam(param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SynSpecInit() - { int ret = 0; BEGIN_ERR_PROP { - ret = SynSpec_instance.Init(); - } END_ERR_PROP return ret; } - - int NESTGPU_SetSynSpecIntParam(char *param_name, int value) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = SynSpec_instance.SetParam(param_name_str, value); - } END_ERR_PROP return ret; } - - int NESTGPU_SetSynSpecFloatParam(char *param_name, float value) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = SynSpec_instance.SetParam(param_name_str, value); - } END_ERR_PROP return ret; } - - int NESTGPU_SetSynSpecFloatPtParam(char *param_name, float *array_pt) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = SynSpec_instance.SetParam(param_name_str, array_pt); - } END_ERR_PROP return ret; } - - int NESTGPU_SynSpecIsIntParam(char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = SynSpec_instance.IsIntParam(param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SynSpecIsFloatParam(char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = SynSpec_instance.IsFloatParam(param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SynSpecIsFloatPtParam(char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = SynSpec_instance.IsFloatPtParam(param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_ConnectSeqSeq(int i_source, int n_source, int i_target, - int n_target) - { int ret = 0; 
BEGIN_ERR_PROP { - ret = NESTGPU_instance->Connect(i_source, n_source, i_target, n_target, - ConnSpec_instance, SynSpec_instance); - } END_ERR_PROP return ret; } - - int NESTGPU_ConnectSeqGroup(int i_source, int n_source, int *i_target, - int n_target) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->Connect(i_source, n_source, i_target, n_target, - ConnSpec_instance, SynSpec_instance); - } END_ERR_PROP return ret; } - - int NESTGPU_ConnectGroupSeq(int *i_source, int n_source, int i_target, - int n_target) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->Connect(i_source, n_source, i_target, n_target, - ConnSpec_instance, SynSpec_instance); - } END_ERR_PROP return ret; } - - int NESTGPU_ConnectGroupGroup(int *i_source, int n_source, int *i_target, - int n_target) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->Connect(i_source, n_source, i_target, n_target, - ConnSpec_instance, SynSpec_instance); - } END_ERR_PROP return ret; } - - int NESTGPU_RemoteConnectSeqSeq(int i_source_host, int i_source, - int n_source, int i_target_host, - int i_target, int n_target) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->RemoteConnect(i_source_host, i_source, n_source, - i_target_host, i_target, n_target, - ConnSpec_instance, - SynSpec_instance); - } END_ERR_PROP return ret; } - - int NESTGPU_RemoteConnectSeqGroup(int i_source_host, int i_source, - int n_source, int i_target_host, - int *i_target, int n_target) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->RemoteConnect(i_source_host, i_source, n_source, - i_target_host, i_target, n_target, - ConnSpec_instance, - SynSpec_instance); - } END_ERR_PROP return ret; } - - int NESTGPU_RemoteConnectGroupSeq(int i_source_host, int *i_source, - int n_source, int i_target_host, - int i_target, int n_target) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->RemoteConnect(i_source_host, i_source, n_source, - i_target_host, i_target, n_target, - ConnSpec_instance, - 
SynSpec_instance); - } END_ERR_PROP return ret; } - - - int NESTGPU_RemoteConnectGroupGroup(int i_source_host, int *i_source, - int n_source, int i_target_host, - int *i_target, int n_target) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->RemoteConnect(i_source_host, i_source, n_source, - i_target_host, i_target, n_target, - ConnSpec_instance, - SynSpec_instance); - } END_ERR_PROP return ret; } - - - char **NESTGPU_GetIntVarNames(int i_node) - { char **ret = NULL; BEGIN_ERR_PROP { - std::vector var_name_vect = - NESTGPU_instance->GetIntVarNames(i_node); - char **var_name_array = (char**)malloc(var_name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; i var_name_vect = - NESTGPU_instance->GetScalVarNames(i_node); - char **var_name_array = (char**)malloc(var_name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; iGetNIntVar(i_node); - } END_ERR_PROP return ret; } - - int NESTGPU_GetNScalVar(int i_node) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetNScalVar(i_node); - } END_ERR_PROP return ret; } - - - char **NESTGPU_GetPortVarNames(int i_node) - { char **ret = NULL; BEGIN_ERR_PROP { - std::vector var_name_vect = - NESTGPU_instance->GetPortVarNames(i_node); - char **var_name_array = (char**)malloc(var_name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; iGetNPortVar(i_node); - } END_ERR_PROP return ret; } - - - char **NESTGPU_GetScalParamNames(int i_node) - { char **ret = NULL; BEGIN_ERR_PROP { - std::vector var_name_vect = - NESTGPU_instance->GetScalParamNames(i_node); - char **var_name_array = (char**)malloc(var_name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; iGetNScalParam(i_node); - } END_ERR_PROP return ret; } - - - char **NESTGPU_GetGroupParamNames(int i_node) - { char **ret = NULL; BEGIN_ERR_PROP { - std::vector var_name_vect = - NESTGPU_instance->GetGroupParamNames(i_node); - char **var_name_array = (char**)malloc(var_name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; 
iGetNGroupParam(i_node); - } END_ERR_PROP return ret; } - - - char **NESTGPU_GetPortParamNames(int i_node) - { char **ret = NULL; BEGIN_ERR_PROP { - std::vector var_name_vect = - NESTGPU_instance->GetPortParamNames(i_node); - char **var_name_array = (char**)malloc(var_name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; iGetNPortParam(i_node); - } END_ERR_PROP return ret; } - - - char **NESTGPU_GetArrayParamNames(int i_node) - { char **ret = NULL; BEGIN_ERR_PROP { - std::vector var_name_vect = - NESTGPU_instance->GetArrayParamNames(i_node); - char **var_name_array = (char**)malloc(var_name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; iGetNArrayParam(i_node); - } END_ERR_PROP return ret; } - - char **NESTGPU_GetArrayVarNames(int i_node) - { char **ret = NULL; BEGIN_ERR_PROP { - std::vector var_name_vect = - NESTGPU_instance->GetArrayVarNames(i_node); - char **var_name_array = (char**)malloc(var_name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; iGetNArrayVar(i_node); - } END_ERR_PROP return ret; } - - - int64_t *NESTGPU_GetSeqSeqConnections(int i_source, int n_source, - int i_target, int n_target, - int syn_group, int64_t *n_conn) - { int64_t *ret = NULL; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetConnections(i_source, n_source, i_target, - n_target, syn_group, n_conn); - } END_ERR_PROP return ret; } - - int64_t *NESTGPU_GetSeqGroupConnections(int i_source, int n_source, - int *i_target_pt, int n_target, - int syn_group, int64_t *n_conn) - { int64_t *ret = NULL; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetConnections(i_source, n_source, i_target_pt, - n_target, syn_group, n_conn); - } END_ERR_PROP return ret; } - - int64_t *NESTGPU_GetGroupSeqConnections(int *i_source_pt, int n_source, - int i_target, int n_target, - int syn_group, int64_t *n_conn) - { int64_t *ret = NULL; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetConnections(i_source_pt, n_source, i_target, - n_target, syn_group, n_conn); - } END_ERR_PROP return ret; } - - 
int64_t *NESTGPU_GetGroupGroupConnections(int *i_source_pt, int n_source, - int *i_target_pt, int n_target, - int syn_group, int64_t *n_conn) - { int64_t *ret = NULL; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetConnections(i_source_pt, n_source, - i_target_pt, n_target, - syn_group, n_conn); - } END_ERR_PROP return ret; } - - int NESTGPU_GetConnectionStatus(int64_t *conn_ids, int64_t n_conn, - int *i_source, int *i_target, - int *port, - unsigned char *syn_group, float *delay, - float *weight) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetConnectionStatus - (conn_ids, n_conn, i_source, i_target, port, syn_group, delay, - weight); - } END_ERR_PROP return ret; } - - int NESTGPU_IsConnectionFloatParam(char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->IsConnectionFloatParam(param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_IsConnectionIntParam(char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->IsConnectionIntParam(param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_GetConnectionFloatParam(int64_t *conn_ids, int64_t n_conn, - float *param_arr, char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->GetConnectionFloatParam(conn_ids, n_conn, - param_arr, - param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_GetConnectionIntParam(int64_t *conn_ids, int64_t n_conn, - int *param_arr, char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->GetConnectionIntParam(conn_ids, n_conn, - param_arr, - param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SetConnectionFloatParamDistr(int64_t *conn_ids, int64_t n_conn, - char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = 
std::string(param_name); - ret = NESTGPU_instance->SetConnectionFloatParamDistr(conn_ids, n_conn, - param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SetConnectionIntParamArr(int64_t *conn_ids, int64_t n_conn, - int *param_arr, char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetConnectionIntParamArr(conn_ids, n_conn, - param_arr, - param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SetConnectionFloatParam(int64_t *conn_ids, int64_t n_conn, - float val, char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetConnectionFloatParam(conn_ids, n_conn, - val, param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_SetConnectionIntParam(int64_t *conn_ids, int64_t n_conn, - int val, char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetConnectionIntParam(conn_ids, n_conn, - val, param_name_str); - } END_ERR_PROP return ret; } - - int NESTGPU_CreateSynGroup(char *model_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string model_name_str = std::string(model_name); - ret = NESTGPU_instance->CreateSynGroup(model_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_GetSynGroupNParam(int i_syn_group) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetSynGroupNParam(i_syn_group); - } END_ERR_PROP return ret; } - - - char **NESTGPU_GetSynGroupParamNames(int i_syn_group) - { char **ret = NULL; BEGIN_ERR_PROP { - std::vector name_vect = - NESTGPU_instance->GetSynGroupParamNames(i_syn_group); - char **name_array = (char**)malloc(name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; iIsSynGroupParam(i_syn_group, param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_GetSynGroupParamIdx(int i_syn_group, char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string 
param_name_str = std::string(param_name); - - ret = NESTGPU_instance->GetSynGroupParamIdx(i_syn_group, param_name_str); - } END_ERR_PROP return ret; } - - - float NESTGPU_GetSynGroupParam(int i_syn_group, char *param_name) - { float ret = 0.0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->GetSynGroupParam(i_syn_group, param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_SetSynGroupParam(int i_syn_group, char *param_name, float val) - { float ret = 0.0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetSynGroupParam(i_syn_group, param_name_str, - val); - } END_ERR_PROP return ret; } - - - int NESTGPU_ActivateSpikeCount(int i_node, int n_node) - { int ret = 0; BEGIN_ERR_PROP { - - ret = NESTGPU_instance->ActivateSpikeCount(i_node, n_node); - } END_ERR_PROP return ret; } - - - int NESTGPU_ActivateRecSpikeTimes(int i_node, int n_node, - int max_n_rec_spike_times) - { int ret = 0; BEGIN_ERR_PROP { - - ret = NESTGPU_instance->ActivateRecSpikeTimes(i_node, n_node, - max_n_rec_spike_times); - } END_ERR_PROP return ret; } - - int NESTGPU_SetRecSpikeTimesStep(int i_node, int n_node, - int rec_spike_times_step) - { int ret = 0; BEGIN_ERR_PROP { - - ret = NESTGPU_instance->SetRecSpikeTimesStep(i_node, n_node, - rec_spike_times_step); - } END_ERR_PROP return ret; } - - - int NESTGPU_GetNRecSpikeTimes(int i_node) - { int ret = 0; BEGIN_ERR_PROP { - - ret = NESTGPU_instance->GetNRecSpikeTimes(i_node); - } END_ERR_PROP return ret; } - - int NESTGPU_GetRecSpikeTimes(int i_node, int n_node, - int **n_spike_times_pt, - float ***spike_times_pt) - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetRecSpikeTimes(i_node, n_node, n_spike_times_pt, - spike_times_pt); - - } END_ERR_PROP return ret; } - - int NESTGPU_PushSpikesToNodes(int n_spikes, int *node_id) - { int ret = 0; BEGIN_ERR_PROP { - - ret = NESTGPU_instance->PushSpikesToNodes(n_spikes, node_id); - 
} END_ERR_PROP return ret; } - - int NESTGPU_GetExtNeuronInputSpikes(int *n_spikes, int **node, int **port, - float **spike_height, int include_zeros) - { int ret = 0; BEGIN_ERR_PROP { - - ret = NESTGPU_instance->GetExtNeuronInputSpikes(n_spikes, node, port, - spike_height, - include_zeros>0); - } END_ERR_PROP return ret; } - - int NESTGPU_SetNeuronGroupParam(int i_node, int n_node, char *param_name, - float val) - { float ret = 0.0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetNeuronGroupParam(i_node, n_node, - param_name_str, - val); - } END_ERR_PROP return ret; } - - int NESTGPU_IsNeuronGroupParam(int i_node, char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->IsNeuronGroupParam(i_node, param_name_str); - } END_ERR_PROP return ret; } - - float NESTGPU_GetNeuronGroupParam(int i_node, char *param_name) - { float ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->GetNeuronGroupParam(i_node, param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_GetNBoolParam() - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetNBoolParam(); - } END_ERR_PROP return ret; } - - - char **NESTGPU_GetBoolParamNames() - { char **ret = NULL; BEGIN_ERR_PROP { - std::vector name_vect = - NESTGPU_instance->GetBoolParamNames(); - char **name_array = (char**)malloc(name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; iIsBoolParam(param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_GetBoolParamIdx(char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->GetBoolParamIdx(param_name_str); - } END_ERR_PROP return ret; } - - - bool NESTGPU_GetBoolParam(char *param_name) - { bool ret = true; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = 
NESTGPU_instance->GetBoolParam(param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_SetBoolParam(char *param_name, bool val) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->SetBoolParam(param_name_str, val); - } END_ERR_PROP return ret; } - - - int NESTGPU_GetNFloatParam() - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetNFloatParam(); - } END_ERR_PROP return ret; } - - - char **NESTGPU_GetFloatParamNames() - { char **ret = NULL; BEGIN_ERR_PROP { - std::vector name_vect = - NESTGPU_instance->GetFloatParamNames(); - char **name_array = (char**)malloc(name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; iIsFloatParam(param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_GetFloatParamIdx(char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->GetFloatParamIdx(param_name_str); - } END_ERR_PROP return ret; } - - - float NESTGPU_GetFloatParam(char *param_name) - { float ret = 0.0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->GetFloatParam(param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_SetFloatParam(char *param_name, float val) - { int ret = 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->SetFloatParam(param_name_str, val); - } END_ERR_PROP return ret; } - - int NESTGPU_GetNIntParam() - { int ret = 0; BEGIN_ERR_PROP { - ret = NESTGPU_instance->GetNIntParam(); - } END_ERR_PROP return ret; } - - - char **NESTGPU_GetIntParamNames() - { char **ret = NULL; BEGIN_ERR_PROP { - std::vector name_vect = - NESTGPU_instance->GetIntParamNames(); - char **name_array = (char**)malloc(name_vect.size() - *sizeof(char*)); - for (unsigned int i=0; iIsIntParam(param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_GetIntParamIdx(char *param_name) - { int ret 
= 0; BEGIN_ERR_PROP { - std::string param_name_str = std::string(param_name); - - ret = NESTGPU_instance->GetIntParamIdx(param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_GetIntParam(char *param_name) - { int ret = 0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->GetIntParam(param_name_str); - } END_ERR_PROP return ret; } - - - int NESTGPU_SetIntParam(char *param_name, int val) - { int ret = 0; BEGIN_ERR_PROP { - - std::string param_name_str = std::string(param_name); - ret = NESTGPU_instance->SetIntParam(param_name_str, val); - } END_ERR_PROP return ret; } - - int NESTGPU_RemoteCreate(int i_host, char *model_name, int n_neuron, - int n_port) - { int ret = 0; BEGIN_ERR_PROP { - std::string model_name_str = std::string(model_name); - RemoteNodeSeq rneur = NESTGPU_instance->RemoteCreate(i_host, - model_name_str, - n_neuron, - n_port); - ret = rneur.node_seq[0]; - } END_ERR_PROP return ret; } + NESTGPU_instance->SetOnException( on_exception ); + } + + unsigned int* RandomInt( size_t n ); + + int + NESTGPU_SetRandomSeed( unsigned long long seed ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->SetRandomSeed( seed ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetTimeResolution( float time_res ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->SetTimeResolution( time_res ); + } + END_ERR_PROP return ret; + } + + float + NESTGPU_GetTimeResolution() + { + float ret = 0.0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetTimeResolution(); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetMaxSpikeBufferSize( int max_size ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->SetMaxSpikeBufferSize( max_size ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetMaxSpikeBufferSize() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetMaxSpikeBufferSize(); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetSimTime( 
float sim_time ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->SetSimTime( sim_time ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetVerbosityLevel( int verbosity_level ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->SetVerbosityLevel( verbosity_level ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNestedLoopAlgo( int nested_loop_algo ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->SetNestedLoopAlgo( nested_loop_algo ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_Create( char* model_name, int n_neuron, int n_port ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string model_name_str = std::string( model_name ); + NodeSeq neur = NESTGPU_instance->Create( model_name_str, n_neuron, n_port ); + ret = neur[ 0 ]; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_CreateRecord( char* file_name, char* var_name_arr[], int* i_node_arr, int* port_arr, int n_node ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string file_name_str = std::string( file_name ); + std::vector< std::string > var_name_vect; + for ( int i = 0; i < n_node; i++ ) + { + std::string var_name = std::string( var_name_arr[ i ] ); + var_name_vect.push_back( var_name ); + } + ret = NESTGPU_instance->CreateRecord( file_name_str, var_name_vect.data(), i_node_arr, port_arr, n_node ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetRecordDataRows( int i_record ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::vector< std::vector< float > >* data_vect_pt = NESTGPU_instance->GetRecordData( i_record ); + + ret = data_vect_pt->size(); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetRecordDataColumns( int i_record ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::vector< std::vector< float > >* data_vect_pt = NESTGPU_instance->GetRecordData( i_record ); + + ret = data_vect_pt->at( 0 ).size(); + } + END_ERR_PROP return ret; + } + + float** + NESTGPU_GetRecordData( int i_record ) + { + float** ret = nullptr; + 
BEGIN_ERR_PROP + { + std::vector< std::vector< float > >* data_vect_pt = NESTGPU_instance->GetRecordData( i_record ); + int nr = data_vect_pt->size(); + ret = new float*[ nr ]; + for ( int i = 0; i < nr; i++ ) + { + ret[ i ] = data_vect_pt->at( i ).data(); + } + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronScalParam( int i_node, int n_neuron, char* param_name, float val ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetNeuronParam( i_node, n_neuron, param_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronArrayParam( int i_node, int n_neuron, char* param_name, float* param, int array_size ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetNeuronParam( i_node, n_neuron, param_name_str, param, array_size ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronPtScalParam( int* i_node, int n_neuron, char* param_name, float val ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetNeuronParam( i_node, n_neuron, param_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronPtArrayParam( int* i_node, int n_neuron, char* param_name, float* param, int array_size ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetNeuronParam( i_node, n_neuron, param_name_str, param, array_size ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsNeuronScalParam( int i_node, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->IsNeuronScalParam( i_node, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsNeuronPortParam( int i_node, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + 
std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->IsNeuronPortParam( i_node, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsNeuronArrayParam( int i_node, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->IsNeuronArrayParam( i_node, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronIntVar( int i_node, int n_neuron, char* var_name, int val ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string var_name_str = std::string( var_name ); + ret = NESTGPU_instance->SetNeuronIntVar( i_node, n_neuron, var_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronScalVar( int i_node, int n_neuron, char* var_name, float val ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string var_name_str = std::string( var_name ); + ret = NESTGPU_instance->SetNeuronVar( i_node, n_neuron, var_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronArrayVar( int i_node, int n_neuron, char* var_name, float* var, int array_size ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string var_name_str = std::string( var_name ); + ret = NESTGPU_instance->SetNeuronVar( i_node, n_neuron, var_name_str, var, array_size ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronPtIntVar( int* i_node, int n_neuron, char* var_name, int val ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string var_name_str = std::string( var_name ); + ret = NESTGPU_instance->SetNeuronIntVar( i_node, n_neuron, var_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronPtScalVar( int* i_node, int n_neuron, char* var_name, float val ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string var_name_str = std::string( var_name ); + ret = NESTGPU_instance->SetNeuronVar( i_node, n_neuron, var_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + 
NESTGPU_SetNeuronPtArrayVar( int* i_node, int n_neuron, char* var_name, float* var, int array_size ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string var_name_str = std::string( var_name ); + ret = NESTGPU_instance->SetNeuronVar( i_node, n_neuron, var_name_str, var, array_size ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronScalParamDistr( int i_node, int n_neuron, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetNeuronScalParamDistr( i_node, n_neuron, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronScalVarDistr( int i_node, int n_neuron, char* var_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string var_name_str = std::string( var_name ); + ret = NESTGPU_instance->SetNeuronScalVarDistr( i_node, n_neuron, var_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronPortParamDistr( int i_node, int n_neuron, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetNeuronPortParamDistr( i_node, n_neuron, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronPortVarDistr( int i_node, int n_neuron, char* var_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string var_name_str = std::string( var_name ); + ret = NESTGPU_instance->SetNeuronPortVarDistr( i_node, n_neuron, var_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronPtScalParamDistr( int* i_node, int n_neuron, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetNeuronPtScalParamDistr( i_node, n_neuron, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronPtScalVarDistr( int* i_node, int n_neuron, char* var_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string 
var_name_str = std::string( var_name ); + ret = NESTGPU_instance->SetNeuronPtScalVarDistr( i_node, n_neuron, var_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronPtPortParamDistr( int* i_node, int n_neuron, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetNeuronPtPortParamDistr( i_node, n_neuron, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronPtPortVarDistr( int* i_node, int n_neuron, char* var_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string var_name_str = std::string( var_name ); + ret = NESTGPU_instance->SetNeuronPtPortVarDistr( i_node, n_neuron, var_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetDistributionIntParam( char* param_name, int val ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetDistributionIntParam( param_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetDistributionScalParam( char* param_name, float val ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetDistributionScalParam( param_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetDistributionVectParam( char* param_name, float val, int i ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetDistributionVectParam( param_name_str, val, i ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetDistributionFloatPtParam( char* param_name, float* array_pt ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetDistributionFloatPtParam( param_name_str, array_pt ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsDistributionFloatParam( char* param_name ) + { 
+ int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->IsDistributionFloatParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsNeuronIntVar( int i_node, char* var_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string var_name_str = std::string( var_name ); + + ret = NESTGPU_instance->IsNeuronIntVar( i_node, var_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsNeuronScalVar( int i_node, char* var_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string var_name_str = std::string( var_name ); + + ret = NESTGPU_instance->IsNeuronScalVar( i_node, var_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsNeuronPortVar( int i_node, char* var_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string var_name_str = std::string( var_name ); + + ret = NESTGPU_instance->IsNeuronPortVar( i_node, var_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsNeuronArrayVar( int i_node, char* var_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string var_name_str = std::string( var_name ); + + ret = NESTGPU_instance->IsNeuronArrayVar( i_node, var_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNeuronParamSize( int i_node, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->GetNeuronParamSize( i_node, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNeuronVarSize( int i_node, char* var_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string var_name_str = std::string( var_name ); + + ret = NESTGPU_instance->GetNeuronVarSize( i_node, var_name_str ); + } + END_ERR_PROP return ret; + } + + float* + NESTGPU_GetNeuronParam( int i_node, int n_neuron, char* param_name ) + { + float* ret = nullptr; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = 
NESTGPU_instance->GetNeuronParam( i_node, n_neuron, param_name_str ); + } + END_ERR_PROP return ret; + } + + float* + NESTGPU_GetNeuronPtParam( int* i_node, int n_neuron, char* param_name ) + { + float* ret = nullptr; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->GetNeuronParam( i_node, n_neuron, param_name_str ); + } + END_ERR_PROP return ret; + } + + float* + NESTGPU_GetArrayParam( int i_node, char* param_name ) + { + float* ret = nullptr; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->GetArrayParam( i_node, param_name_str ); + } + END_ERR_PROP return ret; + } + + int* + NESTGPU_GetNeuronIntVar( int i_node, int n_neuron, char* param_name ) + { + int* ret = nullptr; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->GetNeuronIntVar( i_node, n_neuron, param_name_str ); + } + END_ERR_PROP return ret; + } + + int* + NESTGPU_GetNeuronPtIntVar( int* i_node, int n_neuron, char* param_name ) + { + int* ret = nullptr; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->GetNeuronIntVar( i_node, n_neuron, param_name_str ); + } + END_ERR_PROP return ret; + } + + float* + NESTGPU_GetNeuronVar( int i_node, int n_neuron, char* param_name ) + { + float* ret = nullptr; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->GetNeuronVar( i_node, n_neuron, param_name_str ); + } + END_ERR_PROP return ret; + } + + float* + NESTGPU_GetNeuronPtVar( int* i_node, int n_neuron, char* param_name ) + { + float* ret = nullptr; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->GetNeuronVar( i_node, n_neuron, param_name_str ); + } + END_ERR_PROP return ret; + } + + float* + NESTGPU_GetArrayVar( int i_node, char* var_name ) + { + float* ret = nullptr; + 
BEGIN_ERR_PROP + { + + std::string var_name_str = std::string( var_name ); + ret = NESTGPU_instance->GetArrayVar( i_node, var_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_Calibrate() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->Calibrate(); + } + END_ERR_PROP return ret; + } + int + NESTGPU_Simulate() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->Simulate(); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_StartSimulation() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->StartSimulation(); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SimulationStep() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->SimulationStep(); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_EndSimulation() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->EndSimulation(); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_ConnectMpiInit( int argc, char* argv[] ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->ConnectMpiInit( argc, argv ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_MpiFinalize() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->MpiFinalize(); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_HostId() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->HostId(); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_HostNum() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->HostNum(); + } + END_ERR_PROP return ret; + } + + size_t + NESTGPU_getCUDAMemHostUsed() + { + size_t ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->getCUDAMemHostUsed(); + } + END_ERR_PROP return ret; + } + + size_t + NESTGPU_getCUDAMemHostPeak() + { + size_t ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->getCUDAMemHostPeak(); + } + END_ERR_PROP return ret; + } + + size_t + NESTGPU_getCUDAMemTotal() + { + size_t ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->getCUDAMemTotal(); + 
} + END_ERR_PROP return ret; + } + + size_t + NESTGPU_getCUDAMemFree() + { + size_t ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->getCUDAMemFree(); + } + END_ERR_PROP return ret; + } + + unsigned int* + NESTGPU_RandomInt( size_t n ) + { + unsigned int* ret = nullptr; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->RandomInt( n ); + } + END_ERR_PROP return ret; + } + + float* + NESTGPU_RandomUniform( size_t n ) + { + float* ret = nullptr; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->RandomUniform( n ); + } + END_ERR_PROP return ret; + } + + float* + NESTGPU_RandomNormal( size_t n, float mean, float stddev ) + { + float* ret = nullptr; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->RandomNormal( n, mean, stddev ); + } + END_ERR_PROP return ret; + } + + float* + NESTGPU_RandomNormalClipped( size_t n, float mean, float stddev, float vmin, float vmax, float vstep ) + { + float* ret = nullptr; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->RandomNormalClipped( n, mean, stddev, vmin, vmax, vstep ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_ConnSpecInit() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = ConnSpec_instance.Init(); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetConnSpecParam( char* param_name, int value ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = ConnSpec_instance.SetParam( param_name_str, value ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_ConnSpecIsParam( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = ConnSpec::IsParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SynSpecInit() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = SynSpec_instance.Init(); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetSynSpecIntParam( char* param_name, int value ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret 
= SynSpec_instance.SetParam( param_name_str, value ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetSynSpecFloatParam( char* param_name, float value ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = SynSpec_instance.SetParam( param_name_str, value ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetSynSpecFloatPtParam( char* param_name, float* array_pt ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = SynSpec_instance.SetParam( param_name_str, array_pt ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SynSpecIsIntParam( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = SynSpec_instance.IsIntParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SynSpecIsFloatParam( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = SynSpec_instance.IsFloatParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SynSpecIsFloatPtParam( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = SynSpec_instance.IsFloatPtParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_ConnectSeqSeq( uint i_source, uint n_source, uint i_target, uint n_target ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->Connect( i_source, n_source, i_target, n_target, ConnSpec_instance, SynSpec_instance ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_ConnectSeqGroup( uint i_source, uint n_source, uint* i_target, uint n_target ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->Connect( i_source, n_source, i_target, n_target, ConnSpec_instance, SynSpec_instance ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_ConnectGroupSeq( uint* i_source, uint 
n_source, uint i_target, uint n_target ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->Connect( i_source, n_source, i_target, n_target, ConnSpec_instance, SynSpec_instance ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_ConnectGroupGroup( uint* i_source, uint n_source, uint* i_target, uint n_target ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->Connect( i_source, n_source, i_target, n_target, ConnSpec_instance, SynSpec_instance ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_RemoteConnectSeqSeq( int i_source_host, + uint i_source, + uint n_source, + int i_target_host, + uint i_target, + uint n_target ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->RemoteConnect( + i_source_host, i_source, n_source, i_target_host, i_target, n_target, ConnSpec_instance, SynSpec_instance ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_RemoteConnectSeqGroup( int i_source_host, + uint i_source, + uint n_source, + int i_target_host, + uint* i_target, + uint n_target ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->RemoteConnect( + i_source_host, i_source, n_source, i_target_host, i_target, n_target, ConnSpec_instance, SynSpec_instance ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_RemoteConnectGroupSeq( int i_source_host, + uint* i_source, + uint n_source, + int i_target_host, + uint i_target, + uint n_target ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->RemoteConnect( + i_source_host, i_source, n_source, i_target_host, i_target, n_target, ConnSpec_instance, SynSpec_instance ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_RemoteConnectGroupGroup( int i_source_host, + uint* i_source, + uint n_source, + int i_target_host, + uint* i_target, + uint n_target ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->RemoteConnect( + i_source_host, i_source, n_source, i_target_host, i_target, n_target, ConnSpec_instance, SynSpec_instance ); + } 
+ END_ERR_PROP return ret; + } + + char** + NESTGPU_GetIntVarNames( uint i_node ) + { + char** ret = nullptr; + BEGIN_ERR_PROP + { + std::vector< std::string > var_name_vect = NESTGPU_instance->GetIntVarNames( i_node ); + char** var_name_array = ( char** ) malloc( var_name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < var_name_vect.size(); i++ ) + { + uint vl = var_name_vect[ i ].length() + 1; + char* var_name = ( char* ) malloc( ( vl ) * sizeof( char ) ); + + strncpy( var_name, var_name_vect[ i ].c_str(), vl ); + var_name_array[ i ] = var_name; + } + ret = var_name_array; + } + END_ERR_PROP return ret; + } + + char** + NESTGPU_GetScalVarNames( uint i_node ) + { + char** ret = nullptr; + BEGIN_ERR_PROP + { + std::vector< std::string > var_name_vect = NESTGPU_instance->GetScalVarNames( i_node ); + char** var_name_array = ( char** ) malloc( var_name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < var_name_vect.size(); i++ ) + { + uint vl = var_name_vect[ i ].length() + 1; + char* var_name = ( char* ) malloc( ( vl ) * sizeof( char ) ); + + strncpy( var_name, var_name_vect[ i ].c_str(), vl ); + var_name_array[ i ] = var_name; + } + ret = var_name_array; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNIntVar( uint i_node ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNIntVar( i_node ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNScalVar( uint i_node ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNScalVar( i_node ); + } + END_ERR_PROP return ret; + } + + char** + NESTGPU_GetPortVarNames( uint i_node ) + { + char** ret = nullptr; + BEGIN_ERR_PROP + { + std::vector< std::string > var_name_vect = NESTGPU_instance->GetPortVarNames( i_node ); + char** var_name_array = ( char** ) malloc( var_name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < var_name_vect.size(); i++ ) + { + uint vl = var_name_vect[ i ].length() + 1; + char* var_name = ( char* ) malloc( 
( vl ) * sizeof( char ) ); + + strncpy( var_name, var_name_vect[ i ].c_str(), vl ); + var_name_array[ i ] = var_name; + } + ret = var_name_array; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNPortVar( uint i_node ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNPortVar( i_node ); + } + END_ERR_PROP return ret; + } + + char** + NESTGPU_GetScalParamNames( uint i_node ) + { + char** ret = nullptr; + BEGIN_ERR_PROP + { + std::vector< std::string > var_name_vect = NESTGPU_instance->GetScalParamNames( i_node ); + char** var_name_array = ( char** ) malloc( var_name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < var_name_vect.size(); i++ ) + { + uint vl = var_name_vect[ i ].length() + 1; + char* var_name = ( char* ) malloc( ( vl ) * sizeof( char ) ); + + strncpy( var_name, var_name_vect[ i ].c_str(), vl ); + var_name_array[ i ] = var_name; + } + ret = var_name_array; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNScalParam( uint i_node ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNScalParam( i_node ); + } + END_ERR_PROP return ret; + } + + char** + NESTGPU_GetGroupParamNames( uint i_node ) + { + char** ret = nullptr; + BEGIN_ERR_PROP + { + std::vector< std::string > var_name_vect = NESTGPU_instance->GetGroupParamNames( i_node ); + char** var_name_array = ( char** ) malloc( var_name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < var_name_vect.size(); i++ ) + { + uint vl = var_name_vect[ i ].length() + 1; + char* var_name = ( char* ) malloc( ( vl ) * sizeof( char ) ); + + strncpy( var_name, var_name_vect[ i ].c_str(), vl ); + var_name_array[ i ] = var_name; + } + ret = var_name_array; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNGroupParam( uint i_node ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNGroupParam( i_node ); + } + END_ERR_PROP return ret; + } + + char** + NESTGPU_GetPortParamNames( uint i_node ) + { + char** ret = nullptr; 
+ BEGIN_ERR_PROP + { + std::vector< std::string > var_name_vect = NESTGPU_instance->GetPortParamNames( i_node ); + char** var_name_array = ( char** ) malloc( var_name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < var_name_vect.size(); i++ ) + { + uint vl = var_name_vect[ i ].length() + 1; + char* var_name = ( char* ) malloc( ( vl ) * sizeof( char ) ); + + strncpy( var_name, var_name_vect[ i ].c_str(), vl ); + var_name_array[ i ] = var_name; + } + ret = var_name_array; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNPortParam( uint i_node ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNPortParam( i_node ); + } + END_ERR_PROP return ret; + } + + char** + NESTGPU_GetArrayParamNames( uint i_node ) + { + char** ret = nullptr; + BEGIN_ERR_PROP + { + std::vector< std::string > var_name_vect = NESTGPU_instance->GetArrayParamNames( i_node ); + char** var_name_array = ( char** ) malloc( var_name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < var_name_vect.size(); i++ ) + { + uint vl = var_name_vect[ i ].length() + 1; + char* var_name = ( char* ) malloc( ( vl ) * sizeof( char ) ); + + strncpy( var_name, var_name_vect[ i ].c_str(), vl ); + var_name_array[ i ] = var_name; + } + ret = var_name_array; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNArrayParam( uint i_node ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNArrayParam( i_node ); + } + END_ERR_PROP return ret; + } + + char** + NESTGPU_GetArrayVarNames( uint i_node ) + { + char** ret = nullptr; + BEGIN_ERR_PROP + { + std::vector< std::string > var_name_vect = NESTGPU_instance->GetArrayVarNames( i_node ); + char** var_name_array = ( char** ) malloc( var_name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < var_name_vect.size(); i++ ) + { + uint vl = var_name_vect[ i ].length() + 1; + char* var_name = ( char* ) malloc( ( vl ) * sizeof( char ) ); + + strncpy( var_name, var_name_vect[ i ].c_str(), vl ); + 
var_name_array[ i ] = var_name; + } + ret = var_name_array; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNArrayVar( uint i_node ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNArrayVar( i_node ); + } + END_ERR_PROP return ret; + } + + int64_t* + NESTGPU_GetSeqSeqConnections( uint i_source, + uint n_source, + uint i_target, + uint n_target, + int syn_group, + int64_t* n_conn ) + { + int64_t* ret = nullptr; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetConnections( i_source, n_source, i_target, n_target, syn_group, n_conn ); + } + END_ERR_PROP return ret; + } + + int64_t* + NESTGPU_GetSeqGroupConnections( uint i_source, + uint n_source, + uint* i_target_pt, + uint n_target, + int syn_group, + int64_t* n_conn ) + { + int64_t* ret = nullptr; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetConnections( i_source, n_source, i_target_pt, n_target, syn_group, n_conn ); + } + END_ERR_PROP return ret; + } + + int64_t* + NESTGPU_GetGroupSeqConnections( uint* i_source_pt, + uint n_source, + uint i_target, + uint n_target, + int syn_group, + int64_t* n_conn ) + { + int64_t* ret = nullptr; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetConnections( i_source_pt, n_source, i_target, n_target, syn_group, n_conn ); + } + END_ERR_PROP return ret; + } + + int64_t* + NESTGPU_GetGroupGroupConnections( uint* i_source_pt, + uint n_source, + uint* i_target_pt, + uint n_target, + int syn_group, + int64_t* n_conn ) + { + int64_t* ret = nullptr; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetConnections( i_source_pt, n_source, i_target_pt, n_target, syn_group, n_conn ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetConnectionStatus( int64_t* conn_ids, + int64_t n_conn, + uint* i_source, + uint* i_target, + int* port, + int* syn_group, + float* delay, + float* weight ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = + NESTGPU_instance->GetConnectionStatus( conn_ids, n_conn, i_source, i_target, port, syn_group, delay, weight ); + } + 
END_ERR_PROP return ret; + } + + int + NESTGPU_IsConnectionFloatParam( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->IsConnectionFloatParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsConnectionIntParam( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->IsConnectionIntParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float* param_arr, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->GetConnectionFloatParam( conn_ids, n_conn, param_arr, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int* param_arr, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->GetConnectionIntParam( conn_ids, n_conn, param_arr, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetConnectionFloatParamDistr( int64_t* conn_ids, int64_t n_conn, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetConnectionFloatParamDistr( conn_ids, n_conn, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetConnectionIntParamArr( int64_t* conn_ids, int64_t n_conn, int* param_arr, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetConnectionIntParamArr( conn_ids, n_conn, param_arr, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float val, 
char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetConnectionFloatParam( conn_ids, n_conn, val, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int val, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetConnectionIntParam( conn_ids, n_conn, val, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_CreateSynGroup( char* model_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string model_name_str = std::string( model_name ); + ret = NESTGPU_instance->CreateSynGroup( model_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetSynGroupNParam( int i_syn_group ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetSynGroupNParam( i_syn_group ); + } + END_ERR_PROP return ret; + } + + char** + NESTGPU_GetSynGroupParamNames( int i_syn_group ) + { + char** ret = nullptr; + BEGIN_ERR_PROP + { + std::vector< std::string > name_vect = NESTGPU_instance->GetSynGroupParamNames( i_syn_group ); + char** name_array = ( char** ) malloc( name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < name_vect.size(); i++ ) + { + uint vl = name_vect[ i ].length() + 1; + char* param_name = ( char* ) malloc( ( vl ) * sizeof( char ) ); + + strncpy( param_name, name_vect[ i ].c_str(), vl ); + name_array[ i ] = param_name; + } + ret = name_array; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsSynGroupParam( int i_syn_group, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->IsSynGroupParam( i_syn_group, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetSynGroupParamIdx( int i_syn_group, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + 
{ + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->GetSynGroupParamIdx( i_syn_group, param_name_str ); + } + END_ERR_PROP return ret; + } + + float + NESTGPU_GetSynGroupParam( int i_syn_group, char* param_name ) + { + float ret = 0.0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->GetSynGroupParam( i_syn_group, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetSynGroupParam( int i_syn_group, char* param_name, float val ) + { + float ret = 0.0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetSynGroupParam( i_syn_group, param_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_ActivateSpikeCount( uint i_node, int n_node ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->ActivateSpikeCount( i_node, n_node ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_ActivateRecSpikeTimes( uint i_node, int n_node, int max_n_rec_spike_times ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + ret = NESTGPU_instance->ActivateRecSpikeTimes( i_node, n_node, max_n_rec_spike_times ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetRecSpikeTimesStep( uint i_node, int n_node, int rec_spike_times_step ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + ret = NESTGPU_instance->SetRecSpikeTimesStep( i_node, n_node, rec_spike_times_step ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNRecSpikeTimes( uint i_node ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNRecSpikeTimes( i_node ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetRecSpikeTimes( uint i_node, int n_node, int** n_spike_times_pt, float*** spike_times_pt ) + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetRecSpikeTimes( i_node, n_node, n_spike_times_pt, spike_times_pt ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_PushSpikesToNodes( int n_spikes, 
int* node_id ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + ret = NESTGPU_instance->PushSpikesToNodes( n_spikes, node_id ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetExtNeuronInputSpikes( int* n_spikes, int** node, int** port, float** spike_height, int include_zeros ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + ret = NESTGPU_instance->GetExtNeuronInputSpikes( n_spikes, node, port, spike_height, include_zeros > 0 ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetNeuronGroupParam( uint i_node, int n_node, char* param_name, float val ) + { + float ret = 0.0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetNeuronGroupParam( i_node, n_node, param_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsNeuronGroupParam( uint i_node, char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->IsNeuronGroupParam( i_node, param_name_str ); + } + END_ERR_PROP return ret; + } + + float + NESTGPU_GetNeuronGroupParam( uint i_node, char* param_name ) + { + float ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->GetNeuronGroupParam( i_node, param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNBoolParam() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNBoolParam(); + } + END_ERR_PROP return ret; + } + + char** + NESTGPU_GetBoolParamNames() + { + char** ret = nullptr; + BEGIN_ERR_PROP + { + std::vector< std::string > name_vect = NESTGPU_instance->GetBoolParamNames(); + char** name_array = ( char** ) malloc( name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < name_vect.size(); i++ ) + { + uint vl = name_vect[ i ].length() + 1; + char* param_name = ( char* ) malloc( ( vl ) * sizeof( char ) ); + + strncpy( param_name, name_vect[ i ].c_str(), vl ); + name_array[ i ] = 
param_name; + } + ret = name_array; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsBoolParam( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->IsBoolParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetBoolParamIdx( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->GetBoolParamIdx( param_name_str ); + } + END_ERR_PROP return ret; + } + + bool + NESTGPU_GetBoolParam( char* param_name ) + { + bool ret = true; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->GetBoolParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetBoolParam( char* param_name, bool val ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->SetBoolParam( param_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNFloatParam() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNFloatParam(); + } + END_ERR_PROP return ret; + } + + char** + NESTGPU_GetFloatParamNames() + { + char** ret = nullptr; + BEGIN_ERR_PROP + { + std::vector< std::string > name_vect = NESTGPU_instance->GetFloatParamNames(); + char** name_array = ( char** ) malloc( name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < name_vect.size(); i++ ) + { + uint vl = name_vect[ i ].length() + 1; + char* param_name = ( char* ) malloc( ( vl ) * sizeof( char ) ); + + strncpy( param_name, name_vect[ i ].c_str(), vl ); + name_array[ i ] = param_name; + } + ret = name_array; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsFloatParam( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->IsFloatParam( param_name_str ); + 
} + END_ERR_PROP return ret; + } + + int + NESTGPU_GetFloatParamIdx( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->GetFloatParamIdx( param_name_str ); + } + END_ERR_PROP return ret; + } + + float + NESTGPU_GetFloatParam( char* param_name ) + { + float ret = 0.0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->GetFloatParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetFloatParam( char* param_name, float val ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->SetFloatParam( param_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetNIntParam() + { + int ret = 0; + BEGIN_ERR_PROP + { + ret = NESTGPU_instance->GetNIntParam(); + } + END_ERR_PROP return ret; + } + + char** + NESTGPU_GetIntParamNames() + { + char** ret = nullptr; + BEGIN_ERR_PROP + { + std::vector< std::string > name_vect = NESTGPU_instance->GetIntParamNames(); + char** name_array = ( char** ) malloc( name_vect.size() * sizeof( char* ) ); + for ( unsigned int i = 0; i < name_vect.size(); i++ ) + { + uint vl = name_vect[ i ].length() + 1; + char* param_name = ( char* ) malloc( ( vl ) * sizeof( char ) ); + + strncpy( param_name, name_vect[ i ].c_str(), vl ); + name_array[ i ] = param_name; + } + ret = name_array; + } + END_ERR_PROP return ret; + } + + int + NESTGPU_IsIntParam( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->IsIntParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_GetIntParamIdx( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string param_name_str = std::string( param_name ); + + ret = NESTGPU_instance->GetIntParamIdx( param_name_str ); + } + END_ERR_PROP return ret; + } + 
+ int + NESTGPU_GetIntParam( char* param_name ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->GetIntParam( param_name_str ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_SetIntParam( char* param_name, int val ) + { + int ret = 0; + BEGIN_ERR_PROP + { + + std::string param_name_str = std::string( param_name ); + ret = NESTGPU_instance->SetIntParam( param_name_str, val ); + } + END_ERR_PROP return ret; + } + + int + NESTGPU_RemoteCreate( int i_host, char* model_name, int n_neuron, int n_port ) + { + int ret = 0; + BEGIN_ERR_PROP + { + std::string model_name_str = std::string( model_name ); + RemoteNodeSeq rneur = NESTGPU_instance->RemoteCreate( i_host, model_name_str, n_neuron, n_port ); + ret = rneur.node_seq[ 0 ]; + } + END_ERR_PROP return ret; + } } diff --git a/src/nestgpu_C.h b/src/nestgpu_C.h index 495d6c848..620a7ef73 100644 --- a/src/nestgpu_C.h +++ b/src/nestgpu_C.h @@ -20,161 +20,126 @@ * */ - - - - #ifndef NESTGPUC_H #define NESTGPUC_H #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif - - char *NESTGPU_GetErrorMessage(); + + char* NESTGPU_GetErrorMessage(); unsigned char NESTGPU_GetErrorCode(); - - void NESTGPU_SetOnException(int on_exception); - int NESTGPU_SetRandomSeed(unsigned long long seed); + void NESTGPU_SetOnException( int on_exception ); + + int NESTGPU_SetRandomSeed( unsigned long long seed ); - int NESTGPU_SetTimeResolution(float time_res); + int NESTGPU_SetTimeResolution( float time_res ); float NESTGPU_GetTimeResolution(); - int NESTGPU_SetMaxSpikeBufferSize(int max_size); + int NESTGPU_SetMaxSpikeBufferSize( int max_size ); int NESTGPU_GetMaxSpikeBufferSize(); - int NESTGPU_SetSimTime(float sim_time); + int NESTGPU_SetSimTime( float sim_time ); + + int NESTGPU_SetVerbosityLevel( int verbosity_level ); + + int NESTGPU_SetNestedLoopAlgo( int nested_loop_algo ); + + int NESTGPU_Create( char* model_name, int n_neuron, int n_port ); + + int 
NESTGPU_CreateRecord( char* file_name, char* var_name_arr[], int* i_node_arr, int* port_arr, int n_node ); + + int NESTGPU_GetRecordDataRows( int i_record ); + + int NESTGPU_GetRecordDataColumns( int i_record ); + + float** NESTGPU_GetRecordData( int i_record ); - int NESTGPU_SetVerbosityLevel(int verbosity_level); + int NESTGPU_SetNeuronScalParam( int i_node, int n_neuron, char* param_name, float val ); - int NESTGPU_SetNestedLoopAlgo(int nested_loop_algo); + int NESTGPU_SetNeuronArrayParam( int i_node, int n_neuron, char* param_name, float* param, int array_size ); - int NESTGPU_Create(char *model_name, int n_neuron, int n_port); + int NESTGPU_SetNeuronPtScalParam( int* i_node, int n_neuron, char* param_name, float val ); - int NESTGPU_CreateRecord(char *file_name, char *var_name_arr[], - int *i_node_arr, int *port_arr, - int n_node); - - int NESTGPU_GetRecordDataRows(int i_record); - - int NESTGPU_GetRecordDataColumns(int i_record); + int NESTGPU_SetNeuronPtArrayParam( int* i_node, int n_neuron, char* param_name, float* param, int array_size ); - float **NESTGPU_GetRecordData(int i_record); + int NESTGPU_IsNeuronScalParam( int i_node, char* param_name ); - int NESTGPU_SetNeuronScalParam(int i_node, int n_neuron, char *param_name, - float val); + int NESTGPU_IsNeuronPortParam( int i_node, char* param_name ); - int NESTGPU_SetNeuronArrayParam(int i_node, int n_neuron, - char *param_name, float *param, - int array_size); + int NESTGPU_IsNeuronArrayParam( int i_node, char* param_name ); - int NESTGPU_SetNeuronPtScalParam(int *i_node, int n_neuron, - char *param_name, float val); + int NESTGPU_SetNeuronIntVar( int i_node, int n_neuron, char* var_name, int val ); - int NESTGPU_SetNeuronPtArrayParam(int *i_node, int n_neuron, - char *param_name, float *param, - int array_size); - - int NESTGPU_IsNeuronScalParam(int i_node, char *param_name); - - int NESTGPU_IsNeuronPortParam(int i_node, char *param_name); + int NESTGPU_SetNeuronScalVar( int i_node, int n_neuron, char* 
var_name, float val ); - int NESTGPU_IsNeuronArrayParam(int i_node, char *param_name); - + int NESTGPU_SetNeuronArrayVar( int i_node, int n_neuron, char* var_name, float* var, int array_size ); - int NESTGPU_SetNeuronIntVar(int i_node, int n_neuron, char *var_name, - int val); + int NESTGPU_SetNeuronPtIntVar( int* i_node, int n_neuron, char* var_name, int val ); - int NESTGPU_SetNeuronScalVar(int i_node, int n_neuron, char *var_name, - float val); + int NESTGPU_SetNeuronPtScalVar( int* i_node, int n_neuron, char* var_name, float val ); - int NESTGPU_SetNeuronArrayVar(int i_node, int n_neuron, - char *var_name, float *var, - int array_size); + int NESTGPU_SetNeuronPtArrayVar( int* i_node, int n_neuron, char* var_name, float* var, int array_size ); - int NESTGPU_SetNeuronPtIntVar(int *i_node, int n_neuron, - char *var_name, int val); + int NESTGPU_SetNeuronScalParamDistr( int i_node, int n_neuron, char* param_name ); - int NESTGPU_SetNeuronPtScalVar(int *i_node, int n_neuron, - char *var_name, float val); + int NESTGPU_SetNeuronScalVarDistr( int i_node, int n_neuron, char* var_name ); - int NESTGPU_SetNeuronPtArrayVar(int *i_node, int n_neuron, - char *var_name, float *var, - int array_size); - - int NESTGPU_SetNeuronScalParamDistr(int i_node, int n_neuron, - char *param_name); + int NESTGPU_SetNeuronPortParamDistr( int i_node, int n_neuron, char* param_name ); - int NESTGPU_SetNeuronScalVarDistr(int i_node, int n_neuron, - char *var_name); + int NESTGPU_SetNeuronPortVarDistr( int i_node, int n_neuron, char* var_name ); + int NESTGPU_SetNeuronPtScalParamDistr( int* i_node, int n_neuron, char* param_name ); - int NESTGPU_SetNeuronPortParamDistr(int i_node, int n_neuron, - char *param_name); + int NESTGPU_SetNeuronPtScalVarDistr( int* i_node, int n_neuron, char* var_name ); - int NESTGPU_SetNeuronPortVarDistr(int i_node, int n_neuron, - char *var_name); + int NESTGPU_SetNeuronPtPortParamDistr( int* i_node, int n_neuron, char* param_name ); - int 
NESTGPU_SetNeuronPtScalParamDistr(int *i_node, int n_neuron, - char *param_name); + int NESTGPU_SetNeuronPtPortVarDistr( int* i_node, int n_neuron, char* var_name ); - int NESTGPU_SetNeuronPtScalVarDistr(int *i_node, int n_neuron, - char *var_name); + int NESTGPU_SetDistributionIntParam( char* param_name, int val ); - int NESTGPU_SetNeuronPtPortParamDistr(int *i_node, int n_neuron, - char *param_name); + int NESTGPU_SetDistributionScalParam( char* param_name, float val ); - int NESTGPU_SetNeuronPtPortVarDistr(int *i_node, int n_neuron, - char *var_name); + int NESTGPU_SetDistributionVectParam( char* param_name, float val, int i ); - int NESTGPU_SetDistributionIntParam(char *param_name, int val); + int NESTGPU_SetDistributionFloatPtParam( char* param_name, float* array_pt ); - int NESTGPU_SetDistributionScalParam(char *param_name, float val); + int NESTGPU_IsDistributionFloatParam( char* param_name ); - int NESTGPU_SetDistributionVectParam(char *param_name, float val, int i); + int NESTGPU_IsNeuronIntVar( int i_node, char* var_name ); - int NESTGPU_SetDistributionFloatPtParam(char *param_name, float *array_pt); + int NESTGPU_IsNeuronScalVar( int i_node, char* var_name ); - int NESTGPU_IsDistributionFloatParam(char *param_name); - - int NESTGPU_IsNeuronIntVar(int i_node, char *var_name); + int NESTGPU_IsNeuronPortVar( int i_node, char* var_name ); - int NESTGPU_IsNeuronScalVar(int i_node, char *var_name); - - int NESTGPU_IsNeuronPortVar(int i_node, char *var_name); + int NESTGPU_IsNeuronArrayVar( int i_node, char* var_name ); - int NESTGPU_IsNeuronArrayVar(int i_node, char *var_name); + int NESTGPU_GetNeuronParamSize( int i_node, char* param_name ); - int NESTGPU_GetNeuronParamSize(int i_node, char *param_name); + int NESTGPU_GetNeuronVarSize( int i_node, char* var_name ); - int NESTGPU_GetNeuronVarSize(int i_node, char *var_name); - - float *NESTGPU_GetNeuronParam(int i_node, int n_neuron, - char *param_name); + float* NESTGPU_GetNeuronParam( int i_node, int 
n_neuron, char* param_name ); - float *NESTGPU_GetNeuronPtParam(int *i_node, int n_neuron, - char *param_name); + float* NESTGPU_GetNeuronPtParam( int* i_node, int n_neuron, char* param_name ); - float *NESTGPU_GetArrayParam(int i_node, char *param_name); + float* NESTGPU_GetArrayParam( int i_node, char* param_name ); - int *NESTGPU_GetNeuronIntVar(int i_node, int n_neuron, - char *param_name); + int* NESTGPU_GetNeuronIntVar( int i_node, int n_neuron, char* param_name ); - int *NESTGPU_GetNeuronPtIntVar(int *i_node, int n_neuron, - char *param_name); - - float *NESTGPU_GetNeuronVar(int i_node, int n_neuron, - char *param_name); + int* NESTGPU_GetNeuronPtIntVar( int* i_node, int n_neuron, char* param_name ); + + float* NESTGPU_GetNeuronVar( int i_node, int n_neuron, char* param_name ); + + float* NESTGPU_GetNeuronPtVar( int* i_node, int n_neuron, char* param_name ); + + float* NESTGPU_GetArrayVar( int i_node, char* var_name ); - float *NESTGPU_GetNeuronPtVar(int *i_node, int n_neuron, - char *param_name); - - float *NESTGPU_GetArrayVar(int i_node, char *var_name); - int NESTGPU_Calibrate(); int NESTGPU_Simulate(); @@ -185,230 +150,245 @@ extern "C" { int NESTGPU_EndSimulation(); - int NESTGPU_ConnectMpiInit(int argc, char *argv[]); + int NESTGPU_ConnectMpiInit( int argc, char* argv[] ); int NESTGPU_MpiFinalize(); - + int NESTGPU_HostId(); int NESTGPU_HostNum(); - unsigned int *NESTGPU_RandomInt(size_t n); - - float *NESTGPU_RandomUniform(size_t n); - - float *NESTGPU_RandomNormal(size_t n, float mean, float stddev); - - float *NESTGPU_RandomNormalClipped(size_t n, float mean, float stddev, - float vmin, float vmax, float vstep); - + size_t NESTGPU_getCUDAMemHostUsed(); + + size_t NESTGPU_getCUDAMemHostPeak(); + + size_t NESTGPU_getCUDAMemTotal(); + + size_t NESTGPU_getCUDAMemFree(); + + unsigned int* NESTGPU_RandomInt( size_t n ); + + float* NESTGPU_RandomUniform( size_t n ); + + float* NESTGPU_RandomNormal( size_t n, float mean, float stddev ); + + float* 
NESTGPU_RandomNormalClipped( size_t n, float mean, float stddev, float vmin, float vmax, float vstep ); + int NESTGPU_ConnSpecInit(); - int NESTGPU_SetConnSpecParam(char *param_name, int value); + int NESTGPU_SetConnSpecParam( char* param_name, int value ); - int NESTGPU_ConnSpecIsParam(char *param_name); + int NESTGPU_ConnSpecIsParam( char* param_name ); int NESTGPU_SynSpecInit(); - int NESTGPU_SetSynSpecIntParam(char *param_name, int value); - - int NESTGPU_SetSynSpecFloatParam(char *param_name, float value); - - int NESTGPU_SetSynSpecFloatPtParam(char *param_name, float *array_pt); - - int NESTGPU_SynSpecIsIntParam(char *param_name); - - int NESTGPU_SynSpecIsFloatParam(char *param_name); - - int NESTGPU_SynSpecIsFloatPtParam(char *param_name); - - int NESTGPU_ConnectSeqSeq(int i_source, int n_source, int i_target, - int n_target); - - int NESTGPU_ConnectSeqGroup(int i_source, int n_source, int *i_target, - int n_target); - - int NESTGPU_ConnectGroupSeq(int *i_source, int n_source, int i_target, - int n_target); - - int NESTGPU_ConnectGroupGroup(int *i_source, int n_source, int *i_target, - int n_target); - - int NESTGPU_RemoteConnectSeqSeq(int i_source_host, int i_source, - int n_source, int i_target_host, - int i_target, int n_target); - - int NESTGPU_RemoteConnectSeqGroup(int i_source_host, int i_source, - int n_source, int i_target_host, - int *i_target, int n_target); - - int NESTGPU_RemoteConnectGroupSeq(int i_source_host, int *i_source, - int n_source, int i_target_host, - int i_target, int n_target); - - int NESTGPU_RemoteConnectGroupGroup(int i_source_host, int *i_source, - int n_source, int i_target_host, - int *i_target, int n_target); - - char **NESTGPU_GetIntVarNames(int i_node); - - char **NESTGPU_GetScalVarNames(int i_node); - - int NESTGPU_GetNIntVar(int i_node); - - int NESTGPU_GetNScalVar(int i_node); - - char **NESTGPU_GetPortVarNames(int i_node); - - int NESTGPU_GetNPortVar(int i_node); - - char **NESTGPU_GetScalParamNames(int i_node); - - int 
NESTGPU_GetNScalParam(int i_node); - - char **NESTGPU_GetPortParamNames(int i_node); - - int NESTGPU_GetNGroupParam(int i_node); - - char **NESTGPU_GetGroupParamNames(int i_node); - - int NESTGPU_GetNPortParam(int i_node); - - char **NESTGPU_GetArrayParamNames(int i_node); - - int NESTGPU_GetNArrayParam(int i_node); - - char **NESTGPU_GetArrayVarNames(int i_node); - - int NESTGPU_GetNArrayVar(int i_node); - - int64_t *NESTGPU_GetSeqSeqConnections(int i_source, int n_source, - int i_target, int n_target, - int syn_group, int64_t *n_conn); - - int64_t *NESTGPU_GetSeqGroupConnections(int i_source, int n_source, - int *i_target_pt, int n_target, - int syn_group, int64_t *n_conn); - - int64_t *NESTGPU_GetGroupSeqConnections(int *i_source_pt, int n_source, - int i_target, int n_target, - int syn_group, int64_t *n_conn); - - int64_t *NESTGPU_GetGroupGroupConnections(int *i_source_pt, int n_source, - int *i_target_pt, int n_target, - int syn_group, int64_t *n_conn); - - int NESTGPU_GetConnectionStatus(int64_t *conn_ids, int64_t n_conn, - int *i_source, int *i_target, - int *port, - unsigned char *syn_group, float *delay, - float *weight); - - int NESTGPU_IsConnectionFloatParam(char *param_name); - - int NESTGPU_IsConnectionIntParam(char *param_name); - - int NESTGPU_GetConnectionFloatParam(int64_t *conn_ids, int64_t n_conn, - float *param_arr, char *param_name); - - int NESTGPU_GetConnectionIntParam(int64_t *conn_ids, int64_t n_conn, - int *param_arr, char *param_name); - - int NESTGPU_SetConnectionFloatParamDistr(int64_t *conn_ids, int64_t n_conn, - char *param_name); - - int NESTGPU_SetConnectionIntParamArr(int64_t *conn_ids, int64_t n_conn, - int *param_arr, char *param_name); - - int NESTGPU_SetConnectionFloatParam(int64_t *conn_ids, int64_t n_conn, - float val, char *param_name); - - int NESTGPU_SetConnectionIntParam(int64_t *conn_ids, int64_t n_conn, - int val, char *param_name); - - int NESTGPU_CreateSynGroup(char *model_name); - - int NESTGPU_GetSynGroupNParam(int 
i_syn_group); - - char **NESTGPU_GetSynGroupParamNames(int i_syn_group); - - int NESTGPU_IsSynGroupParam(int i_syn_group, char *param_name); - - int NESTGPU_GetSynGroupParamIdx(int i_syn_group, char *param_name); - - float NESTGPU_GetSynGroupParam(int i_syn_group, char *param_name); - - int NESTGPU_SetSynGroupParam(int i_syn_group, char *param_name, float val); - - int NESTGPU_ActivateSpikeCount(int i_node, int n_node); - - int NESTGPU_ActivateRecSpikeTimes(int i_node, int n_node, - int max_n_rec_spike_times); - - int NESTGPU_SetRecSpikeTimesStep(int i_node, int n_node, - int rec_spike_times_step); - - int NESTGPU_GetNRecSpikeTimes(int i_node); - - int NESTGPU_GetRecSpikeTimes(int i_node, int n_node, - int **n_spike_times_pt, - float ***spike_times_pt); - - int NESTGPU_PushSpikesToNodes(int n_spikes, int *node_id); - - int NESTGPU_GetExtNeuronInputSpikes(int *n_spikes, int **node, int **port, - float **spike_height, - int include_zeros); - - int NESTGPU_SetNeuronGroupParam(int i_node, int n_node, char *param_name, - float val); - - int NESTGPU_IsNeuronGroupParam(int i_node, char *param_name); - - float NESTGPU_GetNeuronGroupParam(int i_node, char *param_name); + int NESTGPU_SetSynSpecIntParam( char* param_name, int value ); + + int NESTGPU_SetSynSpecFloatParam( char* param_name, float value ); + + int NESTGPU_SetSynSpecFloatPtParam( char* param_name, float* array_pt ); + + int NESTGPU_SynSpecIsIntParam( char* param_name ); + + int NESTGPU_SynSpecIsFloatParam( char* param_name ); + + int NESTGPU_SynSpecIsFloatPtParam( char* param_name ); + + int NESTGPU_ConnectSeqSeq( uint i_source, uint n_source, uint i_target, uint n_target ); + + int NESTGPU_ConnectSeqGroup( uint i_source, uint n_source, uint* i_target, uint n_target ); + + int NESTGPU_ConnectGroupSeq( uint* i_source, uint n_source, uint i_target, uint n_target ); + + int NESTGPU_ConnectGroupGroup( uint* i_source, uint n_source, uint* i_target, uint n_target ); + + int NESTGPU_RemoteConnectSeqSeq( int 
i_source_host, + uint i_source, + uint n_source, + int i_target_host, + uint i_target, + uint n_target ); + + int NESTGPU_RemoteConnectSeqGroup( int i_source_host, + uint i_source, + uint n_source, + int i_target_host, + uint* i_target, + uint n_target ); + + int NESTGPU_RemoteConnectGroupSeq( int i_source_host, + uint* i_source, + uint n_source, + int i_target_host, + uint i_target, + uint n_target ); + + int NESTGPU_RemoteConnectGroupGroup( int i_source_host, + uint* i_source, + uint n_source, + int i_target_host, + uint* i_target, + uint n_target ); + + char** NESTGPU_GetIntVarNames( uint i_node ); + + char** NESTGPU_GetScalVarNames( uint i_node ); + + int NESTGPU_GetNIntVar( uint i_node ); + + int NESTGPU_GetNScalVar( uint i_node ); + + char** NESTGPU_GetPortVarNames( uint i_node ); + + int NESTGPU_GetNPortVar( uint i_node ); + + char** NESTGPU_GetScalParamNames( uint i_node ); + + int NESTGPU_GetNScalParam( uint i_node ); + + char** NESTGPU_GetPortParamNames( uint i_node ); + + int NESTGPU_GetNGroupParam( uint i_node ); + + char** NESTGPU_GetGroupParamNames( uint i_node ); + + int NESTGPU_GetNPortParam( uint i_node ); + + char** NESTGPU_GetArrayParamNames( uint i_node ); + + int NESTGPU_GetNArrayParam( uint i_node ); + + char** NESTGPU_GetArrayVarNames( uint i_node ); + + int NESTGPU_GetNArrayVar( uint i_node ); + + int64_t* NESTGPU_GetSeqSeqConnections( uint i_source, + uint n_source, + uint i_target, + uint n_target, + int syn_group, + int64_t* n_conn ); + + int64_t* NESTGPU_GetSeqGroupConnections( uint i_source, + uint n_source, + uint* i_target_pt, + uint n_target, + int syn_group, + int64_t* n_conn ); + + int64_t* NESTGPU_GetGroupSeqConnections( uint* i_source_pt, + uint n_source, + uint i_target, + uint n_target, + int syn_group, + int64_t* n_conn ); + + int64_t* NESTGPU_GetGroupGroupConnections( uint* i_source_pt, + uint n_source, + uint* i_target_pt, + uint n_target, + int syn_group, + int64_t* n_conn ); + + int NESTGPU_GetConnectionStatus( int64_t* 
conn_ids, + int64_t n_conn, + uint* i_source, + uint* i_target, + int* port, + int* syn_group, + float* delay, + float* weight ); + + int NESTGPU_IsConnectionFloatParam( char* param_name ); + + int NESTGPU_IsConnectionIntParam( char* param_name ); + + int NESTGPU_GetConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float* param_arr, char* param_name ); + + int NESTGPU_GetConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int* param_arr, char* param_name ); + + int NESTGPU_SetConnectionFloatParamDistr( int64_t* conn_ids, int64_t n_conn, char* param_name ); + + int NESTGPU_SetConnectionIntParamArr( int64_t* conn_ids, int64_t n_conn, int* param_arr, char* param_name ); + + int NESTGPU_SetConnectionFloatParam( int64_t* conn_ids, int64_t n_conn, float val, char* param_name ); + + int NESTGPU_SetConnectionIntParam( int64_t* conn_ids, int64_t n_conn, int val, char* param_name ); + + int NESTGPU_CreateSynGroup( char* model_name ); + + int NESTGPU_GetSynGroupNParam( int i_syn_group ); + + char** NESTGPU_GetSynGroupParamNames( int i_syn_group ); + + int NESTGPU_IsSynGroupParam( int i_syn_group, char* param_name ); + + int NESTGPU_GetSynGroupParamIdx( int i_syn_group, char* param_name ); + + float NESTGPU_GetSynGroupParam( int i_syn_group, char* param_name ); + + int NESTGPU_SetSynGroupParam( int i_syn_group, char* param_name, float val ); + + int NESTGPU_ActivateSpikeCount( uint i_node, int n_node ); + + int NESTGPU_ActivateRecSpikeTimes( uint i_node, int n_node, int max_n_rec_spike_times ); + + int NESTGPU_SetRecSpikeTimesStep( uint i_node, int n_node, int rec_spike_times_step ); + + int NESTGPU_GetNRecSpikeTimes( uint i_node ); + + int NESTGPU_GetRecSpikeTimes( uint i_node, int n_node, int** n_spike_times_pt, float*** spike_times_pt ); + + int NESTGPU_PushSpikesToNodes( int n_spikes, int* node_id ); + + int NESTGPU_GetExtNeuronInputSpikes( int* n_spikes, int** node, int** port, float** spike_height, int include_zeros ); + + int NESTGPU_SetNeuronGroupParam( uint 
i_node, int n_node, char* param_name, float val ); + + int NESTGPU_IsNeuronGroupParam( uint i_node, char* param_name ); + + float NESTGPU_GetNeuronGroupParam( uint i_node, char* param_name ); int NESTGPU_GetNBoolParam(); - - char **NESTGPU_GetBoolParamNames(); - - int NESTGPU_IsBoolParam(char *param_name); - - int NESTGPU_GetBoolParamIdx(char *param_name); - - bool NESTGPU_GetBoolParam(char *param_name); - - int NESTGPU_SetBoolParam(char *param_name, bool val); + + char** NESTGPU_GetBoolParamNames(); + + int NESTGPU_IsBoolParam( char* param_name ); + + int NESTGPU_GetBoolParamIdx( char* param_name ); + + bool NESTGPU_GetBoolParam( char* param_name ); + + int NESTGPU_SetBoolParam( char* param_name, bool val ); int NESTGPU_GetNFloatParam(); - - char **NESTGPU_GetFloatParamNames(); - - int NESTGPU_IsFloatParam(char *param_name); - - int NESTGPU_GetFloatParamIdx(char *param_name); - - float NESTGPU_GetFloatParam(char *param_name); - - int NESTGPU_SetFloatParam(char *param_name, float val); + + char** NESTGPU_GetFloatParamNames(); + + int NESTGPU_IsFloatParam( char* param_name ); + + int NESTGPU_GetFloatParamIdx( char* param_name ); + + float NESTGPU_GetFloatParam( char* param_name ); + + int NESTGPU_SetFloatParam( char* param_name, float val ); int NESTGPU_GetNIntParam(); - - char **NESTGPU_GetIntParamNames(); - - int NESTGPU_IsIntParam(char *param_name); - - int NESTGPU_GetIntParamIdx(char *param_name); - - int NESTGPU_GetIntParam(char *param_name); - - int NESTGPU_SetIntParam(char *param_name, int val); - - int NESTGPU_RemoteCreate(int i_host, char *model_name, int n_neuron, - int n_port); + + char** NESTGPU_GetIntParamNames(); + + int NESTGPU_IsIntParam( char* param_name ); + + int NESTGPU_GetIntParamIdx( char* param_name ); + + int NESTGPU_GetIntParam( char* param_name ); + + int NESTGPU_SetIntParam( char* param_name, int val ); + + int NESTGPU_RemoteCreate( int i_host, char* model_name, int n_neuron, int n_port ); #ifdef __cplusplus } #endif - #endif diff --git 
a/src/neuron_models.cu b/src/neuron_models.cu index 3cba077e3..efd9ba30d 100644 --- a/src/neuron_models.cu +++ b/src/neuron_models.cu @@ -20,187 +20,210 @@ * */ - - - - #include #include #include -#include "ngpu_exception.h" -#include "cuda_error.h" -#include "getRealTime.h" -#include "neuron_models.h" -#include "nestgpu.h" -#include "iaf_psc_exp.h" -#include "iaf_psc_exp_hc.h" -#include "iaf_psc_exp_g.h" -#include "iaf_psc_alpha.h" -#include "ext_neuron.h" #include "aeif_cond_alpha.h" +#include "aeif_cond_alpha_multisynapse.h" #include "aeif_cond_beta.h" +#include "aeif_cond_beta_multisynapse.h" #include "aeif_psc_alpha.h" +#include "aeif_psc_alpha_multisynapse.h" #include "aeif_psc_delta.h" #include "aeif_psc_exp.h" -#include "aeif_cond_beta_multisynapse.h" -#include "aeif_cond_alpha_multisynapse.h" -#include "aeif_psc_alpha_multisynapse.h" #include "aeif_psc_exp_multisynapse.h" -#include "poiss_gen.h" -#include "spike_generator.h" -#include "parrot_neuron.h" -#include "spike_detector.h" -#include "izhikevich_cond_beta.h" +#include "cuda_error.h" +#include "ext_neuron.h" +#include "getRealTime.h" +#include "iaf_psc_alpha.h" +#include "iaf_psc_exp.h" +#include "iaf_psc_exp_g.h" +#include "iaf_psc_exp_hc.h" #include "izhikevich.h" -#include "izhikevich_psc_exp_5s.h" -#include "izhikevich_psc_exp_2s.h" +#include "izhikevich_cond_beta.h" #include "izhikevich_psc_exp.h" +#include "izhikevich_psc_exp_2s.h" +#include "izhikevich_psc_exp_5s.h" +#include "nestgpu.h" +#include "neuron_models.h" +#include "ngpu_exception.h" +#include "parrot_neuron.h" +#include "poiss_gen.h" +#include "spike_detector.h" +#include "spike_generator.h" #include "user_m1.h" #include "user_m2.h" -NodeSeq NESTGPU::_Create(std::string model_name, int n_nodes /*=1*/, - int n_ports /*=1*/) +NodeSeq +NESTGPU::_Create( std::string model_name, int n_nodes /*=1*/, int n_ports /*=1*/ ) { - if (!create_flag_) { + if ( !create_flag_ ) + { create_flag_ = true; start_real_time_ = getRealTime(); } - 
CheckUncalibrated("Nodes cannot be created after calibration"); - if (n_nodes <= 0) { - throw ngpu_exception("Number of nodes must be greater than zero."); + CheckUncalibrated( "Nodes cannot be created after calibration" ); + if ( n_nodes <= 0 ) + { + throw ngpu_exception( "Number of nodes must be greater than zero." ); } - else if (n_ports < 0) { - throw ngpu_exception("Number of ports must be >= zero."); + else if ( n_ports < 0 ) + { + throw ngpu_exception( "Number of ports must be >= zero." ); } - if (model_name == neuron_model_name[i_iaf_psc_exp_g_model]) { + if ( model_name == neuron_model_name[ i_iaf_psc_exp_g_model ] ) + { n_ports = 1; - iaf_psc_exp_g *iaf_psc_exp_g_group = new iaf_psc_exp_g; - node_vect_.push_back(iaf_psc_exp_g_group); + iaf_psc_exp_g* iaf_psc_exp_g_group = new iaf_psc_exp_g; + node_vect_.push_back( iaf_psc_exp_g_group ); } - else if (model_name == neuron_model_name[i_iaf_psc_exp_hc_model]) { + else if ( model_name == neuron_model_name[ i_iaf_psc_exp_hc_model ] ) + { n_ports = 1; - iaf_psc_exp_hc *iaf_psc_exp_hc_group = new iaf_psc_exp_hc; - node_vect_.push_back(iaf_psc_exp_hc_group); + iaf_psc_exp_hc* iaf_psc_exp_hc_group = new iaf_psc_exp_hc; + node_vect_.push_back( iaf_psc_exp_hc_group ); } - else if (model_name == neuron_model_name[i_iaf_psc_exp_model]) { + else if ( model_name == neuron_model_name[ i_iaf_psc_exp_model ] ) + { n_ports = 2; - iaf_psc_exp *iaf_psc_exp_group = new iaf_psc_exp; - node_vect_.push_back(iaf_psc_exp_group); + iaf_psc_exp* iaf_psc_exp_group = new iaf_psc_exp; + node_vect_.push_back( iaf_psc_exp_group ); } - else if (model_name == neuron_model_name[i_iaf_psc_alpha_model]) { + else if ( model_name == neuron_model_name[ i_iaf_psc_alpha_model ] ) + { n_ports = 2; - iaf_psc_alpha *iaf_psc_alpha_group = new iaf_psc_alpha; - node_vect_.push_back(iaf_psc_alpha_group); + iaf_psc_alpha* iaf_psc_alpha_group = new iaf_psc_alpha; + node_vect_.push_back( iaf_psc_alpha_group ); } - else if (model_name == 
neuron_model_name[i_ext_neuron_model]) { - ext_neuron *ext_neuron_group = new ext_neuron; - node_vect_.push_back(ext_neuron_group); + else if ( model_name == neuron_model_name[ i_ext_neuron_model ] ) + { + ext_neuron* ext_neuron_group = new ext_neuron; + node_vect_.push_back( ext_neuron_group ); } - else if (model_name == neuron_model_name[i_aeif_cond_alpha_model]) { + else if ( model_name == neuron_model_name[ i_aeif_cond_alpha_model ] ) + { n_ports = 2; - aeif_cond_alpha *aeif_cond_alpha_group = new aeif_cond_alpha; - node_vect_.push_back(aeif_cond_alpha_group); + aeif_cond_alpha* aeif_cond_alpha_group = new aeif_cond_alpha; + node_vect_.push_back( aeif_cond_alpha_group ); } - else if (model_name == neuron_model_name[i_aeif_cond_beta_model]) { + else if ( model_name == neuron_model_name[ i_aeif_cond_beta_model ] ) + { n_ports = 2; - aeif_cond_beta *aeif_cond_beta_group = new aeif_cond_beta; - node_vect_.push_back(aeif_cond_beta_group); + aeif_cond_beta* aeif_cond_beta_group = new aeif_cond_beta; + node_vect_.push_back( aeif_cond_beta_group ); } - else if (model_name == neuron_model_name[i_aeif_psc_alpha_model]) { + else if ( model_name == neuron_model_name[ i_aeif_psc_alpha_model ] ) + { n_ports = 2; - aeif_psc_alpha *aeif_psc_alpha_group = new aeif_psc_alpha; - node_vect_.push_back(aeif_psc_alpha_group); + aeif_psc_alpha* aeif_psc_alpha_group = new aeif_psc_alpha; + node_vect_.push_back( aeif_psc_alpha_group ); } - else if (model_name == neuron_model_name[i_aeif_psc_delta_model]) { + else if ( model_name == neuron_model_name[ i_aeif_psc_delta_model ] ) + { n_ports = 1; - aeif_psc_delta *aeif_psc_delta_group = new aeif_psc_delta; - node_vect_.push_back(aeif_psc_delta_group); + aeif_psc_delta* aeif_psc_delta_group = new aeif_psc_delta; + node_vect_.push_back( aeif_psc_delta_group ); } - else if (model_name == neuron_model_name[i_aeif_psc_exp_model]) { + else if ( model_name == neuron_model_name[ i_aeif_psc_exp_model ] ) + { n_ports = 2; - aeif_psc_exp 
*aeif_psc_exp_group = new aeif_psc_exp; - node_vect_.push_back(aeif_psc_exp_group); - } - else if (model_name == neuron_model_name[i_aeif_cond_beta_multisynapse_model]) { - aeif_cond_beta_multisynapse *aeif_cond_beta_multisynapse_group = new aeif_cond_beta_multisynapse; - node_vect_.push_back(aeif_cond_beta_multisynapse_group); - } - else if (model_name == neuron_model_name[i_aeif_cond_alpha_multisynapse_model]) { - aeif_cond_alpha_multisynapse *aeif_cond_alpha_multisynapse_group = new aeif_cond_alpha_multisynapse; - node_vect_.push_back(aeif_cond_alpha_multisynapse_group); - } - else if (model_name == neuron_model_name[i_aeif_psc_exp_multisynapse_model]) { - aeif_psc_exp_multisynapse *aeif_psc_exp_multisynapse_group = new aeif_psc_exp_multisynapse; - node_vect_.push_back(aeif_psc_exp_multisynapse_group); - } - else if (model_name == neuron_model_name[i_aeif_psc_alpha_multisynapse_model]) { - aeif_psc_alpha_multisynapse *aeif_psc_alpha_multisynapse_group = new aeif_psc_alpha_multisynapse; - node_vect_.push_back(aeif_psc_alpha_multisynapse_group); - } - else if (model_name == neuron_model_name[i_user_m1_model]) { - user_m1 *user_m1_group = new user_m1; - node_vect_.push_back(user_m1_group); - } - else if (model_name == neuron_model_name[i_user_m2_model]) { - user_m2 *user_m2_group = new user_m2; - node_vect_.push_back(user_m2_group); - } - else if (model_name == neuron_model_name[i_poisson_generator_model]) { + aeif_psc_exp* aeif_psc_exp_group = new aeif_psc_exp; + node_vect_.push_back( aeif_psc_exp_group ); + } + else if ( model_name == neuron_model_name[ i_aeif_cond_beta_multisynapse_model ] ) + { + aeif_cond_beta_multisynapse* aeif_cond_beta_multisynapse_group = new aeif_cond_beta_multisynapse; + node_vect_.push_back( aeif_cond_beta_multisynapse_group ); + } + else if ( model_name == neuron_model_name[ i_aeif_cond_alpha_multisynapse_model ] ) + { + aeif_cond_alpha_multisynapse* aeif_cond_alpha_multisynapse_group = new aeif_cond_alpha_multisynapse; + 
node_vect_.push_back( aeif_cond_alpha_multisynapse_group ); + } + else if ( model_name == neuron_model_name[ i_aeif_psc_exp_multisynapse_model ] ) + { + aeif_psc_exp_multisynapse* aeif_psc_exp_multisynapse_group = new aeif_psc_exp_multisynapse; + node_vect_.push_back( aeif_psc_exp_multisynapse_group ); + } + else if ( model_name == neuron_model_name[ i_aeif_psc_alpha_multisynapse_model ] ) + { + aeif_psc_alpha_multisynapse* aeif_psc_alpha_multisynapse_group = new aeif_psc_alpha_multisynapse; + node_vect_.push_back( aeif_psc_alpha_multisynapse_group ); + } + else if ( model_name == neuron_model_name[ i_user_m1_model ] ) + { + user_m1* user_m1_group = new user_m1; + node_vect_.push_back( user_m1_group ); + } + else if ( model_name == neuron_model_name[ i_user_m2_model ] ) + { + user_m2* user_m2_group = new user_m2; + node_vect_.push_back( user_m2_group ); + } + else if ( model_name == neuron_model_name[ i_poisson_generator_model ] ) + { n_ports = 0; - poiss_gen *poiss_gen_group = new poiss_gen; - node_vect_.push_back(poiss_gen_group); + poiss_gen* poiss_gen_group = new poiss_gen; + node_vect_.push_back( poiss_gen_group ); } - else if (model_name == neuron_model_name[i_spike_generator_model]) { + else if ( model_name == neuron_model_name[ i_spike_generator_model ] ) + { n_ports = 0; - spike_generator *spike_generator_group = new spike_generator; - node_vect_.push_back(spike_generator_group); + spike_generator* spike_generator_group = new spike_generator; + node_vect_.push_back( spike_generator_group ); } - else if (model_name == neuron_model_name[i_parrot_neuron_model]) { + else if ( model_name == neuron_model_name[ i_parrot_neuron_model ] ) + { n_ports = 2; - parrot_neuron *parrot_neuron_group = new parrot_neuron; - node_vect_.push_back(parrot_neuron_group); + parrot_neuron* parrot_neuron_group = new parrot_neuron; + node_vect_.push_back( parrot_neuron_group ); } - else if (model_name == neuron_model_name[i_spike_detector_model]) { + else if ( model_name == 
neuron_model_name[ i_spike_detector_model ] ) + { n_ports = 1; - spike_detector *spike_detector_group = new spike_detector; - node_vect_.push_back(spike_detector_group); - } - else if (model_name == neuron_model_name[i_izhikevich_model]) { - izhikevich *izhikevich_group = new izhikevich; - node_vect_.push_back(izhikevich_group); - } - else if (model_name == neuron_model_name[i_izhikevich_cond_beta_model]) { - izhikevich_cond_beta *izhikevich_cond_beta_group = new izhikevich_cond_beta; - node_vect_.push_back(izhikevich_cond_beta_group); - } - else if (model_name == neuron_model_name[i_izhikevich_psc_exp_5s_model]) { - izhikevich_psc_exp_5s *izhikevich_psc_exp_5s_group = - new izhikevich_psc_exp_5s; - node_vect_.push_back(izhikevich_psc_exp_5s_group); - } - else if (model_name == neuron_model_name[i_izhikevich_psc_exp_2s_model]) { - izhikevich_psc_exp_2s *izhikevich_psc_exp_2s_group = - new izhikevich_psc_exp_2s; - node_vect_.push_back(izhikevich_psc_exp_2s_group); - } - else if (model_name == neuron_model_name[i_izhikevich_psc_exp_model]) { - izhikevich_psc_exp *izhikevich_psc_exp_group = new izhikevich_psc_exp; - node_vect_.push_back(izhikevich_psc_exp_group); - } - else { - throw ngpu_exception(std::string("Unknown neuron model name: ") - + model_name); - } - return NodeSeq(CreateNodeGroup(n_nodes, n_ports), n_nodes); + spike_detector* spike_detector_group = new spike_detector; + node_vect_.push_back( spike_detector_group ); + } + else if ( model_name == neuron_model_name[ i_izhikevich_model ] ) + { + izhikevich* izhikevich_group = new izhikevich; + node_vect_.push_back( izhikevich_group ); + } + else if ( model_name == neuron_model_name[ i_izhikevich_cond_beta_model ] ) + { + izhikevich_cond_beta* izhikevich_cond_beta_group = new izhikevich_cond_beta; + node_vect_.push_back( izhikevich_cond_beta_group ); + } + else if ( model_name == neuron_model_name[ i_izhikevich_psc_exp_5s_model ] ) + { + izhikevich_psc_exp_5s* izhikevich_psc_exp_5s_group = new 
izhikevich_psc_exp_5s; + node_vect_.push_back( izhikevich_psc_exp_5s_group ); + } + else if ( model_name == neuron_model_name[ i_izhikevich_psc_exp_2s_model ] ) + { + izhikevich_psc_exp_2s* izhikevich_psc_exp_2s_group = new izhikevich_psc_exp_2s; + node_vect_.push_back( izhikevich_psc_exp_2s_group ); + } + else if ( model_name == neuron_model_name[ i_izhikevich_psc_exp_model ] ) + { + izhikevich_psc_exp* izhikevich_psc_exp_group = new izhikevich_psc_exp; + node_vect_.push_back( izhikevich_psc_exp_group ); + } + else + { + throw ngpu_exception( std::string( "Unknown neuron model name: " ) + model_name ); + } + return NodeSeq( CreateNodeGroup( n_nodes, n_ports ), n_nodes ); } -NodeSeq NESTGPU::Create(std::string model_name, int n_nodes, - int n_ports) +NodeSeq +NESTGPU::Create( std::string model_name, int n_nodes, int n_ports ) { - for (int i_host=0; i_host #include #include +#include /////////////////////////////////// // ngpu_exception class definition // in case of errors displays a message and stop the execution ////////////////////////////////// -class ngpu_exception: public std::exception +class ngpu_exception : public std::exception { - const char *Message; // error message - - public: + const char* Message; // error message + +public: // constructors - ngpu_exception(const char *ch) {Message=strdup(ch);} - ngpu_exception(std::string s) {Message=strdup(s.c_str());} + ngpu_exception( const char* ch ) + { + Message = strdup( ch ); + } + ngpu_exception( std::string s ) + { + Message = strdup( s.c_str() ); + } // throw method - virtual const char* what() const throw() + virtual const char* + what() const throw() { return Message; } }; #define BEGIN_TRY try -#define END_TRY catch (ngpu_exception &e){ \ - std::cerr << "Error: " << e.what() << "\n"; } \ - catch (bad_alloc&) { std::cerr << "Error allocating memory." << "\n"; } \ - catch (...) 
{ std::cerr << "Unrecognized error\n"; } - +#define END_TRY \ + catch ( ngpu_exception & e ) \ + { \ + std::cerr << "Error: " << e.what() << "\n"; \ + } \ + catch ( bad_alloc& ) \ + { \ + std::cerr << "Error allocating memory." \ + << "\n"; \ + } \ + catch ( ... ) \ + { \ + std::cerr << "Unrecognized error\n"; \ + } #endif diff --git a/src/node_group.cu b/src/node_group.cu index 774e2cf14..32d3e2803 100644 --- a/src/node_group.cu +++ b/src/node_group.cu @@ -20,48 +20,45 @@ * */ - - - - #include #include #include "cuda_error.h" -#include "node_group.h" #include "nestgpu.h" +#include "node_group.h" -__constant__ NodeGroupStruct NodeGroupArray[MAX_N_NODE_GROUPS]; -__device__ int16_t *NodeGroupMap; +__constant__ NodeGroupStruct NodeGroupArray[ MAX_N_NODE_GROUPS ]; +__device__ int16_t* NodeGroupMap; -__global__ -void NodeGroupMapInit(int16_t *node_group_map) +__global__ void +NodeGroupMapInit( int16_t* node_group_map ) { NodeGroupMap = node_group_map; } -int NESTGPU::NodeGroupArrayInit() +int +NESTGPU::NodeGroupArrayInit() { - CUDAMALLOCCTRL("&d_node_group_map_",&d_node_group_map_, - node_group_map_.size()*sizeof(int16_t)); + CUDAMALLOCCTRL( "&d_node_group_map_", &d_node_group_map_, node_group_map_.size() * sizeof( int16_t ) ); - std::vector ngs_vect; - for (unsigned int i=0; i ngs_vect; + for ( unsigned int i = 0; i < node_vect_.size(); i++ ) + { NodeGroupStruct ngs; - ngs.node_type_ = node_vect_[i]->node_type_; - ngs.i_node_0_ = node_vect_[i]->i_node_0_; - ngs.n_node_ = node_vect_[i]->n_node_; - ngs.n_port_ = node_vect_[i]->n_port_; - ngs.n_param_ = node_vect_[i]->n_param_; - ngs.get_spike_array_ = node_vect_[i]->get_spike_array_; - - ngs.spike_count_ = node_vect_[i]->spike_count_; - ngs.rec_spike_times_ = node_vect_[i]->rec_spike_times_; - ngs.n_rec_spike_times_ = node_vect_[i]->n_rec_spike_times_; - ngs.max_n_rec_spike_times_ = node_vect_[i]->max_n_rec_spike_times_; - ngs.den_delay_arr_ = node_vect_[i]->den_delay_arr_; - - ngs_vect.push_back(ngs); + ngs.node_type_ = 
node_vect_[ i ]->node_type_; + ngs.i_node_0_ = node_vect_[ i ]->i_node_0_; + ngs.n_node_ = node_vect_[ i ]->n_node_; + ngs.n_port_ = node_vect_[ i ]->n_port_; + ngs.n_param_ = node_vect_[ i ]->n_param_; + ngs.get_spike_array_ = node_vect_[ i ]->get_spike_array_; + + ngs.spike_count_ = node_vect_[ i ]->spike_count_; + ngs.rec_spike_times_ = node_vect_[ i ]->rec_spike_times_; + ngs.n_rec_spike_times_ = node_vect_[ i ]->n_rec_spike_times_; + ngs.max_n_rec_spike_times_ = node_vect_[ i ]->max_n_rec_spike_times_; + ngs.den_delay_arr_ = node_vect_[ i ]->den_delay_arr_; + + ngs_vect.push_back( ngs ); } // gpuErrchk( cudaPeekAtLastError() ); @@ -71,40 +68,41 @@ int NESTGPU::NodeGroupArrayInit() // std::cout << this_host_ << "ngs_vect.size(): " << ngs_vect.size() << "\n"; // std::cout << this_host_ << "NodeGroupArray: " << NodeGroupArray << "\n"; - if (ngs_vect.size() > MAX_N_NODE_GROUPS) { - throw ngpu_exception("Number of neuron groups larger than limit."); + if ( ngs_vect.size() > MAX_N_NODE_GROUPS ) + { + throw ngpu_exception( "Number of neuron groups larger than limit." 
); } - - gpuErrchk(cudaMemcpyToSymbolAsync(NodeGroupArray, ngs_vect.data(), - ngs_vect.size()*sizeof(NodeGroupStruct))); + + gpuErrchk( cudaMemcpyToSymbolAsync( NodeGroupArray, ngs_vect.data(), ngs_vect.size() * sizeof( NodeGroupStruct ) ) ); // Memcopy will be synchronized with NodeGroupMapInit kernel - gpuErrchk(cudaMemcpyAsync(d_node_group_map_, node_group_map_.data(), - node_group_map_.size()*sizeof(int16_t), - cudaMemcpyHostToDevice)); + gpuErrchk( cudaMemcpyAsync( + d_node_group_map_, node_group_map_.data(), node_group_map_.size() * sizeof( int16_t ), cudaMemcpyHostToDevice ) ); // temporary gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - NodeGroupMapInit<<<1, 1>>>(d_node_group_map_); + NodeGroupMapInit<<< 1, 1 >>>( d_node_group_map_ ); gpuErrchk( cudaPeekAtLastError() ); return 0; } -double *NESTGPU::InitGetSpikeArray (int n_node, int n_port) +double* +NESTGPU::InitGetSpikeArray( int n_node, int n_port ) { - double *d_get_spike_array = NULL; - if (n_node*n_port > 0) { - CUDAMALLOCCTRL("&d_get_spike_array",&d_get_spike_array, n_node*n_port - *sizeof(double)); + double* d_get_spike_array = nullptr; + if ( n_node * n_port > 0 ) + { + CUDAMALLOCCTRL( "&d_get_spike_array", &d_get_spike_array, n_node * n_port * sizeof( double ) ); } - + return d_get_spike_array; } -int NESTGPU::FreeNodeGroupMap() +int +NESTGPU::FreeNodeGroupMap() { - CUDAFREECTRL("d_node_group_map_",d_node_group_map_); - + CUDAFREECTRL( "d_node_group_map_", d_node_group_map_ ); + return 0; } diff --git a/src/node_group.h b/src/node_group.h index 10db46ed9..5dee5aad1 100644 --- a/src/node_group.h +++ b/src/node_group.h @@ -20,10 +20,6 @@ * */ - - - - #ifndef NODEGROUP_H #define NODEGROUP_H @@ -36,12 +32,12 @@ struct NodeGroupStruct int n_node_; int n_port_; int n_param_; - double *get_spike_array_; - int *spike_count_; - float *rec_spike_times_; - int *n_rec_spike_times_; + double* get_spike_array_; + int* spike_count_; + float* rec_spike_times_; + int* 
n_rec_spike_times_; int max_n_rec_spike_times_; - float *den_delay_arr_; + float* den_delay_arr_; }; #endif diff --git a/src/parrot_neuron.cu b/src/parrot_neuron.cu index a56400277..8a3211bb3 100644 --- a/src/parrot_neuron.cu +++ b/src/parrot_neuron.cu @@ -20,135 +20,133 @@ * */ - - - - -#include #include +#include #include #include -//#include +// #include #include "cuda_error.h" #include "nestgpu.h" #include "neuron_models.h" #include "parrot_neuron.h" #include "spike_buffer.h" -//#include "parrot_neuron_variables.h" +// #include "parrot_neuron_variables.h" -enum { - i_parrot_neuron_hold_spike_height=0, +enum +{ + i_parrot_neuron_hold_spike_height = 0, i_parrot_neuron_den_delay, N_PARROT_NEURON_SCAL_PARAM }; -const std::string parrot_neuron_scal_param_name[N_PARROT_NEURON_SCAL_PARAM] -= {"hold_spike_height", "den_delay"}; +const std::string parrot_neuron_scal_param_name[ N_PARROT_NEURON_SCAL_PARAM ] = { "hold_spike_height", "den_delay" }; -enum { - i_parrot_neuron_input_spike_height=0, +enum +{ + i_parrot_neuron_input_spike_height = 0, i_parrot_neuron_dummy_input, i_parrot_neuron_V, N_PARROT_NEURON_SCAL_VAR }; -const std::string parrot_neuron_scal_var_name[N_PARROT_NEURON_SCAL_VAR] -= {"input_spike_height", "dummy_input", "V"}; - +const std::string parrot_neuron_scal_var_name[ N_PARROT_NEURON_SCAL_VAR ] = { "input_spike_height", + "dummy_input", + "V" }; -__global__ -void parrot_neuron_UpdateKernel(int i_node_0, int n_node, float *var_arr, - float *param_arr, int n_var, int n_param) +__global__ void +parrot_neuron_UpdateKernel( int i_node_0, int n_node, float* var_arr, float* param_arr, int n_var, int n_param ) { int irel_node = threadIdx.x + blockIdx.x * blockDim.x; - if (irel_node < n_node) { - float *input_spike_height_pt = var_arr + irel_node*n_var - + i_parrot_neuron_input_spike_height; - float *V_pt = var_arr + irel_node*n_var + i_parrot_neuron_V; - float *hold_spike_height_pt = param_arr + irel_node*n_param + - i_parrot_neuron_hold_spike_height; + if ( 
irel_node < n_node ) + { + float* input_spike_height_pt = var_arr + irel_node * n_var + i_parrot_neuron_input_spike_height; + float* V_pt = var_arr + irel_node * n_var + i_parrot_neuron_V; + float* hold_spike_height_pt = param_arr + irel_node * n_param + i_parrot_neuron_hold_spike_height; int i_node = i_node_0 + irel_node; float spike_height = *input_spike_height_pt; *V_pt = spike_height; - if (spike_height != 0.0) { - if (*hold_spike_height_pt==0.0) { - spike_height = 1.0; + if ( spike_height != 0.0 ) + { + if ( *hold_spike_height_pt == 0.0 ) + { + spike_height = 1.0; } *input_spike_height_pt = 0; - PushSpike(i_node, spike_height); + PushSpike( i_node, spike_height ); } } } - -int parrot_neuron::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +parrot_neuron::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 2 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 2 /*n_port*/, i_group ); node_type_ = i_parrot_neuron_model; n_scal_var_ = N_PARROT_NEURON_SCAL_VAR; n_var_ = n_scal_var_; scal_var_name_ = parrot_neuron_scal_var_name; - + n_scal_param_ = N_PARROT_NEURON_SCAL_PARAM; n_param_ = n_scal_param_; scal_param_name_ = parrot_neuron_scal_param_name; - CUDAMALLOCCTRL("&var_arr_",&var_arr_, n_node_*n_var_*sizeof(float)); + CUDAMALLOCCTRL( "&var_arr_", &var_arr_, n_node_ * n_var_ * sizeof( float ) ); - CUDAMALLOCCTRL("¶m_arr_",¶m_arr_, n_node_*n_param_*sizeof(float)); + CUDAMALLOCCTRL( "¶m_arr_", ¶m_arr_, n_node_ * n_param_ * sizeof( float ) ); - SetScalParam(0, n_node, "hold_spike_height", 0.0); + SetScalParam( 0, n_node, "hold_spike_height", 0.0 ); - SetScalParam(0, n_node, "den_delay", 0.0); + SetScalParam( 0, n_node, "den_delay", 0.0 ); - SetScalVar(0, n_node, "input_spike_height", 0.0); + SetScalVar( 0, n_node, "input_spike_height", 0.0 ); - SetScalVar(0, n_node, "dummy_input", 0.0); + SetScalVar( 0, n_node, "dummy_input", 0.0 ); - SetScalVar(0, n_node, "V", 0.0); + SetScalVar( 0, 
n_node, "V", 0.0 ); // multiplication factor of input signal is always 1 for all nodes - float input_weight[] = {1.0, 0.0}; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, 2*sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, input_weight, - 2*sizeof(float), cudaMemcpyHostToDevice)); + float input_weight[] = { 1.0, 0.0 }; + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, 2 * sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, input_weight, 2 * sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 1; - + // input signal is stored in input_spike_height - port_input_arr_ = GetVarArr() + GetScalVarIdx("input_spike_height"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "input_spike_height" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 1; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); return 0; } -int parrot_neuron::Update(long long /*i_time*/, double /*t1*/) +int +parrot_neuron::Update( long long /*i_time*/, double /*t1*/ ) { - parrot_neuron_UpdateKernel<<<(n_node_+1023)/1024, 1024>>> - (i_node_0_, n_node_, var_arr_, param_arr_, n_var_, n_param_); - //gpuErrchk( cudaPeekAtLastError() ); - //gpuErrchk( cudaDeviceSynchronize() ); + parrot_neuron_UpdateKernel<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + i_node_0_, n_node_, var_arr_, param_arr_, n_var_, n_param_ ); + // gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk( cudaDeviceSynchronize() ); return 0; } -int parrot_neuron::Free() +int +parrot_neuron::Free() { - CUDAFREECTRL("var_arr_",var_arr_); - CUDAFREECTRL("param_arr_",param_arr_); + CUDAFREECTRL( "var_arr_", var_arr_ ); + CUDAFREECTRL( "param_arr_", param_arr_ ); return 0; } parrot_neuron::~parrot_neuron() { - if (n_node_>0) { + if ( n_node_ > 0 ) + { Free(); } } diff --git a/src/parrot_neuron.h b/src/parrot_neuron.h index 7039abcd1..0cc5149f1 100644 --- a/src/parrot_neuron.h +++ 
b/src/parrot_neuron.h @@ -20,19 +20,14 @@ * */ - - - - #ifndef PARROTNEURON_H #define PARROTNEURON_H #include #include -//#include "node_group.h" +// #include "node_group.h" #include "base_neuron.h" -//#include "neuron_models.h" - +// #include "neuron_models.h" /* BeginUserDocs: neuron, parrot @@ -59,29 +54,26 @@ Remarks - Weights on connections *from* the ``parrot_neuron`` are handled as usual. - Delays are honored on incoming and outgoing connections. -Only spikes arriving on connections to port (``receptor``) 0 will +Only spikes arriving on connections to port (``receptor``) 0 will be repeated. Connections onto port 1 will be accepted, but spikes incoming through port 1 will be ignored. This allows setting -exact pre- and postsynaptic spike times for STDP protocols by +exact pre- and postsynaptic spike times for STDP protocols by connecting two parrot neurons spiking at desired times by, e.g., a `stdp` onto port 1 on the postsynaptic parrot neuron. EndUserDocs */ - class parrot_neuron : public BaseNeuron { - public: +public: ~parrot_neuron(); - int Init(int i_node_0, int n_node, int n_port, int i_group); + int Init( int i_node_0, int n_node, int n_port, int i_group ); int Free(); - - int Update(long long it, double t1); + int Update( long long it, double t1 ); }; - #endif diff --git a/src/poiss_gen.cu b/src/poiss_gen.cu index ac010d7dd..2121d53f6 100644 --- a/src/poiss_gen.cu +++ b/src/poiss_gen.cu @@ -20,480 +20,161 @@ * */ - - - - -#include #include +#include #include -//#include -#include +// #include +#include #include #include -#include +#include -#include "utilities.h" +#include "connect.h" +#include "copass_kernels.h" #include "nestgpu.h" #include "neuron_models.h" #include "poiss_gen.h" #include "poiss_gen_variables.h" -#include "copass_kernels.h" -#include "connect.h" +#include "utilities.h" extern __constant__ double NESTGPUTime; extern __constant__ float NESTGPUTimeResolution; extern __constant__ NodeGroupStruct NodeGroupArray[]; -extern __device__ 
int16_t *NodeGroupMap; +extern __device__ int16_t* NodeGroupMap; namespace poiss_conn { - typedef uint key_t; - typedef regular_block_array array_t; - key_t **d_poiss_key_array_data_pt; - array_t *d_poiss_subarray; - int64_t *d_poiss_num; - int64_t *d_poiss_sum; - key_t *d_poiss_thresh; -}; - -// max delay functor -struct MaxDelay +Connection* conn_; +// typedef uint key_t; +// typedef regular_block_array array_t; +// key_t **d_poiss_key_array_data_pt; +// array_t *d_poiss_subarray; +void* d_poiss_key_array_data_pt; +void* d_poiss_subarray; + +int64_t* d_poiss_num; +int64_t* d_poiss_sum; +// key_t *d_poiss_thresh; +void* d_poiss_thresh; +int +organizeDirectConnections( Connection* conn ) { - //template - __device__ __forceinline__ - //T operator()(const T &source_delay_a, const T &source_delay_b) const { - uint operator()(const uint &source_delay_a, const uint &source_delay_b) - const { - uint i_delay_a = source_delay_a & PortSynMask; - uint i_delay_b = source_delay_b & PortSynMask; - return (i_delay_b > i_delay_a) ? 
i_delay_b : i_delay_a; - } -}; + conn_ = conn; + return conn->organizeDirectConnections( + d_poiss_key_array_data_pt, d_poiss_subarray, d_poiss_num, d_poiss_sum, d_poiss_thresh ); +} +}; // namespace poiss_conn -__global__ void SetupPoissKernel(curandState *curand_state, uint64_t n_conn, - unsigned long long seed) +__global__ void +SetupPoissKernel( curandState* curand_state, uint64_t n_conn, unsigned long long seed ) { - uint64_t blockId = (uint64_t)blockIdx.y * gridDim.x + blockIdx.x; + uint64_t blockId = ( uint64_t ) blockIdx.y * gridDim.x + blockIdx.x; uint64_t i_conn = blockId * blockDim.x + threadIdx.x; - if (i_conn=start) && (t_rel<=stop)) { - int it = (int)(time_idx % max_delay); - mu_arr[it*n_node + i_node] = NESTGPUTimeResolution*rate/1000.0; + if ( ( t_rel >= start ) && ( t_rel <= stop ) ) + { + int it = ( int ) ( time_idx % max_delay ); + mu_arr[ it * n_node + i_node ] = NESTGPUTimeResolution * rate / 1000.0; } } } -__global__ void PoissGenSubstractFirstNodeIndexKernel(int64_t n_conn, - uint *poiss_key_array, - int i_node_0) +int +poiss_gen::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - uint64_t blockId = (uint64_t)blockIdx.y * gridDim.x + blockIdx.x; - uint64_t i_conn_rel = blockId * blockDim.x + threadIdx.x; - if (i_conn_rel >= n_conn) { - return; - } - uint source_delay = poiss_key_array[i_conn_rel]; - int i_source_rel = (source_delay >> MaxPortSynNBits) - i_node_0; - int i_delay = source_delay & PortSynMask; - poiss_key_array[i_conn_rel] = (i_source_rel << MaxPortSynNBits) | i_delay; -} + BaseNeuron::Init( i_node_0, n_node, 0 /*n_port*/, i_group ); + node_type_ = i_poisson_generator_model; + n_scal_param_ = N_POISS_GEN_SCAL_PARAM; + n_param_ = n_scal_param_; + scal_param_name_ = poiss_gen_scal_param_name; + has_dir_conn_ = true; -/* -__global__ void PoissGenSendSpikeKernel(curandState *curand_state, double t, - float time_step, float *param_arr, - int n_param, - DirectConnection *dir_conn_array, - uint64_t n_dir_conn) -{ - 
uint64_t blockId = (uint64_t)blockIdx.y * gridDim.x + blockIdx.x; - uint64_t i_conn = blockId * blockDim.x + threadIdx.x; - if (i_conn=start) && (t_rel<=stop)){ - int n = curand_poisson(curand_state+i_conn, time_step*rate); - if (n>0) { // //Send direct spike (i_target, port, weight*n); - ///////////////////////////////////////////////////////////////// - int i_group=NodeGroupMap[i_target]; - int i = port*NodeGroupArray[i_group].n_node_ + i_target - - NodeGroupArray[i_group].i_node_0_; - double d_val = (double)(weight*n); - atomicAddDouble(&NodeGroupArray[i_group].get_spike_array_[i], d_val); - //////////////////////////////////////////////////////////////// - } - } - } + SetScalParam( 0, n_node, "rate", 0.0 ); + SetScalParam( 0, n_node, "origin", 0.0 ); + SetScalParam( 0, n_node, "start", 0.0 ); + SetScalParam( 0, n_node, "stop", 1.0e30 ); + + return 0; } -*/ -__global__ void PoissGenSendSpikeKernel(curandState *curand_state, - long long time_idx, - float *mu_arr, - uint *poiss_key_array, - int64_t n_conn, int64_t i_conn_0, - int64_t block_size, int n_node, - int max_delay) +int +poiss_gen::buildDirectConnections() { - uint64_t blockId = (uint64_t)blockIdx.y * gridDim.x + blockIdx.x; - uint64_t i_conn_rel = blockId * blockDim.x + threadIdx.x; - if (i_conn_rel >= n_conn) { - return; - } - uint source_delay = poiss_key_array[i_conn_rel]; - int i_source = source_delay >> MaxPortSynNBits; - int i_delay = source_delay & PortSynMask; - int id = (int)((time_idx - i_delay + 1) % max_delay); - float mu = mu_arr[id*n_node + i_source]; - int n = curand_poisson(curand_state+i_conn_rel, mu); - if (n>0) { - int64_t i_conn = i_conn_0 + i_conn_rel; - int i_block = (int)(i_conn / block_size); - int64_t i_block_conn = i_conn % block_size; - connection_struct conn = ConnectionArray[i_block][i_block_conn]; - uint target_port_syn = conn.target_port_syn; - int i_target = target_port_syn >> MaxPortSynNBits; - uint port = (target_port_syn & PortSynMask) >> MaxSynNBits; - float weight = 
conn.weight; - - int i_group=NodeGroupMap[i_target]; - int i = port*NodeGroupArray[i_group].n_node_ + i_target - - NodeGroupArray[i_group].i_node_0_; - double d_val = (double)(weight*n); - atomicAddDouble(&NodeGroupArray[i_group].get_spike_array_[i], d_val); - } + // printf("i_node_0_ %d n_node_ %d i_conn0_ %ld n_dir_conn_ %ld + // max_delay_ %d\n", + // i_node_0_, n_node_, i_conn0_, n_dir_conn_, max_delay_); + return poiss_conn::conn_->buildDirectConnections( + i_node_0_, n_node_, i_conn0_, n_dir_conn_, max_delay_, d_mu_arr_, d_poiss_key_array_ ); } -int poiss_gen::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +poiss_gen::SendDirectSpikes( long long time_idx ) { - BaseNeuron::Init(i_node_0, n_node, 0 /*n_port*/, i_group); - node_type_ = i_poisson_generator_model; - n_scal_param_ = N_POISS_GEN_SCAL_PARAM; - n_param_ = n_scal_param_; - scal_param_name_ = poiss_gen_scal_param_name; - has_dir_conn_ = true; - - CUDAMALLOCCTRL("¶m_arr_",¶m_arr_, n_node_*n_param_*sizeof(float)); - - SetScalParam(0, n_node, "rate", 0.0); - SetScalParam(0, n_node, "origin", 0.0); - SetScalParam(0, n_node, "start", 0.0); - SetScalParam(0, n_node, "stop", 1.0e30); - - return 0; + return poiss_conn::conn_->sendDirectSpikes( + time_idx, i_conn0_, n_dir_conn_, n_node_, max_delay_, d_mu_arr_, d_poiss_key_array_, d_curand_state_ ); } -int poiss_gen::Calibrate(double, float) +int +poiss_gen::Calibrate( double, float ) { - //buildDirectConnections(); - CUDAMALLOCCTRL("&d_curand_state_",&d_curand_state_, n_conn_*sizeof(curandState)); + CUDAMALLOCCTRL( "&d_curand_state_", &d_curand_state_, n_dir_conn_ * sizeof( curandState ) ); unsigned int grid_dim_x, grid_dim_y; - if (n_conn_<65536*1024) { // max grid dim * max block dim - grid_dim_x = (n_conn_+1023)/1024; + if ( n_dir_conn_ < 65536 * 1024 ) + { // max grid dim * max block dim + grid_dim_x = ( n_dir_conn_ + 1023 ) / 1024; grid_dim_y = 1; } - else { + else + { grid_dim_x = 64; // I think it's not necessary to increase it - if 
(n_conn_>grid_dim_x*1024*65535) { - throw ngpu_exception(std::string("Number of direct connections ") - + std::to_string(n_conn_) + - " larger than threshold " - + std::to_string(grid_dim_x*1024*65535)); + if ( n_dir_conn_ > grid_dim_x * 1024 * 65535 ) + { + throw ngpu_exception( std::string( "Number of direct connections " ) + std::to_string( n_dir_conn_ ) + + " larger than threshold " + std::to_string( grid_dim_x * 1024 * 65535 ) ); } - grid_dim_y = (n_conn_ + grid_dim_x*1024 -1) / (grid_dim_x*1024); + grid_dim_y = ( n_dir_conn_ + grid_dim_x * 1024 - 1 ) / ( grid_dim_x * 1024 ); } - dim3 numBlocks(grid_dim_x, grid_dim_y); - - unsigned int *d_seed; + dim3 numBlocks( grid_dim_x, grid_dim_y ); + + unsigned int* d_seed; unsigned int h_seed; - CUDAMALLOCCTRL("&d_seed",&d_seed, sizeof(unsigned int)); - CURAND_CALL(curandGenerate(*random_generator_, d_seed, 1)); + CUDAMALLOCCTRL( "&d_seed", &d_seed, sizeof( unsigned int ) ); + CURAND_CALL( curandGenerate( *random_generator_, d_seed, 1 ) ); // Copy seed from device memory to host - gpuErrchk(cudaMemcpy(&h_seed, d_seed, sizeof(unsigned int), - cudaMemcpyDeviceToHost)); - //std::cout << "h_seed: " << h_seed << "\n"; - - SetupPoissKernel<<>>(d_curand_state_, n_conn_, h_seed); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - - return 0; -} - + gpuErrchk( cudaMemcpy( &h_seed, d_seed, sizeof( unsigned int ), cudaMemcpyDeviceToHost ) ); + // std::cout << "h_seed: " << h_seed << "\n"; -int poiss_gen::Update(long long it, double) -{ - PoissGenUpdateKernel<<<(n_node_+1023)/1024, 1024>>> - (it, n_node_, max_delay_, param_arr_, n_param_, d_mu_arr_); - DBGCUDASYNC - - return 0; -} - -/* -int poiss_gen::SendDirectSpikes(double t, float time_step) -{ - unsigned int grid_dim_x, grid_dim_y; - - if (n_dir_conn_<65536*1024) { // max grid dim * max block dim - grid_dim_x = (n_dir_conn_+1023)/1024; - grid_dim_y = 1; - } - else { - grid_dim_x = 64; // I think it's not necessary to increase it - if 
(n_dir_conn_>grid_dim_x*1024*65535) { - throw ngpu_exception(std::string("Number of direct connections ") - + std::to_string(n_dir_conn_) + - " larger than threshold " - + std::to_string(grid_dim_x*1024*65535)); - } - grid_dim_y = (n_dir_conn_ + grid_dim_x*1024 -1) / (grid_dim_x*1024); - } - dim3 numBlocks(grid_dim_x, grid_dim_y); - PoissGenSendSpikeKernel<<>>(d_curand_state_, t, time_step, - param_arr_, n_param_, - d_dir_conn_array_, n_dir_conn_); - + SetupPoissKernel<<< numBlocks, 1024 >>>( d_curand_state_, n_dir_conn_, h_seed ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); return 0; } -*/ -int poiss_gen::SendDirectSpikes(long long time_idx) +int +poiss_gen::Update( long long it, double ) { - unsigned int grid_dim_x, grid_dim_y; - - if (n_conn_<65536*1024) { // max grid dim * max block dim - grid_dim_x = (n_conn_+1023)/1024; - grid_dim_y = 1; - } - else { - grid_dim_x = 64; // I think it's not necessary to increase it - if (n_conn_>grid_dim_x*1024*65535) { - throw ngpu_exception(std::string("Number of direct connections ") - + std::to_string(n_conn_) + - " larger than threshold " - + std::to_string(grid_dim_x*1024*65535)); - } - grid_dim_y = (n_conn_ + grid_dim_x*1024 -1) / (grid_dim_x*1024); - } - dim3 numBlocks(grid_dim_x, grid_dim_y); - PoissGenSendSpikeKernel<<>> - (d_curand_state_, - time_idx, d_mu_arr_, d_poiss_key_array_, - n_conn_, i_conn0_, - h_ConnBlockSize, n_node_, max_delay_); - + PoissGenUpdateKernel<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + it, n_node_, max_delay_, param_arr_, n_param_, d_mu_arr_ ); DBGCUDASYNC return 0; } - - - -namespace poiss_conn -{ - int OrganizeDirectConnections() - { - uint k = KeySubarray.size(); - int64_t n = NConn; - int64_t block_size = h_ConnBlockSize; - - key_t **key_subarray = KeySubarray.data(); - - - CUDAMALLOCCTRL("&d_poiss_key_array_data_pt",&d_poiss_key_array_data_pt, k*sizeof(key_t*)); - gpuErrchk(cudaMemcpy(d_poiss_key_array_data_pt, key_subarray, - k*sizeof(key_t*), 
cudaMemcpyHostToDevice)); - - array_t h_poiss_subarray[k]; - for (uint i=0; i - (poiss_conn::d_poiss_subarray, k, &poiss_conn::d_poiss_thresh[0], d_num0, - &poiss_conn::d_poiss_sum[0]); - CUDASYNC - - search_multi_down - (poiss_conn::d_poiss_subarray, k, &poiss_conn::d_poiss_thresh[1], d_num1, - &poiss_conn::d_poiss_sum[1]); - CUDASYNC - - gpuErrchk(cudaMemcpy(h_poiss_num, poiss_conn::d_poiss_num, - 2*k*sizeof(int64_t), cudaMemcpyDeviceToHost)); - - i_conn0_ = 0; - int64_t i_conn1 = 0; - uint ib0 = 0; - uint ib1 = 0; - for (uint i=0; i0) { - CUDAMALLOCCTRL("&d_poiss_key_array_",&d_poiss_key_array_, n_conn_*sizeof(key_t)); - - int64_t offset = 0; - for (uint ib=ib0; ib<=ib1; ib++) { - if (ib==ib0 && ib==ib1) { - gpuErrchk(cudaMemcpy(d_poiss_key_array_, key_subarray[ib] + h_num0[ib], - n_conn_*sizeof(key_t), cudaMemcpyDeviceToDevice)); - break; - } - else if (ib==ib0) { - offset = block_size - h_num0[ib]; - gpuErrchk(cudaMemcpy(d_poiss_key_array_, key_subarray[ib] + h_num0[ib], - offset*sizeof(key_t), - cudaMemcpyDeviceToDevice)); - } - else if (ib==ib1) { - gpuErrchk(cudaMemcpy(d_poiss_key_array_ + offset, - key_subarray[ib], - h_num1[ib]*sizeof(key_t), - cudaMemcpyDeviceToDevice)); - break; - } - else { - gpuErrchk(cudaMemcpy(d_poiss_key_array_ + offset, - key_subarray[ib], - block_size*sizeof(key_t), - cudaMemcpyDeviceToDevice)); - offset += block_size; - } - } - - unsigned int grid_dim_x, grid_dim_y; - - if (n_conn_<65536*1024) { // max grid dim * max block dim - grid_dim_x = (n_conn_+1023)/1024; - grid_dim_y = 1; - } - else { - grid_dim_x = 64; // I think it's not necessary to increase it - if (n_conn_>grid_dim_x*1024*65535) { - throw ngpu_exception(std::string("Number of direct connections ") - + std::to_string(n_conn_) + - " larger than threshold " - + std::to_string(grid_dim_x*1024*65535)); - } - grid_dim_y = (n_conn_ + grid_dim_x*1024 -1) / (grid_dim_x*1024); - } - dim3 numBlocks(grid_dim_x, grid_dim_y); - PoissGenSubstractFirstNodeIndexKernel<<>> - 
(n_conn_, d_poiss_key_array_, i_node_0_); - DBGCUDASYNC - - } - - // Find maximum delay of poisson direct connections - uint *d_max_delay; // maximum delay pointer in device memory - CUDAMALLOCCTRL("&d_max_delay",&d_max_delay, sizeof(int)); - MaxDelay max_op; // comparison operator used by Reduce function - // Determine temporary device storage requirements - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, - d_poiss_key_array_, d_max_delay, n_conn_, max_op, - INT_MIN); - // Allocate temporary storage - CUDAMALLOCCTRL("&d_temp_storage",&d_temp_storage, temp_storage_bytes); - // Run reduction - cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, - d_poiss_key_array_, d_max_delay, n_conn_, max_op, - INT_MIN); - gpuErrchk(cudaMemcpy(&max_delay_, d_max_delay, sizeof(int), - cudaMemcpyDeviceToHost)); - - // max_delay_ = 200; - printf("Max delay of direct (poisson generator) connections: %d\n", - max_delay_); - CUDAMALLOCCTRL("&d_mu_arr_",&d_mu_arr_, n_node_*max_delay_*sizeof(float)); - gpuErrchk(cudaMemset(d_mu_arr_, 0, n_node_*max_delay_*sizeof(float))); - - /* - CUDAFREECTRL("d_key_array_data_pt",d_key_array_data_pt); - CUDAFREECTRL("d_subarray",d_subarray); - CUDAFREECTRL("d_num",d_num); - CUDAFREECTRL("d_sum",d_sum); - CUDAFREECTRL("d_thresh",d_thresh); - */ - - return 0; -} diff --git a/src/poiss_gen.h b/src/poiss_gen.h index 376e57015..b3be2c55c 100644 --- a/src/poiss_gen.h +++ b/src/poiss_gen.h @@ -20,21 +20,19 @@ * */ - - - - #ifndef POISSGEN_H #define POISSGEN_H -#include -#include -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "connect.h" +#include "copass_kernels.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include +#include +#include +#include /* const int N_POISS_GEN_SCAL_PARAM = 4; @@ -46,10 +44,6 @@ const std::string poiss_gen_scal_param_name[] = { }; */ -namespace poiss_conn 
-{ - int OrganizeDirectConnections(); -}; /* BeginUserDocs: device, generator Short description @@ -83,24 +77,24 @@ EndUserDocs */ class poiss_gen : public BaseNeuron { - curandState *d_curand_state_; - uint *d_poiss_key_array_; + // Connection *conn_; + curandState* d_curand_state_; + void* d_poiss_key_array_; int64_t i_conn0_; - int64_t n_conn_; - float *d_mu_arr_; + int64_t n_dir_conn_; + float* d_mu_arr_; int max_delay_; - - public: - - int Init(int i_node_0, int n_node, int n_port, int i_group); - - int Calibrate(double, float); - - int Update(long long it, double t1); - int SendDirectSpikes(long long time_idx); - int buildDirectConnections(); -}; +public: + int Init( int i_node_0, int n_node, int n_port, int i_group ); + + int Calibrate( double, float ); + + int Update( long long it, double t1 ); + int SendDirectSpikes( long long time_idx ); + + int buildDirectConnections(); +}; #endif diff --git a/src/poiss_gen_variables.h b/src/poiss_gen_variables.h index c85b5dddc..46f114916 100644 --- a/src/poiss_gen_variables.h +++ b/src/poiss_gen_variables.h @@ -20,16 +20,13 @@ * */ - - - - #ifndef POISSGENVARIABLES_H #define POISSGENVARIABLES_H #include -enum { +enum +{ i_rate = 0, i_origin, i_start, @@ -37,16 +34,16 @@ enum { N_POISS_GEN_SCAL_PARAM }; -const std::string poiss_gen_scal_param_name[N_POISS_GEN_SCAL_PARAM] = { +const std::string poiss_gen_scal_param_name[ N_POISS_GEN_SCAL_PARAM ] = { "rate", "origin", "start", "stop", }; -#define rate param[i_rate] -#define origin param[i_origin] -#define start param[i_start] -#define stop param[i_stop] +#define rate param[ i_rate ] +#define origin param[ i_origin ] +#define start param[ i_start ] +#define stop param[ i_stop ] #endif diff --git a/src/prefix_scan.cu b/src/prefix_scan.cu index a7ce982d4..918ca8df3 100644 --- a/src/prefix_scan.cu +++ b/src/prefix_scan.cu @@ -20,37 +20,36 @@ * */ - - - - -#include -#include #include "prefix_scan.h" #include "scan.h" +#include +#include const unsigned int PrefixScan::AllocSize 
= 13 * 1048576 / 2; -int PrefixScan::Init() +int +PrefixScan::Init() { - //printf("Initializing CUDA-C scan...\n\n"); - //initScan(); - + // printf("Initializing CUDA-C scan...\n\n"); + // initScan(); + return 0; } -int PrefixScan::Scan(int *d_Output, int *d_Input, int n) +int +PrefixScan::Scan( int* d_Output, int* d_Input, int n ) { - prefix_scan(d_Output, d_Input, n, true); + prefix_scan( d_Output, d_Input, n, true ); return 0; } -int PrefixScan::Free() +int +PrefixScan::Free() { - //closeScan(); - //CUDAFREECTRL("d_Output",d_Output); - //CUDAFREECTRL("d_Input",d_Input); - + // closeScan(); + // CUDAFREECTRL("d_Output",d_Output); + // CUDAFREECTRL("d_Input",d_Input); + return 0; } diff --git a/src/prefix_scan.h b/src/prefix_scan.h index 5caf9c721..1c5ea8ae4 100644 --- a/src/prefix_scan.h +++ b/src/prefix_scan.h @@ -20,16 +20,12 @@ * */ - - - - #ifndef PREFIXSCAN_H #define PREFIXSCAN_H class PrefixScan { - public: +public: static const unsigned int AllocSize; /* @@ -43,10 +39,10 @@ class PrefixScan uint *h_OutputGPU; */ - + int Init(); - int Scan(int *d_Output, int *d_Input, int n); + int Scan( int* d_Output, int* d_Input, int n ); int Free(); }; diff --git a/src/propagate_error.h b/src/propagate_error.h index 694e95bfd..4456b3a1e 100644 --- a/src/propagate_error.h +++ b/src/propagate_error.h @@ -1,32 +1,36 @@ #ifndef PROPAGATEERROR_H #define PROPAGATEERROR_H -#define BEGIN_ERR_PROP \ - checkNESTGPUInstance(); \ - NESTGPU_instance->SetErrorFlag(false); \ - NESTGPU_instance->SetErrorMessage(""); \ - NESTGPU_instance->SetErrorCode(0); \ +#define BEGIN_ERR_PROP \ + checkNESTGPUInstance(); \ + NESTGPU_instance->SetErrorFlag( false ); \ + NESTGPU_instance->SetErrorMessage( "" ); \ + NESTGPU_instance->SetErrorCode( 0 ); \ try -#define END_ERR_PROP \ - catch (ngpu_exception &e){ \ - NESTGPU_instance->SetErrorFlag(true); \ - NESTGPU_instance->SetErrorMessage(e.what()); \ - NESTGPU_instance->SetErrorCode(2); \ - } \ - catch (std::bad_alloc&) { \ - 
NESTGPU_instance->SetErrorFlag(true); \ - NESTGPU_instance->SetErrorMessage("Memory allocation error."); \ - NESTGPU_instance->SetErrorCode(1); \ - } \ - catch (...) { \ - NESTGPU_instance->SetErrorFlag(true); \ - NESTGPU_instance->SetErrorMessage("Error in NESTGPU function."); \ - NESTGPU_instance->SetErrorCode(255); \ - } \ - if (NESTGPU_instance->OnException() == ON_EXCEPTION_EXIT) { \ +#define END_ERR_PROP \ + catch ( ngpu_exception & e ) \ + { \ + NESTGPU_instance->SetErrorFlag( true ); \ + NESTGPU_instance->SetErrorMessage( e.what() ); \ + NESTGPU_instance->SetErrorCode( 2 ); \ + } \ + catch ( std::bad_alloc& ) \ + { \ + NESTGPU_instance->SetErrorFlag( true ); \ + NESTGPU_instance->SetErrorMessage( "Memory allocation error." ); \ + NESTGPU_instance->SetErrorCode( 1 ); \ + } \ + catch ( ... ) \ + { \ + NESTGPU_instance->SetErrorFlag( true ); \ + NESTGPU_instance->SetErrorMessage( "Error in NESTGPU function." ); \ + NESTGPU_instance->SetErrorCode( 255 ); \ + } \ + if ( NESTGPU_instance->OnException() == ON_EXCEPTION_EXIT ) \ + { \ std::cerr << NESTGPU_instance->GetErrorMessage(); \ - exit(NESTGPU_instance->GetErrorCode()); \ + exit( NESTGPU_instance->GetErrorCode() ); \ } #endif diff --git a/src/propagator_stability.cu b/src/propagator_stability.cu index c12a3966c..5f5404707 100644 --- a/src/propagator_stability.cu +++ b/src/propagator_stability.cu @@ -26,22 +26,19 @@ #include // Includes from libnestutil: -//#include "numerics.h" +// #include "numerics.h" -__device__ -double propagator_32( double tau_syn, double tau, double C, double h ) +__device__ double +propagator_32( double tau_syn, double tau, double C, double h ) { - const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h - * ( tau_syn - tau ) * exp( -h / tau ); + const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h * ( tau_syn - tau ) * exp( -h / tau ); const double P32_singular = h / C * exp( -h / tau ); const double P32 = - -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn 
) - * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); + -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn ) * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); const double dev_P32 = fabs( P32 - P32_singular ); - if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && dev_P32 > 2.0 - * fabs( P32_linear ) ) ) + if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && dev_P32 > 2.0 * fabs( P32_linear ) ) ) { return P32_singular; } @@ -51,13 +48,14 @@ double propagator_32( double tau_syn, double tau, double C, double h ) } } -__device__ -double propagator_31( double tau_syn, double tau, double C, double h ) +__device__ double +propagator_31( double tau_syn, double tau, double C, double h ) { const double P31_linear = 1.0 / ( 3.0 * C * tau * tau ) * h * h * h * ( tau_syn - tau ) * exp( -h / tau ); - const double P31 = - 1.0 / C * ( exp( -h / tau_syn ) * expm1( -h / tau + h / tau_syn ) / ( tau / tau_syn - 1.0 ) * tau - - h * exp( -h / tau_syn ) ) / ( -1.0 - -tau / tau_syn ) * tau; + const double P31 = 1.0 / C + * ( exp( -h / tau_syn ) * expm1( -h / tau + h / tau_syn ) / ( tau / tau_syn - 1.0 ) * tau + - h * exp( -h / tau_syn ) ) + / ( -1.0 - -tau / tau_syn ) * tau; const double P31_singular = h * h / 2.0 / C * exp( -h / tau ); const double dev_P31 = fabs( P31 - P31_singular ); diff --git a/src/propagator_stability.h b/src/propagator_stability.h index bb5e2422f..640c2c745 100644 --- a/src/propagator_stability.h +++ b/src/propagator_stability.h @@ -24,9 +24,7 @@ #define PROPAGATOR_STABILITY_H // Propagators to handle similar tau_m and tau_syn_* time constants. 
-__device__ -double propagator_31( double tau_syn, double tau, double C, double h ); -__device__ -double propagator_32( double tau_syn, double tau, double C, double h ); +__device__ double propagator_31( double tau_syn, double tau, double C, double h ); +__device__ double propagator_32( double tau_syn, double tau, double C, double h ); #endif diff --git a/src/random.cu b/src/random.cu index 15ef4d9e8..52fbfdc70 100644 --- a/src/random.cu +++ b/src/random.cu @@ -1,77 +1,72 @@ +#include "cuda_error.h" #include -#include -#include #include #include -#include "cuda_error.h" +#include +#include -unsigned int *curand_int(curandGenerator_t &gen, size_t n) +unsigned int* +curand_int( curandGenerator_t& gen, size_t n ) { - unsigned int *dev_data; + unsigned int* dev_data; // Allocate n integers on host - unsigned int *host_data = new unsigned int[n]; - + unsigned int* host_data = new unsigned int[ n ]; + // Allocate n integers on device - CUDA_CALL(cudaMalloc((void **)&dev_data, n*sizeof(unsigned int))); - // Create pseudo-random number generator + CUDAMALLOCCTRL( "&dev_data", ( void** ) &dev_data, n * sizeof( unsigned int ) ); // Generate n integers on device - CURAND_CALL(curandGenerate(gen, dev_data, n)); - //cudaDeviceSynchronize(); - // Copy device memory to host - CUDA_CALL(cudaMemcpy(host_data, dev_data, n*sizeof(unsigned int), - cudaMemcpyDeviceToHost)); + CURAND_CALL( curandGenerate( gen, dev_data, n ) ); + // cudaDeviceSynchronize(); + // Copy device memory to host + CUDA_CALL( cudaMemcpy( host_data, dev_data, n * sizeof( unsigned int ), cudaMemcpyDeviceToHost ) ); // Cleanup - CUDA_CALL(cudaFree(dev_data)); - + CUDAFREECTRL( "dev_data", dev_data ); + return host_data; } -float *curand_uniform(curandGenerator_t &gen, size_t n) +float* +curand_uniform( curandGenerator_t& gen, size_t n ) { - float *dev_data; + float* dev_data; // Allocate n floats on host - float *host_data = new float[n]; - + float* host_data = new float[ n ]; + // Allocate n floats on device - 
CUDA_CALL(cudaMalloc((void **)&dev_data, n*sizeof(float))); - // Create pseudo-random number generator + CUDAMALLOCCTRL( "&dev_data", ( void** ) &dev_data, n * sizeof( float ) ); // Generate n integers on device - CURAND_CALL(curandGenerateUniform(gen, dev_data, n)); - //cudaDeviceSynchronize(); - // Copy device memory to host - CUDA_CALL(cudaMemcpy(host_data, dev_data, n*sizeof(float), - cudaMemcpyDeviceToHost)); + CURAND_CALL( curandGenerateUniform( gen, dev_data, n ) ); + // cudaDeviceSynchronize(); + // Copy device memory to host + CUDA_CALL( cudaMemcpy( host_data, dev_data, n * sizeof( float ), cudaMemcpyDeviceToHost ) ); // Cleanup - CUDA_CALL(cudaFree(dev_data)); - + CUDAFREECTRL( "dev_data", dev_data ); + return host_data; } -float *curand_normal(curandGenerator_t &gen, size_t n, float mean, - float stddev) +float* +curand_normal( curandGenerator_t& gen, size_t n, float mean, float stddev ) { - size_t n1 = ( (n % 2) == 0 ) ? n : n + 1; // round up to multiple of 2 - float *dev_data; + size_t n1 = ( ( n % 2 ) == 0 ) ? 
n : n + 1; // round up to multiple of 2 + float* dev_data; // Allocate n floats on host - float *host_data = new float[n]; - + float* host_data = new float[ n ]; + // Allocate n1 floats on device - CUDA_CALL(cudaMalloc((void **)&dev_data, n1*sizeof(float))); - // Create pseudo-random number generator + CUDAMALLOCCTRL( "&dev_data", ( void** ) &dev_data, n1 * sizeof( float ) ); // Generate n1 integers on device - //printf("curandGenerateNormal n1: %d\tmean: %f\tstd: %f\n", (int)n1, mean, + // printf("curandGenerateNormal n1: %d\tmean: %f\tstd: %f\n", (int)n1, mean, // stddev); - CURAND_CALL(curandGenerateNormal(gen, dev_data, n1, mean, stddev)); - //cudaDeviceSynchronize(); - // Copy device memory to host - CUDA_CALL(cudaMemcpy(host_data, dev_data, n*sizeof(float), - cudaMemcpyDeviceToHost)); + CURAND_CALL( curandGenerateNormal( gen, dev_data, n1, mean, stddev ) ); + // cudaDeviceSynchronize(); + // Copy device memory to host + CUDA_CALL( cudaMemcpy( host_data, dev_data, n * sizeof( float ), cudaMemcpyDeviceToHost ) ); // Cleanup - CUDA_CALL(cudaFree(dev_data)); - + CUDAFREECTRL( "dev_data", dev_data ); + return host_data; } - diff --git a/src/random.h b/src/random.h index 38407daec..53ad4bf74 100644 --- a/src/random.h +++ b/src/random.h @@ -2,11 +2,10 @@ #define RANDOM_H #include -unsigned int *curand_int(curandGenerator_t &gen, size_t n); +unsigned int* curand_int( curandGenerator_t& gen, size_t n ); -float *curand_uniform(curandGenerator_t &gen, size_t n); +float* curand_uniform( curandGenerator_t& gen, size_t n ); -float *curand_normal(curandGenerator_t &gen, size_t n, float mean, - float stddev); +float* curand_normal( curandGenerator_t& gen, size_t n, float mean, float stddev ); #endif diff --git a/src/remote_connect.cu b/src/remote_connect.cu index 8823a5e53..302faaf02 100644 --- a/src/remote_connect.cu +++ b/src/remote_connect.cu @@ -1,4 +1,3 @@ -//#define CHECKRC #include #include @@ -23,25 +22,17 @@ // elements, which are allocated dynamically __constant__ 
uint node_map_block_size; // = 100000; -uint h_node_map_block_size; // = 100000; // number of elements in the map for each source host // n_remote_source_node_map[i_source_host] // with i_source_host = 0, ..., n_hosts-1 excluding this host itself -__device__ uint *n_remote_source_node_map; // [n_hosts]; -uint *d_n_remote_source_node_map; -std::vector h_n_remote_source_node_map; +__device__ uint* n_remote_source_node_map; // [n_hosts]; // remote_source_node_map[i_source_host][i_block][i] -std::vector< std::vector > h_remote_source_node_map; -__device__ int ***remote_source_node_map; +__device__ uint*** remote_source_node_map; // local_spike_buffer_map[i_source_host][i_block][i] -std::vector< std::vector > h_local_spike_buffer_map; -__device__ int ***local_spike_buffer_map; -int ***d_local_spike_buffer_map; -// hd_local_spike_buffer_map[i_source_host] vector of pointers to gpu memory -std::vector hd_local_spike_buffer_map; +__device__ uint*** local_spike_buffer_map; // Define two arrays that map local source nodes to remote spike buffers. 
// The structure is the same as for remote source nodes @@ -49,882 +40,260 @@ std::vector hd_local_spike_buffer_map; // number of elements in the map for each target host // n_local_source_node_map[i_target_host] // with i_target_host = 0, ..., n_hosts-1 excluding this host itself -__device__ uint *n_local_source_node_map; // [n_hosts]; -uint *d_n_local_source_node_map; -std::vector h_n_local_source_node_map; +__device__ uint* n_local_source_node_map; // [n_hosts]; // local_source_node_map[i_target_host][i_block][i] -std::vector< std::vector > h_local_source_node_map; -__device__ int ***local_source_node_map; -int ***d_local_source_node_map; -// hd_local_source_node_map[i_target_host] vector of pointers to gpu memory -std::vector hd_local_source_node_map; +__device__ uint*** local_source_node_map; +__constant__ uint n_local_nodes; // number of local nodes -// number of remote target hosts on which each local node -// has outgoing connections. Must be initially set to 0 -int *d_n_target_hosts; // [n_nodes] -// cumulative sum of d_n_target_hosts -int *d_n_target_hosts_cumul; // [n_nodes+1] - -// Global array with remote target hosts indexes of all nodes -// target_host_array[total_num] where total_num is the sum -// of n_target_hosts[i_node] on all nodes -int *d_target_host_array; -// pointer to the starting position in target_host_array -// of the target hosts for the node i_node -int **d_node_target_hosts; // [i_node] - -// Global array with remote target hosts map indexes of all nodes -// target_host_i_map[total_num] where total_num is the sum -// of n_target_hosts[i_node] on all nodes -int *d_target_host_i_map; -// pointer to the starting position in target_host_i_map array -// of the target host map indexes for the node i_node -int **d_node_target_host_i_map; // [i_node] - -// node map index -int **d_node_map_index; // [i_node] - -// Define a boolean array with one boolean value for each connection rule -// - true if the rule always creates at least one outgoing 
connection -// from each source node (one_to_one, all_to_all, fixed_outdegree) -// - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli) -bool *use_all_source_nodes; // [n_connection_rules]: - -__constant__ int n_local_nodes; // number of local nodes - -// Allocate GPU memory for new remote-source-node-map blocks -int allocRemoteSourceNodeMapBlocks(std::vector &i_remote_src_node_map, - std::vector &i_local_spike_buf_map, - int64_t block_size, uint new_n_block) -{ - // allocate new blocks if needed - for (uint ib=i_remote_src_node_map.size(); ib &i_local_src_node_map, - int64_t block_size, uint new_n_block) -{ - // allocate new blocks if needed - for (uint ib=i_local_src_node_map.size(); ib rsn_map; - h_remote_source_node_map.push_back(rsn_map); - - std::vector lsb_map; - h_local_spike_buffer_map.push_back(lsb_map); - - std::vector lsn_map; - h_local_source_node_map.push_back(lsn_map); + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_conn ) + { + return; } - + inode_t i_source = conn_source_ids[ i_conn ]; + // it is not necessary to use atomic operation. See: + // https://stackoverflow.com/questions/8416374/several-threads-writing-the-same-value-in-the-same-global-memory-location + // printf("i_conn: %ld\t i_source: %d\n", i_conn, i_source); - // launch kernel to copy pointers to CUDA variables ?? maybe in calibration? - // ..... 
- //RemoteConnectionMapInitKernel // <<< , >>> - // (d_n_remote_source_node_map, - // d_remote_source_node_map, - // d_local_spike_buffer_map, - // d_n_local_source_node_map, - // d_local_source_node_map); - - return 0; + source_node_flag[ i_source ] = 1; } - -__global__ void setTargetHostArrayNodePointersKernel -(int *target_host_array, int *target_host_i_map, int *n_target_hosts_cumul, - int **node_target_hosts, int **node_target_host_i_map, int n_nodes) +__global__ void +setTargetHostArrayNodePointersKernel( uint* target_host_array, + uint* target_host_i_map, + uint* n_target_hosts_cumul, + uint** node_target_hosts, + uint** node_target_host_i_map, + uint n_nodes ) { uint i_node = threadIdx.x + blockIdx.x * blockDim.x; - if (i_node>=n_nodes) return; - node_target_hosts[i_node] = target_host_array + n_target_hosts_cumul[i_node]; - node_target_host_i_map[i_node] = target_host_i_map - + n_target_hosts_cumul[i_node]; + if ( i_node >= n_nodes ) + { + return; + } + node_target_hosts[ i_node ] = target_host_array + n_target_hosts_cumul[ i_node ]; + node_target_host_i_map[ i_node ] = target_host_i_map + n_target_hosts_cumul[ i_node ]; } - // kernel that fills the arrays target_host_array // and target_host_i_map using the node map -__global__ void fillTargetHostArrayFromMapKernel -(int **node_map, int n_node_map, int *count_mapped, int **node_target_hosts, - int **node_target_host_i_map, int n_nodes, int i_target_host) +__global__ void +fillTargetHostArrayFromMapKernel( uint** node_map, + uint n_node_map, + uint* count_mapped, + uint** node_target_hosts, + uint** node_target_host_i_map, + uint n_nodes, + uint i_target_host ) { uint i_node = threadIdx.x + blockIdx.x * blockDim.x; - if (i_node>=n_nodes) return; - int i_block; - int i_in_block; + if ( i_node >= n_nodes ) + { + return; + } + uint i_block; + uint i_in_block; // check if node index is in map - bool mapped = checkIfValueIsIn2DArr(i_node, node_map, - n_node_map, node_map_block_size, - &i_block, &i_in_block); + 
bool mapped = checkIfValueIsIn2DArr( i_node, node_map, n_node_map, node_map_block_size, &i_block, &i_in_block ); // If it is mapped - if (mapped) { - int i_node_map = i_block*node_map_block_size + i_in_block; - int pos = count_mapped[i_node]++; - node_target_host_i_map[i_node][pos] = i_node_map; - node_target_hosts[i_node][pos] = i_target_host; + if ( mapped ) + { + uint i_node_map = i_block * node_map_block_size + i_in_block; + uint pos = count_mapped[ i_node ]++; + node_target_host_i_map[ i_node ][ pos ] = i_node_map; + node_target_hosts[ i_node ][ pos ] = i_target_host; } } - -// Calibrate the maps -int NESTGPU::RemoteConnectionMapCalibrate(int i_host, int n_hosts) -{ - //std::cout << "In RemoteConnectionMapCalibrate " << i_host << " " - // << n_hosts << "\n"; - // vector of pointers to local source node maps in device memory - // per target host hd_local_source_node_map[target_host] - // type std::vector - // set its size and initialize to NULL - hd_local_source_node_map.resize(n_hosts, NULL); - // number of elements in each local source node map - // h_n_local_source_node_map[target_host] - // set its size and initialize to 0 - h_n_local_source_node_map.resize(n_hosts, 0); - // vector of pointers to local spike buffer maps in device memory - // per source host hd_local_spike_buffer_map[source_host] - // type std::vector - // set its size and initialize to NULL - hd_local_spike_buffer_map.resize(n_hosts, NULL); - // number of elements in each remote-source-node->local-spike-buffer map - // h_n_remote_source_node_map[source_host] - // set its size and initialize to 0 - h_n_remote_source_node_map.resize(n_hosts, 0); - // loop on target hosts, skip self host - for (int tg_host=0; tg_host0) { - CUDAMALLOCCTRL("&hd_local_source_node_map[tg_host]",&hd_local_source_node_map[tg_host], - n_blocks*sizeof(int*)); - gpuErrchk(cudaMemcpy(hd_local_source_node_map[tg_host], - &h_local_source_node_map[tg_host][0], - n_blocks*sizeof(int*), - cudaMemcpyHostToDevice)); - } - } - 
} - // allocate d_local_source_node_map and copy it from host to device - CUDAMALLOCCTRL("&d_local_source_node_map",&d_local_source_node_map, n_hosts*sizeof(int**)); - gpuErrchk(cudaMemcpy(d_local_source_node_map, &hd_local_source_node_map[0], - n_hosts*sizeof(int**), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpyToSymbol(local_source_node_map, - &d_local_source_node_map, sizeof(int***))); - - // loop on source hosts, skip self host - for (int src_host=0; src_host0) { - CUDAMALLOCCTRL("&hd_local_spike_buffer_map[src_host]",&hd_local_spike_buffer_map[src_host], - n_blocks*sizeof(int*)); - gpuErrchk(cudaMemcpy(hd_local_spike_buffer_map[src_host], - &h_local_spike_buffer_map[src_host][0], - n_blocks*sizeof(int*), - cudaMemcpyHostToDevice)); - } - } - } - // allocate d_local_spike_buffer_map and copy it from host to device - CUDAMALLOCCTRL("&d_local_spike_buffer_map",&d_local_spike_buffer_map, n_hosts*sizeof(int**)); - gpuErrchk(cudaMemcpy(d_local_spike_buffer_map, &hd_local_spike_buffer_map[0], - n_hosts*sizeof(int**), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpyToSymbol(local_spike_buffer_map, - &d_local_spike_buffer_map, sizeof(int***))); - -#ifdef CHECKRC - //// TEMPORARY, FOR CHECK - std::cout << "////////////////////////////////////////\n"; - std::cout << "IN MAP CALIBRATION\n"; - - int tmp_n_hosts = 2; - int tmp_tg_host = 0; - int tmp_src_host = 1; - - int **tmp_pt2[tmp_n_hosts]; - int tmp_n[tmp_n_hosts]; - int tmp_map[h_node_map_block_size]; - int n_map; - int n_blocks; - - gpuErrchk(cudaMemcpy(tmp_n, d_n_local_source_node_map, - tmp_n_hosts*sizeof(int), cudaMemcpyDeviceToHost)); - n_map = tmp_n[tmp_tg_host]; - if (n_map>0) { - std::cout << "////////////////////////////////////////\n"; - std::cout << "Local Source Node Map\n"; - std::cout << "target host: " << tmp_tg_host << "\n"; - std::cout << "n_local_source_node_map: " << n_map << "\n"; - gpuErrchk(cudaMemcpy(tmp_pt2, d_local_source_node_map, - tmp_n_hosts*sizeof(int**), cudaMemcpyDeviceToHost)); - - 
n_blocks = (n_map - 1) / h_node_map_block_size + 1; - std::cout << "n_blocks: " << n_blocks << "\n"; - int *tmp_pt1[n_blocks]; - gpuErrchk(cudaMemcpy(tmp_pt1, tmp_pt2[tmp_tg_host], - n_blocks*sizeof(int*), cudaMemcpyDeviceToHost)); - - for (int ib=0; ib0) { - std::cout << "////////////////////////////////////////\n"; - std::cout << "Local Spike Buffer Map\n"; - std::cout << "source host: " << tmp_src_host << "\n"; - std::cout << "n_local_spike_buffer_map: " << n_map << "\n"; - gpuErrchk(cudaMemcpy(tmp_pt2, d_local_spike_buffer_map, - tmp_n_hosts*sizeof(int**), cudaMemcpyDeviceToHost)); - - n_blocks = (n_map - 1) / h_node_map_block_size + 1; - std::cout << "n_blocks: " << n_blocks << "\n"; - int *tmp_pt1[n_blocks]; - gpuErrchk(cudaMemcpy(tmp_pt1, tmp_pt2[tmp_src_host], - n_blocks*sizeof(int*), cudaMemcpyDeviceToHost)); - - for (int ib=0; ib>> - (d_node_map, n_node_map, d_n_target_hosts, n_nodes); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - } - } - -#ifdef CHECKRC - // TEMPORARY, FOR TESTING - int h_n_target_hosts[n_nodes]; - gpuErrchk(cudaMemcpy(h_n_target_hosts, d_n_target_hosts, - n_nodes*sizeof(int), cudaMemcpyDeviceToHost)); - std::cout << "////////////////////////////////////////\n"; - std::cout << "i_node, n_target_hosts\n"; - for (int i_node=0; i_node>> - (d_target_host_array, d_target_host_i_map, d_n_target_hosts_cumul, - d_node_target_hosts, d_node_target_host_i_map, n_nodes); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - - // reset to 0 d_n_target_hosts[n_nodes] to reuse it in the next kernel - gpuErrchk(cudaMemset(d_n_target_hosts, 0, n_nodes*sizeof(int))); - - // Loop on target hosts - for (int tg_host=0; tg_host>> - (d_node_map, n_node_map, d_n_target_hosts, d_node_target_hosts, - d_node_target_host_i_map, n_nodes, tg_host); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - } - } - -#ifdef CHECKRC - // TEMPORARY, FOR TESTING - std::cout << 
"////////////////////////////////////////\n"; - std::cout << "Checking node_target_hosts and node_target_host_i_map\n"; - int *hd_node_target_hosts[n_nodes]; - int *hd_node_target_host_i_map[n_nodes]; - int h_node_target_hosts[n_hosts]; - int h_node_target_host_i_map[n_hosts]; - gpuErrchk(cudaMemcpy(h_n_target_hosts, d_n_target_hosts, - n_nodes*sizeof(int), cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(hd_node_target_hosts, d_node_target_hosts, - n_nodes*sizeof(int*), cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(hd_node_target_host_i_map, d_node_target_host_i_map, - n_nodes*sizeof(int*), cudaMemcpyDeviceToHost)); - for (int i_node=0; i_node=n_conn) return; - int i_source = key_subarray[i_conn] >> MaxPortSynNBits; - // it is not necessary to use atomic operation. See: - // https://stackoverflow.com/questions/8416374/several-threads-writing-the-same-value-in-the-same-global-memory-location -#ifdef CHECKRC - printf("i_conn: %ld\t i_source: %d\n", i_conn, i_source); -#endif - source_node_flag[i_source] = 1; -} - -// Loop on all new connections and set source_node_flag[i_source]=true -int setUsedSourceNodes(std::vector &key_subarray, - int64_t old_n_conn, int64_t n_conn, - int64_t block_size, int *d_source_node_flag) -{ - uint64_t n_new_conn = n_conn - old_n_conn; // number of new connections - -#ifdef CHECKRC - ////////////////////////////////////////////////////////////////////// - std::cout << "n_new_conn: " << n_new_conn - << "\tn_conn: " << n_conn - << "\told_n_conn: " << old_n_conn << "\n"; -////////////////////////////////////////////////////////////////////// -#endif - - uint ib0 = (uint)(old_n_conn / block_size); // first block index - uint ib1 = (uint)((n_conn - 1) / block_size); // last block - for (uint ib=ib0; ib<=ib1; ib++) { // loop on blocks - uint64_t n_block_conn; // number of connections in a block - uint64_t i_conn0; // index of first connection in a block - if (ib1 == ib0) { // all connections are in the same block - i_conn0 = old_n_conn % 
block_size; - n_block_conn = n_new_conn; - } - else if (ib == ib0) { // first block - i_conn0 = old_n_conn % block_size; - n_block_conn = block_size - i_conn0; - } - else if (ib == ib1) { // last block - i_conn0 = 0; - n_block_conn = (n_conn - 1) % block_size + 1; - } - else { - i_conn0 = 0; - n_block_conn = block_size; - } - - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - std::cout << "n_new_conn: " << n_new_conn - << "\ti_conn0: " << i_conn0 - << "\tn_block_conn: " << n_block_conn << "\n"; -#endif - ////////////////////////////////////////////////////////////////////// - - setUsedSourceNodeKernel<<<(n_block_conn+1023)/1024, 1024>>> - (key_subarray[ib] + i_conn0, n_block_conn, d_source_node_flag); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - } - return 0; -} - - // kernel that counts source nodes actually used in new connections -__global__ void countUsedSourceNodeKernel(uint n_source, - int *n_used_source_nodes, - int *source_node_flag) +__global__ void +countUsedSourceNodeKernel( uint n_source, uint* n_used_source_nodes, uint* source_node_flag ) { uint i_source = threadIdx.x + blockIdx.x * blockDim.x; - if (i_source>=n_source) return; + if ( i_source >= n_source ) + { + return; + } // Count how many source_node_flag are true using atomic increase // on n_used_source_nodes - if (source_node_flag[i_source] != 0) { - atomicAdd(n_used_source_nodes, 1); + if ( source_node_flag[ i_source ] != 0 ) + { + atomicAdd( n_used_source_nodes, 1 ); } } - -// device function that checks if an int value is in a sorted 2d-array +// device function that checks if an int value is in a sorted 2d-array // assuming that the entries in the 2d-array are sorted. 
// The 2d-array is divided in noncontiguous blocks of size block_size -__device__ bool checkIfValueIsIn2DArr(int value, int **arr, int n_elem, - int block_size, int *i_block, - int *i_in_block) +__device__ bool +checkIfValueIsIn2DArr( uint value, uint** arr, uint n_elem, uint block_size, uint* i_block, uint* i_in_block ) { // If the array is empty surely the value is not contained in it - if (n_elem<=0) { + if ( n_elem <= 0 ) + { return false; } // determine number of blocks in array - int n_blocks = (n_elem - 1) / block_size + 1; + uint n_blocks = ( n_elem - 1 ) / block_size + 1; // determine number of elements in last block - int n_last = (n_elem - 1) % block_size + 1; + uint n_last = ( n_elem - 1 ) % block_size + 1; // check if value is between the minimum and the maximum in the map - if (valuearr[n_blocks-1][n_last-1]) { + if ( value < arr[ 0 ][ 0 ] || value > arr[ n_blocks - 1 ][ n_last - 1 ] ) + { return false; } - for (int ib=0; ib value) { // the array is sorted, so in this case - return false; // value cannot be in the following elements + for ( uint ib = 0; ib < n_blocks; ib++ ) + { + if ( arr[ ib ][ 0 ] > value ) + { // the array is sorted, so in this case + return false; // value cannot be in the following elements } - int n = block_size; - if (ib==n_blocks-1) { // the last block can be not completely full + uint n = block_size; + if ( ib == n_blocks - 1 ) + { // the last block can be not completely full n = n_last; } // search value in the block - int pos = locate(value, arr[ib], n); + int pos = locate< uint, int >( value, arr[ ib ], ( int ) n ); // if value is in the block return true - if (pos>=0 && pos= 0 && pos < n && arr[ ib ][ pos ] == value ) + { *i_block = ib; *i_in_block = pos; return true; } } return false; // value not found -} - +} // kernel that searches node indexes in map // increase counter of mapped nodes -__global__ void searchNodeIndexInMapKernel -( - int **node_map, - int n_node_map, - int *count_mapped, // i.e. 
*n_target_hosts for our application - int n_node) +__global__ void +searchNodeIndexInMapKernel( uint** node_map, + uint n_node_map, + uint* count_mapped, // i.e. *n_target_hosts for our application + uint n_node ) { uint i_node = threadIdx.x + blockIdx.x * blockDim.x; - if (i_node>=n_node) return; - int i_block; - int i_in_block; + if ( i_node >= n_node ) + { + return; + } + uint i_block; + uint i_in_block; // check if node index is in map - bool mapped = checkIfValueIsIn2DArr(i_node, node_map, - n_node_map, node_map_block_size, - &i_block, &i_in_block); + bool mapped = checkIfValueIsIn2DArr( i_node, node_map, n_node_map, node_map_block_size, &i_block, &i_in_block ); // If it is mapped - if (mapped) { + if ( mapped ) + { // i_node_map = i_block*node_map_block_size + i_in_block; - count_mapped[i_node]++; + count_mapped[ i_node ]++; } } // kernel that searches node indexes not in map // flags nodes not yet mapped and counts them -__global__ void searchNodeIndexNotInMapKernel -( - int **node_map, - int n_node_map, - int *sorted_node_index, - bool *node_to_map, - int *n_node_to_map, - int n_node) +__global__ void +searchNodeIndexNotInMapKernel( uint** node_map, + uint n_node_map, + uint* sorted_node_index, + bool* node_to_map, + uint* n_node_to_map, + uint n_node ) { uint i_node = threadIdx.x + blockIdx.x * blockDim.x; - if (i_node>=n_node) return; + if ( i_node >= n_node ) + { + return; + } // Check for sorted_node_index unique values: // - either if it is the first of the array (i_node = 0) // - or it is different from previous - int node_index = sorted_node_index[i_node]; - if (i_node==0 || node_index!=sorted_node_index[i_node-1]) { - int i_block; - int i_in_block; - bool mapped = checkIfValueIsIn2DArr(node_index, node_map, - n_node_map, node_map_block_size, - &i_block, &i_in_block); + uint node_index = sorted_node_index[ i_node ]; + if ( i_node == 0 || node_index != sorted_node_index[ i_node - 1 ] ) + { + uint i_block; + uint i_in_block; + bool mapped = 
checkIfValueIsIn2DArr( node_index, node_map, n_node_map, node_map_block_size, &i_block, &i_in_block ); // If it is not in the map then flag it to be mapped // and atomic increase n_new_source_node_map - if (!mapped) { - node_to_map[i_node] = true; - atomicAdd(n_node_to_map, 1); + if ( !mapped ) + { + node_to_map[ i_node ] = true; + atomicAdd( n_node_to_map, 1 ); } } } - // kernel that checks if nodes are already in map // if not insert them in the map // In the target host unmapped remote source nodes must be mapped // to local nodes from n_nodes to n_nodes + n_node_to_map -__global__ void insertNodesInMapKernel -( - int **node_map, - int **spike_buffer_map, - int spike_buffer_map_i0, - int old_n_node_map, - int *sorted_node_index, - bool *node_to_map, - int *i_node_to_map, - int n_node) +__global__ void +insertNodesInMapKernel( uint** node_map, + uint** spike_buffer_map, + uint spike_buffer_map_i0, + uint old_n_node_map, + uint* sorted_node_index, + bool* node_to_map, + uint* i_node_to_map, + uint n_node ) { uint i_node = threadIdx.x + blockIdx.x * blockDim.x; // if thread is out of range or node is already mapped, return - if (i_node>=n_node || !node_to_map[i_node]) return; + if ( i_node >= n_node || !node_to_map[ i_node ] ) + { + return; + } // node has to be inserted in the map // get and atomically increase index of node to be mapped - int pos = atomicAdd(i_node_to_map, 1); - int i_node_map = old_n_node_map + pos; - int i_block = i_node_map / node_map_block_size; - int i = i_node_map % node_map_block_size; - node_map[i_block][i] = sorted_node_index[i_node]; - if (spike_buffer_map != NULL) { - spike_buffer_map[i_block][i] = spike_buffer_map_i0 + pos; + uint pos = atomicAdd( i_node_to_map, 1 ); + uint i_node_map = old_n_node_map + pos; + uint i_block = i_node_map / node_map_block_size; + uint i = i_node_map % node_map_block_size; + node_map[ i_block ][ i ] = sorted_node_index[ i_node ]; + if ( spike_buffer_map != nullptr ) + { + spike_buffer_map[ i_block ][ i ] 
= spike_buffer_map_i0 + pos; } } -// kernel that replaces the source node index -// in a new remote connection of a given block -// source_node[i_conn] with the value of the element pointed by the -// index itself in the array local_node_index -__global__ void fixConnectionSourceNodeIndexesKernel(uint *key_subarray, - int64_t n_conn, - int *local_node_index) +__global__ void +MapIndexToSpikeBufferKernel( uint n_hosts, uint* host_offset, uint* node_index ) { - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) return; - int i_source = key_subarray[i_conn] >> MaxPortSynNBits; - int i_delay = key_subarray[i_conn] & PortSynMask; - int new_i_source = local_node_index[i_source]; - - key_subarray[i_conn] = (new_i_source << MaxPortSynNBits) | i_delay; - -#ifdef CHECKRC - printf("i_conn: %ld\t new_i_source: %d\n", i_conn, new_i_source); -#endif - -} - -// Loops on all new connections and replaces the source node index -// source_node[i_conn] with the value of the element pointed by the -// index itself in the array local_node_index -int fixConnectionSourceNodeIndexes(std::vector &key_subarray, - int64_t old_n_conn, int64_t n_conn, - int64_t block_size, - int *d_local_node_index) -{ - uint64_t n_new_conn = n_conn - old_n_conn; // number of new connections - -#ifdef CHECKRC - ////////////////////////////////////////////////////////////////////// - std::cout << "Fixing source node indexes in new remote connections\n"; - std::cout << "n_new_conn: " << n_new_conn - << "\tn_conn: " << n_conn - << "\told_n_conn: " << old_n_conn << "\n"; - ////////////////////////////////////////////////////////////////////// -#endif - - uint ib0 = (uint)(old_n_conn / block_size); // first block index - uint ib1 = (uint)((n_conn - 1) / block_size); // last block - for (uint ib=ib0; ib<=ib1; ib++) { // loop on blocks - uint64_t n_block_conn; // number of connections in a block - uint64_t i_conn0; // index of first connection in a block - if (ib1 == ib0) { // all 
connections are in the same block - i_conn0 = old_n_conn % block_size; - n_block_conn = n_new_conn; - } - else if (ib == ib0) { // first block - i_conn0 = old_n_conn % block_size; - n_block_conn = block_size - i_conn0; + const uint i_host = blockIdx.x; + if ( i_host < n_hosts ) + { + const uint pos = host_offset[ i_host ]; + const uint num = host_offset[ i_host + 1 ] - pos; + for ( uint i_elem = threadIdx.x; i_elem < num; i_elem += blockDim.x ) + { + const uint i_node_map = node_index[ pos + i_elem ]; + const uint i_block = i_node_map / node_map_block_size; + const uint i = i_node_map % node_map_block_size; + const uint i_spike_buffer = local_spike_buffer_map[ i_host ][ i_block ][ i ]; + node_index[ pos + i_elem ] = i_spike_buffer; } - else if (ib == ib1) { // last block - i_conn0 = 0; - n_block_conn = (n_conn - 1) % block_size + 1; - } - else { - i_conn0 = 0; - n_block_conn = block_size; - } - -#ifdef CHECKRC - ////////////////////////////////////////////////////////////////////// - std::cout << "n_new_conn: " << n_new_conn - << "\ti_conn0: " << i_conn0 - << "\tn_block_conn: " << n_block_conn << "\n"; - ////////////////////////////////////////////////////////////////////// -#endif - - - fixConnectionSourceNodeIndexesKernel<<<(n_block_conn+1023)/1024, 1024>>> - (key_subarray[ib] + i_conn0, n_block_conn, d_local_node_index); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - } - return 0; -} - -__global__ void MapIndexToSpikeBufferKernel(int n_hosts, int *host_offset, - int *node_index) -{ - const int i_host = blockIdx.x; - if (i_host < n_hosts) { - const int pos = host_offset[i_host]; - const int num = host_offset[i_host+1] - pos; - for (int i_elem = threadIdx.x; i_elem < num; i_elem += blockDim.x) { - const int i_node_map = node_index[pos + i_elem]; - const int i_block = i_node_map / node_map_block_size; - const int i = i_node_map % node_map_block_size; - const int i_spike_buffer = local_spike_buffer_map[i_host][i_block][i]; - 
node_index[pos + i_elem] = i_spike_buffer; - } - } -} - -__global__ void addOffsetToExternalNodeIdsKernel -(int64_t n_conn, uint *key_subarray, connection_struct *conn_subarray, - int i_image_node_0) -{ - int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) return; - uint target_port_syn = conn_subarray[i_conn].target_port_syn; - if (target_port_syn & (1 << (MaxPortSynNBits - 1))) { - target_port_syn = target_port_syn ^ (1 << (MaxPortSynNBits - 1)); - conn_subarray[i_conn].target_port_syn = target_port_syn; - key_subarray[i_conn] += (i_image_node_0 << MaxPortSynNBits); } } -int NESTGPU::addOffsetToExternalNodeIds() +__global__ void +addOffsetToSpikeBufferMapKernel( uint i_host, uint n_node_map, uint i_image_node_0 ) { - int64_t block_size = h_ConnBlockSize; - int n_blocks = (NConn - 1) / block_size + 1; - int i_image_node_0 = GetNLocalNodes(); - - for (int ib=0; ib>> - (n_block_conn, KeySubarray[ib], ConnectionSubarray[ib], i_image_node_0); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - } - - - ///////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "After addOffsetToExternalNodeIds\n"; - uint h_source_delay[NConn]; - int h_source[NConn]; - int h_delay[NConn]; - gpuErrchk(cudaMemcpy(h_source_delay, KeySubarray[0], - NConn*sizeof(uint), cudaMemcpyDeviceToHost)); - for (int i=0; i> h_MaxPortNBits; - h_delay[i] = h_source_delay[i] & h_PortMask; - std::cout << "i_conn: " << i << " source: " << h_source[i]; - std::cout << " delay: " << h_delay[i] << "\n"; + uint i_node_map = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_node_map >= n_node_map ) + { + return; } -#endif - ////////////////////////////// - - - return 0; -} -__global__ void addOffsetToSpikeBufferMapKernel(int i_host, int n_node_map, - int i_image_node_0) -{ - int i_node_map = threadIdx.x + blockIdx.x * blockDim.x; - if (i_node_map>=n_node_map) return; - - const int i_block = 
i_node_map / node_map_block_size; - const int i = i_node_map % node_map_block_size; - local_spike_buffer_map[i_host][i_block][i] += i_image_node_0; -} - -int NESTGPU::addOffsetToSpikeBufferMap() -{ - int i_image_node_0 = GetNLocalNodes(); - - for (int i_host=0; i_host 0) { - addOffsetToSpikeBufferMapKernel<<<(n_node_map+1023)/1024, 1024>>> - (i_host, n_node_map, i_image_node_0); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - } - } - } - - return 0; + const uint i_block = i_node_map / node_map_block_size; + const uint i = i_node_map % node_map_block_size; + local_spike_buffer_map[ i_host ][ i_block ][ i ] += i_image_node_0; } diff --git a/src/remote_connect.h b/src/remote_connect.h index 9f96e7210..75c2188e3 100644 --- a/src/remote_connect.h +++ b/src/remote_connect.h @@ -1,769 +1,993 @@ -//#define CHECKRC #ifndef REMOTECONNECTH #define REMOTECONNECTH +// #include #include -#include -#include "nestgpu.h" +// #include "nestgpu.h" #include "connect.h" #include "copass_sort.h" +#include "cuda_error.h" // Arrays that map remote source nodes to local spike buffers - + // The map is organized in blocks having block size: -extern __constant__ uint node_map_block_size; // = 100000; -extern uint h_node_map_block_size; // = 100000; +extern __constant__ uint node_map_block_size; // = 100000; // number of elements in the map for each source host // n_remote_source_node_map[i_source_host] // with i_source_host = 0, ..., n_hosts-1 excluding this host itself -extern __device__ uint *n_remote_source_node_map; // [n_hosts]; -extern uint *d_n_remote_source_node_map; +extern __device__ uint* n_remote_source_node_map; // [n_hosts]; // remote_source_node_map[i_source_host][i_block][i] -extern std::vector< std::vector > h_remote_source_node_map; -extern __device__ int ***remote_source_node_map; +extern __device__ uint*** remote_source_node_map; // local_spike_buffer_map[i_source_host][i_block][i] -extern std::vector< std::vector > 
h_local_spike_buffer_map; -extern __device__ int ***local_spike_buffer_map; -extern int ***d_local_spike_buffer_map; +extern __device__ uint*** local_spike_buffer_map; // Arrays that map local source nodes to remote spike buffers // number of elements in the map for each target host // n_local_source_node_map[i_target_host] // with i_target_host = 0, ..., n_hosts-1 excluding this host itself -extern __device__ uint *n_local_source_node_map; // [n_hosts]; -extern uint *d_n_local_source_node_map; +extern __device__ uint* n_local_source_node_map; // [n_hosts]; // local_source_node_map[i_target_host][i_block][i] -extern std::vector< std::vector > h_local_source_node_map; -extern __device__ int ***local_source_node_map; -extern int ***d_local_source_node_map; - -// number of remote target hosts on which each local node -//has outgoing connections -extern int *d_n_target_hosts; // [n_nodes] -// target hosts for the node i_node -extern int **d_node_target_hosts; // [i_node] -// target host map indexes for the node i_node -extern int **d_node_target_host_i_map; // [i_node] - -// Boolean array with one boolean value for each connection rule -// - true if the rule always creates at least one outgoing connection -// from each source node (one_to_one, all_to_all, fixed_outdegree) -// - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli) -extern bool *use_all_source_nodes; // [n_connection_rules]: - -extern __constant__ int n_local_nodes; // number of local nodes - -// device function that checks if an int value is in a sorted 2d-array +extern __device__ uint*** local_source_node_map; + +extern __constant__ uint n_local_nodes; // number of local nodes + +// device function that checks if an int value is in a sorted 2d-array // assuming that the entries in the 2d-array are sorted. 
// The 2d-array is divided in noncontiguous blocks of size block_size -__device__ bool checkIfValueIsIn2DArr(int value, int **arr, int n_elem, - int block_size, int *i_block, - int *i_in_block); - -// Initialize the maps -int RemoteConnectionMapInit(uint n_hosts); +__device__ bool +checkIfValueIsIn2DArr( uint value, uint** arr, uint n_elem, uint block_size, uint* i_block, uint* i_in_block ); + +template < class ConnKeyT > +// kernel that flags source nodes used in at least one new connection +// of a given block +__global__ void +setUsedSourceNodeKernel( ConnKeyT* conn_key_subarray, int64_t n_conn, uint* source_node_flag ) +{ + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_conn ) + { + return; + } + inode_t i_source = getConnSource< ConnKeyT >( conn_key_subarray[ i_conn ] ); + // it is not necessary to use atomic operation. See: + // https://stackoverflow.com/questions/8416374/several-threads-writing-the-same-value-in-the-same-global-memory-location + // printf("i_conn: %ld\t i_source: %d\n", i_conn, i_source); -// Allocate GPU memory for new remote-source-node-map blocks -int allocRemoteSourceNodeMapBlocks(std::vector &i_remote_src_node_map, - std::vector &i_local_spike_buf_map, - int64_t block_size, uint new_n_block); + source_node_flag[ i_source ] = 1; +} -// Allocate GPU memory for new local-source-node-map blocks -int allocLocalSourceNodeMapBlocks(std::vector &i_local_src_node_map, - int64_t block_size, uint new_n_block); - -int setUsedSourceNodes(std::vector &key_subarray, - int64_t old_n_conn, int64_t n_conn, - int64_t block_size, int *d_source_node_flag); - -// kernel that fills the arrays of nodes actually used by new connections -template -__global__ void getUsedSourceNodeIndexKernel(T source, uint n_source, - int *n_used_source_nodes, - int *source_node_flag, - int *u_source_node_idx, - int *i_source_arr) +// kernel that flags source nodes used in at least one new connection +// of a given block +__global__ void 
setUsedSourceNodeOnSourceHostKernel( inode_t* conn_source_ids, int64_t n_conn, uint* source_node_flag ); + +// kernel that fills the arrays of nodes actually used by new connections +template < class T > +__global__ void +getUsedSourceNodeIndexKernel( T source, + uint n_source, + uint* n_used_source_nodes, + uint* source_node_flag, + uint* u_source_node_idx, + uint* i_source_arr ) { uint i_source = threadIdx.x + blockIdx.x * blockDim.x; - if (i_source>=n_source) return; + if ( i_source >= n_source ) + { + return; + } // Count how many source_node_flag are true using atomic increase // on n_used_source_nodes - if (source_node_flag[i_source] != 0) { - int pos = atomicAdd(n_used_source_nodes, 1); - u_source_node_idx[pos] = GetNodeIndex(source, i_source); - i_source_arr[pos] = i_source; + if ( source_node_flag[ i_source ] != 0 ) + { + uint pos = atomicAdd( n_used_source_nodes, 1 ); + u_source_node_idx[ pos ] = getNodeIndex( source, i_source ); + i_source_arr[ pos ] = i_source; } } // kernel that counts source nodes actually used in new connections -__global__ void countUsedSourceNodeKernel(uint n_source, - int *n_used_source_nodes, - int *source_node_flag); - +__global__ void countUsedSourceNodeKernel( uint n_source, uint* n_used_source_nodes, uint* source_node_flag ); // kernel that searches source node indexes in the map, // and set local_node_index -template -__global__ void setLocalNodeIndexKernel(T source, uint n_source, - int *source_node_flag, - int **node_map, - int **spike_buffer_map, - int n_node_map, - int *local_node_index - ) +template < class T > +__global__ void +setLocalNodeIndexKernel( T source, + uint n_source, + uint* source_node_flag, + uint** node_map, + uint** spike_buffer_map, + uint n_node_map, + uint* local_node_index ) { uint i_source = threadIdx.x + blockIdx.x * blockDim.x; - if (i_source>=n_source) return; + if ( i_source >= n_source ) + { + return; + } // Count how many source_node_flag are true using atomic increase // on 
n_used_source_nodes - if (source_node_flag[i_source] != 0) { - int node_index = GetNodeIndex(source, i_source); - int i_block; - int i_in_block; - bool mapped = checkIfValueIsIn2DArr(node_index, node_map, - n_node_map, node_map_block_size, - &i_block, &i_in_block); - if (!mapped) { - printf("Error in setLocalNodeIndexKernel: node index not mapped\n"); + if ( source_node_flag[ i_source ] != 0 ) + { + uint node_index = getNodeIndex( source, i_source ); + uint i_block; + uint i_in_block; + bool mapped = checkIfValueIsIn2DArr( node_index, node_map, n_node_map, node_map_block_size, &i_block, &i_in_block ); + if ( !mapped ) + { + printf( "Error in setLocalNodeIndexKernel: node index not mapped\n" ); return; } - int i_spike_buffer = spike_buffer_map[i_block][i_in_block]; - local_node_index[i_source] = i_spike_buffer; + uint i_spike_buffer = spike_buffer_map[ i_block ][ i_in_block ]; + local_node_index[ i_source ] = i_spike_buffer; } } -// Loops on all new connections and replaces the source node index +// kernel that replaces the source node index +// in a new remote connection of a given block // source_node[i_conn] with the value of the element pointed by the // index itself in the array local_node_index -int fixConnectionSourceNodeIndexes(std::vector &key_subarray, - int64_t old_n_conn, int64_t n_conn, - int64_t block_size, - int *d_local_node_index); +template < class ConnKeyT > +__global__ void +fixConnectionSourceNodeIndexesKernel( ConnKeyT* conn_key_subarray, int64_t n_conn, uint* local_node_index ) +{ + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_conn ) + { + return; + } + uint i_source = getConnSource< ConnKeyT >( conn_key_subarray[ i_conn ] ); + uint i_delay = getConnDelay< ConnKeyT >( conn_key_subarray[ i_conn ] ); + uint new_i_source = local_node_index[ i_source ]; -// REMOTE CONNECT FUNCTION -template -int NESTGPU::_RemoteConnect(int this_host, - int source_host, T1 source, int n_source, - int target_host, T2 target, int 
n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) + setConnSource< ConnKeyT >( conn_key_subarray[ i_conn ], new_i_source ); + + // printf("i_conn: %ld\t new_i_source: %d\n", i_conn, new_i_source); +} + +// kernel that searches node indexes in map +// increase counter of mapped nodes +__global__ void searchNodeIndexInMapKernel( uint** node_map, + uint n_node_map, + uint* count_mapped, // i.e. *n_target_hosts for our application + uint n_node ); + +// kernel that searches node indexes in map +// flags nodes not yet mapped and counts them +__global__ void searchNodeIndexNotInMapKernel( uint** node_map, + uint n_node_map, + uint* sorted_node_index, + bool* node_to_map, + uint* n_node_to_map, + uint n_node ); + +// kernel that checks if nodes are already in map +// if not insert them in the map +// In the target host unmapped remote source nodes must be mapped +// to local nodes from n_nodes to n_nodes + n_node_to_map +__global__ void insertNodesInMapKernel( uint** node_map, + uint** spike_buffer_map, + uint spike_buffer_map_i0, + uint old_n_node_map, + uint* sorted_node_index, + bool* node_to_map, + uint* i_node_to_map, + uint n_node ); + +template < class ConnKeyT, class ConnStructT > +__global__ void +addOffsetToExternalNodeIdsKernel( int64_t n_conn, + ConnKeyT* conn_key_subarray, + ConnStructT* conn_struct_subarray, + uint i_image_node_0 ) { - if (source_host<0 || source_host>=n_hosts_) { - throw ngpu_exception("Source host index out of range in _RemoteConnect"); + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_conn ) + { + return; } - if (target_host<0 || target_host>=n_hosts_) { - throw ngpu_exception("Target host index out of range in _RemoteConnect"); + // uint target_port_syn = conn_subarray[i_conn].target_port_syn; + // if (target_port_syn & (1 << (MaxPortSynNBits - 1))) { + // target_port_syn = target_port_syn ^ (1 << (MaxPortSynNBits - 1)); + // conn_subarray[i_conn].target_port_syn = target_port_syn; + // key_subarray[i_conn] 
+= (i_image_node_0 << MaxPortSynNBits); + uint remote_flag = + getConnRemoteFlag< ConnKeyT, ConnStructT >( conn_key_subarray[ i_conn ], conn_struct_subarray[ i_conn ] ); + if ( remote_flag == 1 ) + { + // IN THE FUTURE KEEP IT!!!!!!!!!!!!!!!!!!!!!!!!!! + clearConnRemoteFlag< ConnKeyT, ConnStructT >( conn_key_subarray[ i_conn ], conn_struct_subarray[ i_conn ] ); + uint i_source = getConnSource< ConnKeyT >( conn_key_subarray[ i_conn ] ); + i_source += i_image_node_0; + setConnSource< ConnKeyT >( conn_key_subarray[ i_conn ], i_source ); } - if (this_host<0 || this_host>=n_hosts_) { - throw ngpu_exception("this_host index out of range in _RemoteConnect"); +} + +__global__ void MapIndexToSpikeBufferKernel( uint n_hosts, uint* host_offset, uint* node_index ); + +// Allocate GPU memory for new remote-source-node-map blocks +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::allocRemoteSourceNodeMapBlocks( + std::vector< uint* >& i_remote_src_node_map, + std::vector< uint* >& i_local_spike_buf_map, + uint new_n_block ) +{ + // allocate new blocks if needed + for ( uint ib = i_remote_src_node_map.size(); ib < new_n_block; ib++ ) + { + uint* d_remote_src_node_blk_pt; + uint* d_local_spike_buf_blk_pt; + // allocate GPU memory for new blocks + CUDAMALLOCCTRL( "&d_remote_src_node_blk_pt", &d_remote_src_node_blk_pt, node_map_block_size_ * sizeof( uint ) ); + CUDAMALLOCCTRL( "&d_local_spike_buf_blk_pt", &d_local_spike_buf_blk_pt, node_map_block_size_ * sizeof( uint ) ); + + i_remote_src_node_map.push_back( d_remote_src_node_blk_pt ); + i_local_spike_buf_map.push_back( d_local_spike_buf_blk_pt ); } - // Check if it is a local connection - if (this_host==source_host && source_host==target_host) { - return _Connect(source, n_source, target, n_target, - conn_spec, syn_spec); + return 0; +} + +// Allocate GPU memory for new local-source-node-map blocks +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, 
ConnStructT >::allocLocalSourceNodeMapBlocks( std::vector< uint* >& i_local_src_node_map, + uint new_n_block ) +{ + // allocate new blocks if needed + for ( uint ib = i_local_src_node_map.size(); ib < new_n_block; ib++ ) + { + uint* d_local_src_node_blk_pt; + // allocate GPU memory for new blocks + CUDAMALLOCCTRL( "&d_local_src_node_blk_pt", &d_local_src_node_blk_pt, node_map_block_size_ * sizeof( uint ) ); + + i_local_src_node_map.push_back( d_local_src_node_blk_pt ); } - // Check if target_host matches this_host - else if (this_host==target_host) { - return _RemoteConnectSource(source_host, source, n_source, - target, n_target, conn_spec, syn_spec); + + return 0; +} + +// Loop on all new connections and set source_node_flag[i_source]=true +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setUsedSourceNodes( int64_t old_n_conn, uint* d_source_node_flag ) +{ + int64_t n_new_conn = n_conn_ - old_n_conn; // number of new connections + + uint ib0 = ( uint ) ( old_n_conn / conn_block_size_ ); // first block index + uint ib1 = ( uint ) ( ( n_conn_ - 1 ) / conn_block_size_ ); // last block + for ( uint ib = ib0; ib <= ib1; ib++ ) + { // loop on blocks + int64_t n_block_conn; // number of connections in a block + int64_t i_conn0; // index of first connection in a block + if ( ib1 == ib0 ) + { // all connections are in the same block + i_conn0 = old_n_conn % conn_block_size_; + n_block_conn = n_new_conn; + } + else if ( ib == ib0 ) + { // first block + i_conn0 = old_n_conn % conn_block_size_; + n_block_conn = conn_block_size_ - i_conn0; + } + else if ( ib == ib1 ) + { // last block + i_conn0 = 0; + n_block_conn = ( n_conn_ - 1 ) % conn_block_size_ + 1; + } + else + { + i_conn0 = 0; + n_block_conn = conn_block_size_; + } + + setUsedSourceNodeKernel< ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, n_block_conn, d_source_node_flag ); + CUDASYNC; } - // Check if source_host matches 
this_host - else if (this_host==source_host) { - return _RemoteConnectTarget(target_host, source, n_source, - target, n_target, conn_spec, syn_spec); + return 0; +} + +// Loop on all new connections and set source_node_flag[i_source]=true +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::setUsedSourceNodesOnSourceHost( int64_t old_n_conn, + uint* d_source_node_flag ) +{ + int64_t n_new_conn = n_conn_ - old_n_conn; // number of new connections + + setUsedSourceNodeOnSourceHostKernel<<< ( n_new_conn + 1023 ) / 1024, 1024 >>>( + d_conn_source_ids_, n_new_conn, d_source_node_flag ); + CUDASYNC; + + return 0; +} + +__global__ void setTargetHostArrayNodePointersKernel( uint* target_host_array, + uint* target_host_i_map, + uint* n_target_hosts_cumul, + uint** node_target_hosts, + uint** node_target_host_i_map, + uint n_nodes ); + +// kernel that fills the arrays target_host_array +// and target_host_i_map using the node map +__global__ void fillTargetHostArrayFromMapKernel( uint** node_map, + uint n_node_map, + uint* count_mapped, + uint** node_target_hosts, + uint** node_target_host_i_map, + uint n_nodes, + uint i_target_host ); + +__global__ void addOffsetToSpikeBufferMapKernel( uint i_host, uint n_node_map, uint i_image_node_0 ); + +// Initialize the maps for n_hosts hosts +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectionMapInit() +{ + node_map_block_size_ = 10000; // initialize node map block size + + cudaMemcpyToSymbol( node_map_block_size, &node_map_block_size_, sizeof( uint ) ); + + // allocate and init to 0 n. of elements in the map for each source host + CUDAMALLOCCTRL( "&d_n_remote_source_node_map_", &d_n_remote_source_node_map_, n_hosts_ * sizeof( uint ) ); + gpuErrchk( cudaMemset( d_n_remote_source_node_map_, 0, n_hosts_ * sizeof( uint ) ) ); + + // allocate and init to 0 n. 
of elements in the map for each source host + CUDAMALLOCCTRL( "&d_n_local_source_node_map_", &d_n_local_source_node_map_, n_hosts_ * sizeof( uint ) ); + gpuErrchk( cudaMemset( d_n_local_source_node_map_, 0, n_hosts_ * sizeof( uint ) ) ); + + // initialize maps + for ( int i_host = 0; i_host < n_hosts_; i_host++ ) + { + std::vector< uint* > rsn_map; + h_remote_source_node_map_.push_back( rsn_map ); + + std::vector< uint* > lsb_map; + h_local_spike_buffer_map_.push_back( lsb_map ); + + std::vector< uint* > lsn_map; + h_local_source_node_map_.push_back( lsn_map ); } - + + // launch kernel to copy pointers to CUDA variables ?? maybe in calibration? + // ..... + // RemoteConnectionMapInitKernel // <<< , >>> + // (d_n_remote_source_node_map_, + // d_remote_source_node_map, + // d_local_spike_buffer_map, + // d_n_local_source_node_map_, + // d_local_source_node_map); + return 0; } -template -int NESTGPU::_RemoteConnect -(int this_host, int source_host, int source, int n_source, - int target_host, int target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - -template -int NESTGPU::_RemoteConnect -(int this_host, int source_host, int source, int n_source, - int target_host, int *target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - -template -int NESTGPU::_RemoteConnect -(int this_host, int source_host, int *source, int n_source, - int target_host, int target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - -template -int NESTGPU::_RemoteConnect -(int this_host, int source_host, int *source, int n_source, - int target_host, int *target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); - - -template -int NESTGPU::_RemoteConnect(int source_host, T1 source, int n_source, - int target_host, T2 target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) +// Loops on all new connections and replaces the source node index +// source_node[i_conn] with the value of the element pointed by the +// index itself in the array local_node_index 
+template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::fixConnectionSourceNodeIndexes( int64_t old_n_conn, + uint* d_local_node_index ) { - return _RemoteConnect(this_host_, source_host, source, n_source, - target_host, target, n_target, - conn_spec, syn_spec); + int64_t n_new_conn = n_conn_ - old_n_conn; // number of new connections + + uint ib0 = ( uint ) ( old_n_conn / conn_block_size_ ); // first block index + uint ib1 = ( uint ) ( ( n_conn_ - 1 ) / conn_block_size_ ); // last block + for ( uint ib = ib0; ib <= ib1; ib++ ) + { // loop on blocks + int64_t n_block_conn; // number of connections in a block + int64_t i_conn0; // index of first connection in a block + if ( ib1 == ib0 ) + { // all connections are in the same block + i_conn0 = old_n_conn % conn_block_size_; + n_block_conn = n_new_conn; + } + else if ( ib == ib0 ) + { // first block + i_conn0 = old_n_conn % conn_block_size_; + n_block_conn = conn_block_size_ - i_conn0; + } + else if ( ib == ib1 ) + { // last block + i_conn0 = 0; + n_block_conn = ( n_conn_ - 1 ) % conn_block_size_ + 1; + } + else + { + i_conn0 = 0; + n_block_conn = conn_block_size_; + } + + fixConnectionSourceNodeIndexesKernel< ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, n_block_conn, d_local_node_index ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); + } + return 0; } -template -int NESTGPU::_RemoteConnect -(int source_host, int source, int n_source, - int target_host, int target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); +// Calibrate the maps +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectionMapCalibrate( inode_t n_nodes ) +{ + // std::cout << "In RemoteConnectionMapCalibrate " << i_host << " " + // << n_hosts_ << "\n"; + // vector of pointers to local source node maps in device memory + // per target host 
hd_local_source_node_map[target_host] + // type std::vector + // set its size and initialize to NULL + hd_local_source_node_map_.resize( n_hosts_, NULL ); + // number of elements in each local source node map + // h_n_local_source_node_map[target_host] + // set its size and initialize to 0 + h_n_local_source_node_map_.resize( n_hosts_, 0 ); + // vector of pointers to local spike buffer maps in device memory + // per source host hd_local_spike_buffer_map[source_host] + // type std::vector + // set its size and initialize to NULL + hd_local_spike_buffer_map_.resize( n_hosts_, NULL ); + // number of elements in each remote-source-node->local-spike-buffer map + // h_n_remote_source_node_map[source_host] + // set its size and initialize to 0 + h_n_remote_source_node_map_.resize( n_hosts_, 0 ); + // loop on target hosts, skip self host + for ( int tg_host = 0; tg_host < n_hosts_; tg_host++ ) + { + if ( tg_host != this_host_ ) + { + // get number of elements in each map from device memory + uint n_node_map; + gpuErrchk( + cudaMemcpy( &n_node_map, &d_n_local_source_node_map_[ tg_host ], sizeof( uint ), cudaMemcpyDeviceToHost ) ); + // put it in h_n_local_source_node_map[tg_host] + h_n_local_source_node_map_[ tg_host ] = n_node_map; + // Allocate array of local source node map blocks + // and copy their address from host to device + hd_local_source_node_map_[ tg_host ] = NULL; + uint n_blocks = h_local_source_node_map_[ tg_host ].size(); + if ( n_blocks > 0 ) + { + CUDAMALLOCCTRL( + "&hd_local_source_node_map[tg_host]", &hd_local_source_node_map_[ tg_host ], n_blocks * sizeof( uint* ) ); + gpuErrchk( cudaMemcpy( hd_local_source_node_map_[ tg_host ], + &h_local_source_node_map_[ tg_host ][ 0 ], + n_blocks * sizeof( uint* ), + cudaMemcpyHostToDevice ) ); + } + } + } + // allocate d_local_source_node_map and copy it from host to device + CUDAMALLOCCTRL( "&d_local_source_node_map", &d_local_source_node_map_, n_hosts_ * sizeof( uint** ) ); + gpuErrchk( cudaMemcpy( + 
d_local_source_node_map_, &hd_local_source_node_map_[ 0 ], n_hosts_ * sizeof( uint** ), cudaMemcpyHostToDevice ) ); + gpuErrchk( cudaMemcpyToSymbol( local_source_node_map, &d_local_source_node_map_, sizeof( uint*** ) ) ); + + // loop on source hosts, skip self host + for ( int src_host = 0; src_host < n_hosts_; src_host++ ) + { + if ( src_host != this_host_ ) + { + // get number of elements in each map from device memory + uint n_node_map; + gpuErrchk( + cudaMemcpy( &n_node_map, &d_n_remote_source_node_map_[ src_host ], sizeof( uint ), cudaMemcpyDeviceToHost ) ); + // put it in h_n_remote_source_node_map[src_host] + h_n_remote_source_node_map_[ src_host ] = n_node_map; + // Allocate array of local spike buffer map blocks + // and copy their address from host to device + uint n_blocks = h_local_spike_buffer_map_[ src_host ].size(); + hd_local_spike_buffer_map_[ src_host ] = NULL; + if ( n_blocks > 0 ) + { + CUDAMALLOCCTRL( "&hd_local_spike_buffer_map_[src_host]", + &hd_local_spike_buffer_map_[ src_host ], + n_blocks * sizeof( uint* ) ); + gpuErrchk( cudaMemcpy( hd_local_spike_buffer_map_[ src_host ], + &h_local_spike_buffer_map_[ src_host ][ 0 ], + n_blocks * sizeof( uint* ), + cudaMemcpyHostToDevice ) ); + } + } + } + // allocate d_local_spike_buffer_map and copy it from host to device + CUDAMALLOCCTRL( "&d_local_spike_buffer_map_", &d_local_spike_buffer_map_, n_hosts_ * sizeof( uint** ) ); + gpuErrchk( cudaMemcpy( d_local_spike_buffer_map_, + &hd_local_spike_buffer_map_[ 0 ], + n_hosts_ * sizeof( uint** ), + cudaMemcpyHostToDevice ) ); + gpuErrchk( cudaMemcpyToSymbol( local_spike_buffer_map, &d_local_spike_buffer_map_, sizeof( uint*** ) ) ); + + // uint n_nodes = GetNLocalNodes(); // number of nodes + // n_target_hosts[i_node] is the number of remote target hosts + // on which each local node + // has outgoing connections + // allocate d_n_target_hosts[n_nodes] and init to 0 + // std::cout << "allocate d_n_target_hosts n_nodes: " << n_nodes << "\n"; + 
CUDAMALLOCCTRL( "&d_n_target_hosts_", &d_n_target_hosts_, n_nodes * sizeof( uint ) ); + // std::cout << "d_n_target_hosts: " << d_n_target_hosts_ << "\n"; + gpuErrchk( cudaMemset( d_n_target_hosts_, 0, n_nodes * sizeof( uint ) ) ); + // allocate d_n_target_hosts_cumul[n_nodes+1] + // representing the prefix scan (cumulative sum) of d_n_target_hosts + CUDAMALLOCCTRL( "&d_n_target_hosts_cumul_", &d_n_target_hosts_cumul_, ( n_nodes + 1 ) * sizeof( uint ) ); + + // For each local node, count the number of remote target hosts + // on which it has outgoing connections, i.e. n_target_hosts[i_node] + // Loop on target hosts + for ( int tg_host = 0; tg_host < n_hosts_; tg_host++ ) + { + if ( tg_host != this_host_ ) + { + uint** d_node_map = hd_local_source_node_map_[ tg_host ]; + uint n_node_map = h_n_local_source_node_map_[ tg_host ]; + // Launch kernel that searches each node in the map + // of local source nodes having outgoing connections to target host + // if found, increase n_target_hosts[i_node] + searchNodeIndexInMapKernel<<< ( n_nodes + 1023 ) / 1024, 1024 >>>( + d_node_map, n_node_map, d_n_target_hosts_, n_nodes ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); + } + } -template -int NESTGPU::_RemoteConnect -(int source_host, int source, int n_source, - int target_host, int *target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); + ////////////////////////////////////////////////////////////////////// + // Evaluate exclusive sum of reverse connections per target node + // Determine temporary device storage requirements + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + //// + cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, d_n_target_hosts_, d_n_target_hosts_cumul_, n_nodes + 1 ); + //// -template -int NESTGPU::_RemoteConnect -(int source_host, int *source, int n_source, - int target_host, int target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); + // Allocate temporary storage 
+ CUDAMALLOCCTRL( "&d_temp_storage", &d_temp_storage, temp_storage_bytes ); + // Run exclusive prefix sum + //// + cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, d_n_target_hosts_, d_n_target_hosts_cumul_, n_nodes + 1 ); + //// + + CUDAFREECTRL( "d_temp_storage", d_temp_storage ); + // The last element is the sum of all elements of n_target_hosts + uint n_target_hosts_sum; + gpuErrchk( + cudaMemcpy( &n_target_hosts_sum, &d_n_target_hosts_cumul_[ n_nodes ], sizeof( uint ), cudaMemcpyDeviceToHost ) ); -template -int NESTGPU::_RemoteConnect -(int source_host, int *source, int n_source, - int target_host, int *target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec); + ////////////////////////////////////////////////////////////////////// + // allocate global array with remote target hosts of all nodes + CUDAMALLOCCTRL( "&d_target_host_array_", &d_target_host_array_, n_target_hosts_sum * sizeof( uint ) ); + // allocate global array with remote target hosts map index + CUDAMALLOCCTRL( "&d_target_host_i_map_", &d_target_host_i_map_, n_target_hosts_sum * sizeof( uint ) ); + // allocate array of pointers to the starting position in target_host array + // of the target hosts for each node + CUDAMALLOCCTRL( "&d_node_target_hosts_", &d_node_target_hosts_, n_nodes * sizeof( uint* ) ); + // allocate array of pointers to the starting position in target_host_i_map + // of the target hosts map indexes for each node + CUDAMALLOCCTRL( "&d_node_target_host_i_map_", &d_node_target_host_i_map_, n_nodes * sizeof( uint* ) ); + // Launch kernel to evaluate the pointers d_node_target_hosts + // and d_node_target_host_i_map from the positions in target_host_array + // given by n_target_hosts_cumul + setTargetHostArrayNodePointersKernel<<< ( n_nodes + 1023 ) / 1024, 1024 >>>( d_target_host_array_, + d_target_host_i_map_, + d_n_target_hosts_cumul_, + d_node_target_hosts_, + d_node_target_host_i_map_, + n_nodes ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( 
cudaDeviceSynchronize() ); + // reset to 0 d_n_target_hosts[n_nodes] to reuse it in the next kernel + gpuErrchk( cudaMemset( d_n_target_hosts_, 0, n_nodes * sizeof( uint ) ) ); + + // Loop on target hosts + for ( int tg_host = 0; tg_host < n_hosts_; tg_host++ ) + { + if ( tg_host != this_host_ ) + { + uint** d_node_map = hd_local_source_node_map_[ tg_host ]; + uint n_node_map = h_n_local_source_node_map_[ tg_host ]; + // Launch kernel to fill the arrays target_host_array + // and target_host_i_map using the node map + fillTargetHostArrayFromMapKernel<<< ( n_nodes + 1023 ) / 1024, 1024 >>>( + d_node_map, n_node_map, d_n_target_hosts_, d_node_target_hosts_, d_node_target_host_i_map_, n_nodes, tg_host ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); + } + } + addOffsetToSpikeBufferMap( n_nodes ); + return 0; +} -// kernel that searches node indexes in map -// increase counter of mapped nodes -__global__ void searchNodeIndexInMapKernel -( - int **node_map, - int n_node_map, - int *count_mapped, // i.e. 
*n_target_hosts for our application - int n_node); +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::addOffsetToSpikeBufferMap( inode_t n_nodes ) +{ + uint i_image_node_0 = n_nodes; + + for ( int i_host = 0; i_host < n_hosts_; i_host++ ) + { + if ( i_host != this_host_ ) + { + uint n_node_map = h_n_remote_source_node_map_[ i_host ]; + if ( n_node_map > 0 ) + { + addOffsetToSpikeBufferMapKernel<<< ( n_node_map + 1023 ) / 1024, 1024 >>>( i_host, n_node_map, i_image_node_0 ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); + } + } + } -// kernel that searches node indexes in map -// flags nodes not yet mapped and counts them -__global__ void searchNodeIndexNotInMapKernel -( - int **node_map, - int n_node_map, - int *sorted_node_index, - bool *node_to_map, - int *n_node_to_map, - int n_node); + return 0; +} +// REMOTE CONNECT FUNCTION +template < class ConnKeyT, class ConnStructT > +template < class T1, class T2 > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::_RemoteConnect( int source_host, + T1 source, + inode_t n_source, + int target_host, + T2 target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) +{ + if ( source_host >= n_hosts_ ) + { + throw ngpu_exception( "Source host index out of range in _RemoteConnect" ); + } + if ( target_host >= n_hosts_ ) + { + throw ngpu_exception( "Target host index out of range in _RemoteConnect" ); + } + if ( this_host_ >= n_hosts_ ) + { + throw ngpu_exception( "this_host index out of range in _RemoteConnect" ); + } -// kernel that checks if nodes are already in map -// if not insert them in the map -// In the target host unmapped remote source nodes must be mapped -// to local nodes from n_nodes to n_nodes + n_node_to_map -__global__ void insertNodesInMapKernel -( - int **node_map, - int **spike_buffer_map, - int spike_buffer_map_i0, - int old_n_node_map, - int *sorted_node_index, - bool *node_to_map, - int *i_node_to_map, - int 
n_node); + // Check if it is a local connection + if ( this_host_ == source_host && source_host == target_host ) + { + _Connect< T1, T2 >( source, n_source, target, n_target, conn_spec, syn_spec ); + } + // Check if target_host matches this_host + else if ( this_host_ == target_host ) + { + remoteConnectSource< T1, T2 >( source_host, source, n_source, target, n_target, conn_spec, syn_spec ); + } + // Check if source_host matches this_host + else if ( this_host_ == source_host ) + { + remoteConnectTarget< T1, T2 >( target_host, source, n_source, target, n_target, conn_spec, syn_spec ); + } + + return 0; +} + +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::addOffsetToExternalNodeIds( uint n_local_nodes ) +{ + uint n_blocks = ( n_conn_ - 1 ) / conn_block_size_ + 1; + // uint i_image_node_0 = getNLocalNodes(); + uint i_image_node_0 = n_local_nodes; + + for ( uint ib = 0; ib < n_blocks; ib++ ) + { + int64_t n_block_conn = conn_block_size_; // number of connections in the block + if ( ib == n_blocks - 1 ) + { // last block + n_block_conn = ( n_conn_ - 1 ) % conn_block_size_ + 1; + } + addOffsetToExternalNodeIdsKernel< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + n_block_conn, conn_key_vect_[ ib ], ( ConnStructT* ) conn_struct_vect_[ ib ], i_image_node_0 ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaDeviceSynchronize() ); + } + + return 0; +} // REMOTE CONNECT FUNCTION for target_host matching this_host -template -int NESTGPU::_RemoteConnectSource(int source_host, T1 source, int n_source, - T2 target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) +template < class ConnKeyT, class ConnStructT > +template < class T1, class T2 > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectSource( int source_host, + T1 source, + inode_t n_source, + T2 target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { // n_nodes will be the first index for new mapping 
of remote source nodes // to local spike buffers - //int spike_buffer_map_i0 = GetNNode(); - int spike_buffer_map_i0 = n_image_nodes_; - syn_spec.port_ = syn_spec.port_ | - (1 << (h_MaxPortSynNBits - h_MaxSynNBits - 1)); - + // int spike_buffer_map_i0 = GetNNode(); + uint spike_buffer_map_i0 = n_image_nodes_; + // syn_spec.port_ = syn_spec.port_ | + // (1 << (h_MaxPortSynNBits - max_syn_nbits_ - 1)); + syn_spec.syn_group_ = syn_spec.syn_group_ | ( 1 << max_syn_nbits_ ); + // check if the flag UseAllSourceNodes[conn_rule] is false // if (!use_all_source_nodes_flag) { - + // on both the source and target hosts create a temporary array // of booleans having size equal to the number of source nodes - - int *d_source_node_flag; // [n_source] // each element is initially false - CUDAMALLOCCTRL("&d_source_node_flag",&d_source_node_flag, n_source*sizeof(int)); - //std::cout << "d_source_node_flag: " << d_source_node_flag << "\n"; - gpuErrchk(cudaMemset(d_source_node_flag, 0, n_source*sizeof(int))); - + + uint* d_source_node_flag; // [n_source] // each element is initially false + CUDAMALLOCCTRL( "&d_source_node_flag", &d_source_node_flag, n_source * sizeof( uint ) ); + // std::cout << "d_source_node_flag: " << d_source_node_flag << "\n"; + gpuErrchk( cudaMemset( d_source_node_flag, 0, n_source * sizeof( uint ) ) ); + // on the target hosts create a temporary array of integers having size // equal to the number of source nodes - - int *d_local_node_index; // [n_source]; // only on target host - CUDAMALLOCCTRL("&d_local_node_index",&d_local_node_index, n_source*sizeof(int)); - - int64_t old_n_conn = NConn; + + uint* d_local_node_index; // [n_source]; // only on target host + CUDAMALLOCCTRL( "&d_local_node_index", &d_local_node_index, n_source * sizeof( uint ) ); + + int64_t old_n_conn = n_conn_; // The connect command is performed on both source and target host using // the same initial seed and using as source node indexes the integers // from 0 to n_source_nodes - 1 - 
_Connect(conn_random_generator_[source_host][this_host_], - 0, n_source, target, n_target, - conn_spec, syn_spec); - if (NConn == old_n_conn) { + _Connect< inode_t, T2 >( + conn_random_generator_[ source_host ][ this_host_ ], 0, n_source, target, n_target, conn_spec, syn_spec, false ); + if ( n_conn_ == old_n_conn ) + { return 0; } - ///////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - uint h_source_delay[NConn]; - int h_source[NConn]; - int h_delay[NConn]; - gpuErrchk(cudaMemcpy(h_source_delay, KeySubarray[0], - NConn*sizeof(uint), cudaMemcpyDeviceToHost)); - for (int i=0; i> h_MaxPortNBits; - h_delay[i] = h_source_delay[i] & h_PortMask; - std::cout << "i_conn: " << i << " source: " << h_source[i]; - std::cout << " delay: " << h_delay[i] << "\n"; - } -#endif - ////////////////////////////// - // flag source nodes used in at least one new connection // Loop on all new connections and set source_node_flag[i_source]=true - setUsedSourceNodes(KeySubarray, old_n_conn, NConn, h_ConnBlockSize, - d_source_node_flag); + setUsedSourceNodes( old_n_conn, d_source_node_flag ); - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "n_source: " << n_source << "\n"; - int h_source_node_flag[n_source]; - //std::cout << "d_source_node_flag: " << d_source_node_flag << "\n"; - - gpuErrchk(cudaMemcpy(h_source_node_flag, d_source_node_flag, - n_source*sizeof(int), cudaMemcpyDeviceToHost)); - - for (int i=0; i>> - (n_source, d_n_used_source_nodes, d_source_node_flag); + countUsedSourceNodeKernel<<< ( n_source + 1023 ) / 1024, 1024 >>>( + n_source, d_n_used_source_nodes, d_source_node_flag ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); // copy result from GPU to CPU memory - int n_used_source_nodes; - gpuErrchk(cudaMemcpy(&n_used_source_nodes, d_n_used_source_nodes, - sizeof(int), cudaMemcpyDeviceToHost)); - -#ifdef CHECKRC 
- // TEMPORARY - std::cout << "n_used_source_nodes: " << n_used_source_nodes << "\n"; - // -#endif - + uint n_used_source_nodes; + gpuErrchk( cudaMemcpy( &n_used_source_nodes, d_n_used_source_nodes, sizeof( uint ), cudaMemcpyDeviceToHost ) ); + // Define and allocate arrays of size n_used_source_nodes - int *d_unsorted_source_node_index; // [n_used_source_nodes]; - int *d_sorted_source_node_index; // [n_used_source_nodes]; + uint* d_unsorted_source_node_index; // [n_used_source_nodes]; + uint* d_sorted_source_node_index; // [n_used_source_nodes]; // i_source_arr are the positions in the arrays source_node_flag - // and local_node_index - int *d_i_unsorted_source_arr; // [n_used_source_nodes]; - int *d_i_sorted_source_arr; // [n_used_source_nodes]; - bool *d_source_node_index_to_be_mapped; //[n_used_source_nodes]; // initially false - CUDAMALLOCCTRL("&d_unsorted_source_node_index",&d_unsorted_source_node_index, - n_used_source_nodes*sizeof(int)); - CUDAMALLOCCTRL("&d_sorted_source_node_index",&d_sorted_source_node_index, - n_used_source_nodes*sizeof(int)); - CUDAMALLOCCTRL("&d_i_unsorted_source_arr",&d_i_unsorted_source_arr, - n_used_source_nodes*sizeof(int)); - CUDAMALLOCCTRL("&d_i_sorted_source_arr",&d_i_sorted_source_arr, - n_used_source_nodes*sizeof(int)); - CUDAMALLOCCTRL("&d_source_node_index_to_be_mapped",&d_source_node_index_to_be_mapped, - n_used_source_nodes*sizeof(int8_t)); + // and local_node_index + uint* d_i_unsorted_source_arr; // [n_used_source_nodes]; + uint* d_i_sorted_source_arr; // [n_used_source_nodes]; + bool* d_source_node_index_to_be_mapped; //[n_used_source_nodes]; // initially + // false + CUDAMALLOCCTRL( + "&d_unsorted_source_node_index", &d_unsorted_source_node_index, n_used_source_nodes * sizeof( uint ) ); + CUDAMALLOCCTRL( "&d_sorted_source_node_index", &d_sorted_source_node_index, n_used_source_nodes * sizeof( uint ) ); + CUDAMALLOCCTRL( "&d_i_unsorted_source_arr", &d_i_unsorted_source_arr, n_used_source_nodes * sizeof( uint ) ); + 
CUDAMALLOCCTRL( "&d_i_sorted_source_arr", &d_i_sorted_source_arr, n_used_source_nodes * sizeof( uint ) ); + CUDAMALLOCCTRL( + "&d_source_node_index_to_be_mapped", &d_source_node_index_to_be_mapped, n_used_source_nodes * sizeof( int8_t ) ); // source_node_index_to_be_mapped is initially false - gpuErrchk(cudaMemset(d_source_node_index_to_be_mapped, 0, - n_used_source_nodes*sizeof(int8_t))); - - // Fill the arrays of nodes actually used by new connections + gpuErrchk( cudaMemset( d_source_node_index_to_be_mapped, 0, n_used_source_nodes * sizeof( int8_t ) ) ); + + // Fill the arrays of nodes actually used by new connections // Reset n_used_source_nodes to 0 - gpuErrchk(cudaMemset(d_n_used_source_nodes, 0, sizeof(int))); + gpuErrchk( cudaMemset( d_n_used_source_nodes, 0, sizeof( uint ) ) ); // Launch kernel to fill the arrays - getUsedSourceNodeIndexKernel<<<(n_source+1023)/1024, 1024>>> - (source, n_source, d_n_used_source_nodes, d_source_node_flag, - d_unsorted_source_node_index, d_i_unsorted_source_arr); + getUsedSourceNodeIndexKernel<<< ( n_source + 1023 ) / 1024, 1024 >>>( source, + n_source, + d_n_used_source_nodes, + d_source_node_flag, + d_unsorted_source_node_index, + d_i_unsorted_source_arr ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "n_used_source_nodes: " << n_used_source_nodes << "\n"; - int h_unsorted_source_node_index[n_used_source_nodes]; - int h_i_unsorted_source_arr[n_used_source_nodes]; - - gpuErrchk(cudaMemcpy(h_unsorted_source_node_index, - d_unsorted_source_node_index, - n_used_source_nodes*sizeof(int), - cudaMemcpyDeviceToHost)); - - gpuErrchk(cudaMemcpy(h_i_unsorted_source_arr, - d_i_unsorted_source_arr, - n_used_source_nodes*sizeof(int), - cudaMemcpyDeviceToHost)); - - for (int i=0; i sorted_source_node_index - // Determine temporary storage requirements for RadixSort - void 
*d_sort_storage = NULL; + void* d_sort_storage = NULL; size_t sort_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(d_sort_storage, sort_storage_bytes, - d_unsorted_source_node_index, - d_sorted_source_node_index, - d_i_unsorted_source_arr, - d_i_sorted_source_arr, - n_used_source_nodes); + //// + cub::DeviceRadixSort::SortPairs( d_sort_storage, + sort_storage_bytes, + d_unsorted_source_node_index, + d_sorted_source_node_index, + d_i_unsorted_source_arr, + d_i_sorted_source_arr, + n_used_source_nodes ); + //// + // Allocate temporary storage - CUDAMALLOCCTRL("&d_sort_storage",&d_sort_storage, sort_storage_bytes); + CUDAMALLOCCTRL( "&d_sort_storage", &d_sort_storage, sort_storage_bytes ); // Run sorting operation - cub::DeviceRadixSort::SortPairs(d_sort_storage, sort_storage_bytes, - d_unsorted_source_node_index, - d_sorted_source_node_index, - d_i_unsorted_source_arr, - d_i_sorted_source_arr, - n_used_source_nodes); - - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - int h_sorted_source_node_index[n_used_source_nodes]; - int h_i_sorted_source_arr[n_used_source_nodes]; - - gpuErrchk(cudaMemcpy(h_sorted_source_node_index, - d_sorted_source_node_index, - n_used_source_nodes*sizeof(int), - cudaMemcpyDeviceToHost)); - - gpuErrchk(cudaMemcpy(h_i_sorted_source_arr, - d_i_sorted_source_arr, - n_used_source_nodes*sizeof(int), - cudaMemcpyDeviceToHost)); - - for (int i=0; i// + cub::DeviceRadixSort::SortPairs( d_sort_storage, + sort_storage_bytes, + d_unsorted_source_node_index, + d_sorted_source_node_index, + d_i_unsorted_source_arr, + d_i_sorted_source_arr, + n_used_source_nodes ); + //// ////////////////////////////// // Allocate array of remote source node map blocks // and copy their address from host to device - int n_blocks = h_remote_source_node_map[source_host].size(); - int **d_node_map = NULL; - int **d_spike_buffer_map = NULL; + uint n_blocks = h_remote_source_node_map_[ source_host ].size(); 
+ uint** d_node_map = NULL; + uint** d_spike_buffer_map = NULL; // get current number of elements in the map - int n_node_map; - gpuErrchk(cudaMemcpy(&n_node_map, - &d_n_remote_source_node_map[source_host], sizeof(int), - cudaMemcpyDeviceToHost)); - - - if (n_blocks>0) { + uint n_node_map; + gpuErrchk( + cudaMemcpy( &n_node_map, &d_n_remote_source_node_map_[ source_host ], sizeof( uint ), cudaMemcpyDeviceToHost ) ); + + if ( n_blocks > 0 ) + { // check for consistency between number of elements // and number of blocks in the map - int tmp_n_blocks = (n_node_map - 1) / h_node_map_block_size + 1; - if (tmp_n_blocks != n_blocks) { - std::cerr << "Inconsistent number of elements " - << n_node_map << " and number of blocks " - << n_blocks << " in remote_source_node_map\n"; - exit(-1); + uint tmp_n_blocks = ( n_node_map - 1 ) / node_map_block_size_ + 1; + if ( tmp_n_blocks != n_blocks ) + { + std::cerr << "Inconsistent number of elements " << n_node_map << " and number of blocks " << n_blocks + << " in remote_source_node_map\n"; + exit( -1 ); } - CUDAMALLOCCTRL("&d_node_map",&d_node_map, n_blocks*sizeof(int*)); - gpuErrchk(cudaMemcpy(d_node_map, - &h_remote_source_node_map[source_host][0], - n_blocks*sizeof(int*), - cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&d_node_map", &d_node_map, n_blocks * sizeof( uint* ) ); + gpuErrchk( cudaMemcpy( d_node_map, + &h_remote_source_node_map_[ source_host ][ 0 ], + n_blocks * sizeof( uint* ), + cudaMemcpyHostToDevice ) ); } // Allocate boolean array for flagging remote source nodes not yet mapped // and initialize all elements to 0 (false) - bool *d_node_to_map; - CUDAMALLOCCTRL("&d_node_to_map",&d_node_to_map, n_used_source_nodes*sizeof(bool)); - gpuErrchk(cudaMemset(d_node_to_map, 0, n_used_source_nodes*sizeof(bool))); - // Allocate number of nodes to be mapped and initialize it to 0 - int *d_n_node_to_map; - CUDAMALLOCCTRL("&d_n_node_to_map",&d_n_node_to_map, sizeof(int)); - gpuErrchk(cudaMemset(d_n_node_to_map, 0, sizeof(int))); 
+ bool* d_node_to_map; + CUDAMALLOCCTRL( "&d_node_to_map", &d_node_to_map, n_used_source_nodes * sizeof( bool ) ); + gpuErrchk( cudaMemset( d_node_to_map, 0, n_used_source_nodes * sizeof( bool ) ) ); + // Allocate number of nodes to be mapped and initialize it to 0 + uint* d_n_node_to_map; + CUDAMALLOCCTRL( "&d_n_node_to_map", &d_n_node_to_map, sizeof( uint ) ); + gpuErrchk( cudaMemset( d_n_node_to_map, 0, sizeof( uint ) ) ); // launch kernel that searches remote source nodes indexes not in the map, // flags the nodes not yet mapped and counts them - searchNodeIndexNotInMapKernel<<<(n_used_source_nodes+1023)/1024, 1024>>> - (d_node_map, n_node_map, d_sorted_source_node_index, d_node_to_map, - d_n_node_to_map, n_used_source_nodes); + searchNodeIndexNotInMapKernel<<< ( n_used_source_nodes + 1023 ) / 1024, 1024 >>>( + d_node_map, n_node_map, d_sorted_source_node_index, d_node_to_map, d_n_node_to_map, n_used_source_nodes ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - int h_n_node_to_map; - - gpuErrchk(cudaMemcpy(&h_n_node_to_map, d_n_node_to_map, sizeof(int), - cudaMemcpyDeviceToHost)); + uint h_n_node_to_map; - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "n_node_to_map: " << h_n_node_to_map << "\n"; - - bool h_node_to_map[n_used_source_nodes]; - - gpuErrchk(cudaMemcpy(h_node_to_map, d_node_to_map, - n_used_source_nodes*sizeof(bool), - cudaMemcpyDeviceToHost)); - - for (int i=0; i0) { - CUDAFREECTRL("d_node_map",d_node_map); + if ( n_blocks > 0 ) + { + CUDAFREECTRL( "d_node_map", d_node_map ); } - // update number of blocks in the map + // update number of blocks in the map n_blocks = new_n_blocks; // reallocate d_node_map and get it from host - CUDAMALLOCCTRL("&d_node_map",&d_node_map, n_blocks*sizeof(int*)); - gpuErrchk(cudaMemcpy(d_node_map, - &h_remote_source_node_map[source_host][0], - n_blocks*sizeof(int*), - cudaMemcpyHostToDevice)); + 
CUDAMALLOCCTRL( "&d_node_map", &d_node_map, n_blocks * sizeof( uint* ) ); + gpuErrchk( cudaMemcpy( d_node_map, + &h_remote_source_node_map_[ source_host ][ 0 ], + n_blocks * sizeof( uint* ), + cudaMemcpyHostToDevice ) ); } - if (n_blocks > 0) { + if ( n_blocks > 0 ) + { // allocate d_spike_buffer_map and get it from host - CUDAMALLOCCTRL("&d_spike_buffer_map",&d_spike_buffer_map, n_blocks*sizeof(int*)); - gpuErrchk(cudaMemcpy(d_spike_buffer_map, - &h_local_spike_buffer_map[source_host][0], - n_blocks*sizeof(int*), - cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&d_spike_buffer_map", &d_spike_buffer_map, n_blocks * sizeof( uint* ) ); + gpuErrchk( cudaMemcpy( d_spike_buffer_map, + &h_local_spike_buffer_map_[ source_host ][ 0 ], + n_blocks * sizeof( uint* ), + cudaMemcpyHostToDevice ) ); } - + // Map the not-yet-mapped source nodes using a kernel // similar to the one used for counting // In the target host unmapped remote source nodes must be mapped // to local nodes from n_nodes to n_nodes + n_node_to_map - - // Allocate the index of the nodes to be mapped and initialize it to 0 - int *d_i_node_to_map; - CUDAMALLOCCTRL("&d_i_node_to_map",&d_i_node_to_map, sizeof(int)); - gpuErrchk(cudaMemset(d_i_node_to_map, 0, sizeof(int))); + + // Allocate the index of the nodes to be mapped and initialize it to 0 + uint* d_i_node_to_map; + CUDAMALLOCCTRL( "&d_i_node_to_map", &d_i_node_to_map, sizeof( uint ) ); + gpuErrchk( cudaMemset( d_i_node_to_map, 0, sizeof( uint ) ) ); // launch kernel that checks if nodes are already in map // if not insert them in the map // In the target host, put in the map the pair: // (source_node_index, spike_buffer_map_i0 + i_node_to_map) - insertNodesInMapKernel<<<(n_used_source_nodes+1023)/1024, 1024>>> - (d_node_map, d_spike_buffer_map, spike_buffer_map_i0, - n_node_map, d_sorted_source_node_index, d_node_to_map, - d_i_node_to_map, n_used_source_nodes); + insertNodesInMapKernel<<< ( n_used_source_nodes + 1023 ) / 1024, 1024 >>>( d_node_map, + 
d_spike_buffer_map, + spike_buffer_map_i0, + n_node_map, + d_sorted_source_node_index, + d_node_to_map, + d_i_node_to_map, + n_used_source_nodes ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); // update number of elements in remote source node map n_node_map += h_n_node_to_map; - gpuErrchk(cudaMemcpy(&d_n_remote_source_node_map[source_host], - &n_node_map, sizeof(int), cudaMemcpyHostToDevice)); - + gpuErrchk( + cudaMemcpy( &d_n_remote_source_node_map_[ source_host ], &n_node_map, sizeof( uint ), cudaMemcpyHostToDevice ) ); + // check for consistency between number of elements // and number of blocks in the map - int tmp_n_blocks = (n_node_map - 1) / h_node_map_block_size + 1; - if (tmp_n_blocks != n_blocks) { - std::cerr << "Inconsistent number of elements " - << n_node_map << " and number of blocks " - << n_blocks << " in remote_source_node_map\n"; - exit(-1); + uint tmp_n_blocks = ( n_node_map - 1 ) / node_map_block_size_ + 1; + if ( tmp_n_blocks != n_blocks ) + { + std::cerr << "Inconsistent number of elements " << n_node_map << " and number of blocks " << n_blocks + << " in remote_source_node_map\n"; + exit( -1 ); } - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "//////////////////////////////////////////////\n"; - std::cout << "UPDATED UNSORTED MAP\n"; - std::cout << "OF REMOTE-SOURCE_NODES TO LOCAL-SPIKE-BUFFERS\n"; - std::cout << "n_node_map: " << n_node_map << "\n"; - std::cout << "n_blocks: " << n_blocks << "\n"; - std::cout << "block_size: " << h_node_map_block_size << "\n"; - - int block_size = h_node_map_block_size; - int h_node_map_block[block_size]; - int h_spike_buffer_map_block[block_size]; - for (int ib=0; ib(key_subarray, value_subarray, n, + // copass_sort::sort(key_subarray, value_subarray, n, // aux_size, d_storage, storage_bytes); // Determine temporary storage requirements for copass_sort int64_t storage_bytes = 0; - void *d_storage 
= NULL; - copass_sort::sort - (&h_remote_source_node_map[source_host][0], - &h_local_spike_buffer_map[source_host][0], - n_node_map, h_node_map_block_size, d_storage, storage_bytes); - -#ifdef CHECKRC - printf("storage bytes for copass sort: %ld\n", storage_bytes); -#endif - + void* d_storage = NULL; + copass_sort::sort< uint, uint >( &h_remote_source_node_map_[ source_host ][ 0 ], + &h_local_spike_buffer_map_[ source_host ][ 0 ], + n_node_map, + node_map_block_size_, + d_storage, + storage_bytes ); + // Allocate temporary storage - CUDAMALLOCCTRL("&d_storage",&d_storage, storage_bytes); + CUDAMALLOCCTRL( "&d_storage", &d_storage, storage_bytes ); // Run sorting operation - copass_sort::sort - (&h_remote_source_node_map[source_host][0], - &h_local_spike_buffer_map[source_host][0], - n_node_map, h_node_map_block_size, d_storage, storage_bytes); - CUDAFREECTRL("d_storage",d_storage); - - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "//////////////////////////////////////////////\n"; - std::cout << "UPDATED SORTED MAP\n"; - std::cout << "OF REMOTE-SOURCE_NODES TO LOCAL-SPIKE-BUFFERS\n"; - std::cout << "n_node_map: " << n_node_map << "\n"; - std::cout << "n_blocks: " << n_blocks << "\n"; - std::cout << "block_size: " << block_size << "\n"; - - for (int ib=0; ib( &h_remote_source_node_map_[ source_host ][ 0 ], + &h_local_spike_buffer_map_[ source_host ][ 0 ], + n_node_map, + node_map_block_size_, + d_storage, + storage_bytes ); + CUDAFREECTRL( "d_storage", d_storage ); // Launch kernel that searches source node indexes in the map // and set corresponding values of local_node_index - setLocalNodeIndexKernel<<<(n_source+1023)/1024, 1024>>> - (source, n_source, d_source_node_flag, - d_node_map, d_spike_buffer_map, n_node_map, d_local_node_index); + setLocalNodeIndexKernel<<< ( n_source + 1023 ) / 1024, 1024 >>>( + source, n_source, d_source_node_flag, d_node_map, d_spike_buffer_map, 
n_node_map, d_local_node_index ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "n_source: " << n_source << "\n"; - int h_local_node_index[n_source]; - gpuErrchk(cudaMemcpy(h_local_node_index, d_local_node_index, - n_source*sizeof(int), cudaMemcpyDeviceToHost)); - - for (int i=0; i 0) { + // std::cout << "h_n_node_to_map " << h_n_node_to_map <<"\n"; + if ( h_n_node_to_map > 0 ) + { //_Create("image_node", h_n_node_to_map); n_image_nodes_ += h_n_node_to_map; - //std::cout << "n_image_nodes_ " << n_image_nodes_ <<"\n"; + // std::cout << "n_image_nodes_ " << n_image_nodes_ <<"\n"; } - + return 0; } - - // REMOTE CONNECT FUNCTION for source_host matching this_host -template -int NESTGPU::_RemoteConnectTarget(int target_host, T1 source, int n_source, - T2 target, int n_target, - ConnSpec &conn_spec, SynSpec &syn_spec) +template < class ConnKeyT, class ConnStructT > +template < class T1, class T2 > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectTarget( int target_host, + T1 source, + inode_t n_source, + T2 target, + inode_t n_target, + ConnSpec& conn_spec, + SynSpec& syn_spec ) { // check if the flag UseAllSourceNodes[conn_rule] is false // if (!use_all_source_nodes_flag) { - + // on both the source and target hosts create a temporary array // of booleans having size equal to the number of source nodes - - int *d_source_node_flag; // [n_source] // each element is initially false - CUDAMALLOCCTRL("&d_source_node_flag",&d_source_node_flag, n_source*sizeof(int)); - //std::cout << "d_source_node_flag: " << d_source_node_flag << "\n"; - gpuErrchk(cudaMemset(d_source_node_flag, 0, n_source*sizeof(int))); - - int64_t old_n_conn = NConn; + + uint* d_source_node_flag; // [n_source] // each element is initially false + CUDAMALLOCCTRL( "&d_source_node_flag", &d_source_node_flag, n_source * sizeof( uint ) ); + 
// std::cout << "d_source_node_flag: " << d_source_node_flag << "\n"; + gpuErrchk( cudaMemset( d_source_node_flag, 0, n_source * sizeof( uint ) ) ); + + int64_t old_n_conn = n_conn_; // The connect command is performed on both source and target host using // the same initial seed and using as source node indexes the integers // from 0 to n_source_nodes - 1 - _Connect(conn_random_generator_[this_host_][target_host], - 0, n_source, target, n_target, - conn_spec, syn_spec); + _Connect< inode_t, T2 >( + conn_random_generator_[ this_host_ ][ target_host ], 0, n_source, target, n_target, conn_spec, syn_spec, true ); - if (NConn == old_n_conn) { - return 0; - } - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - uint h_source_delay[NConn]; - int h_source[NConn]; - int h_delay[NConn]; - gpuErrchk(cudaMemcpy(h_source_delay, KeySubarray[0], - NConn*sizeof(uint), cudaMemcpyDeviceToHost)); - for (int i=0; i> h_MaxPortNBits; - h_delay[i] = h_source_delay[i] & h_PortMask; - std::cout << "i_conn: " << i << " source: " << h_source[i]; - std::cout << " delay: " << h_delay[i] << "\n"; + if ( n_conn_ == old_n_conn ) + { + return 0; } -#endif - ////////////////////////////// - // flag source nodes used in at least one new connection // Loop on all new connections and set source_node_flag[i_source]=true - setUsedSourceNodes(KeySubarray, old_n_conn, NConn, h_ConnBlockSize, - d_source_node_flag); + setUsedSourceNodesOnSourceHost( old_n_conn, d_source_node_flag ); - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "n_source: " << n_source << "\n"; - int h_source_node_flag[n_source]; - // std::cout << "d_source_node_flag: " << d_source_node_flag << "\n"; - - gpuErrchk(cudaMemcpy(h_source_node_flag, d_source_node_flag, - n_source*sizeof(int), cudaMemcpyDeviceToHost)); - - for (int i=0; i>> - (n_source, d_n_used_source_nodes, d_source_node_flag); + 
countUsedSourceNodeKernel<<< ( n_source + 1023 ) / 1024, 1024 >>>( + n_source, d_n_used_source_nodes, d_source_node_flag ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); // copy result from GPU to CPU memory - int n_used_source_nodes; - gpuErrchk(cudaMemcpy(&n_used_source_nodes, d_n_used_source_nodes, - sizeof(int), cudaMemcpyDeviceToHost)); - -#ifdef CHECKRC - // TEMPORARY - std::cout << "n_used_source_nodes: " << n_used_source_nodes << "\n"; - // -#endif - + uint n_used_source_nodes; + gpuErrchk( cudaMemcpy( &n_used_source_nodes, d_n_used_source_nodes, sizeof( uint ), cudaMemcpyDeviceToHost ) ); + // Define and allocate arrays of size n_used_source_nodes - int *d_unsorted_source_node_index; // [n_used_source_nodes]; - int *d_sorted_source_node_index; // [n_used_source_nodes]; + uint* d_unsorted_source_node_index; // [n_used_source_nodes]; + uint* d_sorted_source_node_index; // [n_used_source_nodes]; // i_source_arr are the positions in the arrays source_node_flag - // and local_node_index - int *d_i_unsorted_source_arr; // [n_used_source_nodes]; - int *d_i_sorted_source_arr; // [n_used_source_nodes]; - bool *d_source_node_index_to_be_mapped; //[n_used_source_nodes]; // initially false - CUDAMALLOCCTRL("&d_unsorted_source_node_index",&d_unsorted_source_node_index, - n_used_source_nodes*sizeof(int)); - CUDAMALLOCCTRL("&d_sorted_source_node_index",&d_sorted_source_node_index, - n_used_source_nodes*sizeof(int)); - CUDAMALLOCCTRL("&d_i_unsorted_source_arr",&d_i_unsorted_source_arr, - n_used_source_nodes*sizeof(int)); - CUDAMALLOCCTRL("&d_i_sorted_source_arr",&d_i_sorted_source_arr, - n_used_source_nodes*sizeof(int)); - CUDAMALLOCCTRL("&d_source_node_index_to_be_mapped",&d_source_node_index_to_be_mapped, - n_used_source_nodes*sizeof(int8_t)); + // and local_node_index + uint* d_i_unsorted_source_arr; // [n_used_source_nodes]; + uint* d_i_sorted_source_arr; // [n_used_source_nodes]; + bool* d_source_node_index_to_be_mapped; 
//[n_used_source_nodes]; // initially + // false + CUDAMALLOCCTRL( + "&d_unsorted_source_node_index", &d_unsorted_source_node_index, n_used_source_nodes * sizeof( uint ) ); + CUDAMALLOCCTRL( "&d_sorted_source_node_index", &d_sorted_source_node_index, n_used_source_nodes * sizeof( uint ) ); + CUDAMALLOCCTRL( "&d_i_unsorted_source_arr", &d_i_unsorted_source_arr, n_used_source_nodes * sizeof( uint ) ); + CUDAMALLOCCTRL( "&d_i_sorted_source_arr", &d_i_sorted_source_arr, n_used_source_nodes * sizeof( uint ) ); + CUDAMALLOCCTRL( + "&d_source_node_index_to_be_mapped", &d_source_node_index_to_be_mapped, n_used_source_nodes * sizeof( int8_t ) ); // source_node_index_to_be_mapped is initially false - gpuErrchk(cudaMemset(d_source_node_index_to_be_mapped, 0, - n_used_source_nodes*sizeof(int8_t))); - - // Fill the arrays of nodes actually used by new connections + gpuErrchk( cudaMemset( d_source_node_index_to_be_mapped, 0, n_used_source_nodes * sizeof( int8_t ) ) ); + + // Fill the arrays of nodes actually used by new connections // Reset n_used_source_nodes to 0 - gpuErrchk(cudaMemset(d_n_used_source_nodes, 0, sizeof(int))); + gpuErrchk( cudaMemset( d_n_used_source_nodes, 0, sizeof( uint ) ) ); // Launch kernel to fill the arrays - getUsedSourceNodeIndexKernel<<<(n_source+1023)/1024, 1024>>> - (source, n_source, d_n_used_source_nodes, d_source_node_flag, - d_unsorted_source_node_index, d_i_unsorted_source_arr); + getUsedSourceNodeIndexKernel<<< ( n_source + 1023 ) / 1024, 1024 >>>( source, + n_source, + d_n_used_source_nodes, + d_source_node_flag, + d_unsorted_source_node_index, + d_i_unsorted_source_arr ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "n_used_source_nodes: " << n_used_source_nodes << "\n"; - int h_unsorted_source_node_index[n_used_source_nodes]; - int h_i_unsorted_source_arr[n_used_source_nodes]; - - 
gpuErrchk(cudaMemcpy(h_unsorted_source_node_index, - d_unsorted_source_node_index, - n_used_source_nodes*sizeof(int), - cudaMemcpyDeviceToHost)); - - gpuErrchk(cudaMemcpy(h_i_unsorted_source_arr, - d_i_unsorted_source_arr, - n_used_source_nodes*sizeof(int), - cudaMemcpyDeviceToHost)); - - for (int i=0; i sorted_source_node_index - // Determine temporary storage requirements for RadixSort - void *d_sort_storage = NULL; + void* d_sort_storage = NULL; size_t sort_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(d_sort_storage, sort_storage_bytes, - d_unsorted_source_node_index, - d_sorted_source_node_index, - d_i_unsorted_source_arr, - d_i_sorted_source_arr, - n_used_source_nodes); + //// + cub::DeviceRadixSort::SortPairs( d_sort_storage, + sort_storage_bytes, + d_unsorted_source_node_index, + d_sorted_source_node_index, + d_i_unsorted_source_arr, + d_i_sorted_source_arr, + n_used_source_nodes ); + //// + // Allocate temporary storage - CUDAMALLOCCTRL("&d_sort_storage",&d_sort_storage, sort_storage_bytes); + CUDAMALLOCCTRL( "&d_sort_storage", &d_sort_storage, sort_storage_bytes ); // Run sorting operation - cub::DeviceRadixSort::SortPairs(d_sort_storage, sort_storage_bytes, - d_unsorted_source_node_index, - d_sorted_source_node_index, - d_i_unsorted_source_arr, - d_i_sorted_source_arr, - n_used_source_nodes); - - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - int h_sorted_source_node_index[n_used_source_nodes]; - int h_i_sorted_source_arr[n_used_source_nodes]; - - gpuErrchk(cudaMemcpy(h_sorted_source_node_index, - d_sorted_source_node_index, - n_used_source_nodes*sizeof(int), - cudaMemcpyDeviceToHost)); - - gpuErrchk(cudaMemcpy(h_i_sorted_source_arr, - d_i_sorted_source_arr, - n_used_source_nodes*sizeof(int), - cudaMemcpyDeviceToHost)); - - for (int i=0; i>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> + //// + cub::DeviceRadixSort::SortPairs( d_sort_storage, + sort_storage_bytes, + 
d_unsorted_source_node_index, + d_sorted_source_node_index, + d_i_unsorted_source_arr, + d_i_sorted_source_arr, + n_used_source_nodes ); + //// + + // !!!!!!!!!!!!!!!! // Allocate array of local source node map blocks // and copy their address from host to device - int n_blocks = h_local_source_node_map[target_host].size(); - int **d_node_map = NULL; + uint n_blocks = h_local_source_node_map_[ target_host ].size(); + uint** d_node_map = NULL; // get current number of elements in the map - int n_node_map; - //std::cout << "ok2 th " << target_host << "\n"; - gpuErrchk(cudaMemcpy(&n_node_map, - &d_n_local_source_node_map[target_host], sizeof(int), - cudaMemcpyDeviceToHost)); - - - if (n_blocks>0) { + uint n_node_map; + // std::cout << "ok2 th " << target_host << "\n"; + gpuErrchk( + cudaMemcpy( &n_node_map, &d_n_local_source_node_map_[ target_host ], sizeof( uint ), cudaMemcpyDeviceToHost ) ); + + if ( n_blocks > 0 ) + { // check for consistency between number of elements // and number of blocks in the map - int tmp_n_blocks = (n_node_map - 1) / h_node_map_block_size + 1; - if (tmp_n_blocks != n_blocks) { - std::cerr << "Inconsistent number of elements " - << n_node_map << " and number of blocks " - << n_blocks << " in local_source_node_map\n"; - exit(-1); + uint tmp_n_blocks = ( n_node_map - 1 ) / node_map_block_size_ + 1; + if ( tmp_n_blocks != n_blocks ) + { + std::cerr << "Inconsistent number of elements " << n_node_map << " and number of blocks " << n_blocks + << " in local_source_node_map\n"; + exit( -1 ); } - CUDAMALLOCCTRL("&d_node_map",&d_node_map, n_blocks*sizeof(int*)); - gpuErrchk(cudaMemcpy(d_node_map, - &h_local_source_node_map[target_host][0], - n_blocks*sizeof(int*), - cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&d_node_map", &d_node_map, n_blocks * sizeof( uint* ) ); + gpuErrchk( cudaMemcpy( + d_node_map, &h_local_source_node_map_[ target_host ][ 0 ], n_blocks * sizeof( uint* ), cudaMemcpyHostToDevice ) ); } // Allocate boolean array for flagging 
remote source nodes not yet mapped // and initialize all elements to 0 (false) - bool *d_node_to_map; - CUDAMALLOCCTRL("&d_node_to_map",&d_node_to_map, n_used_source_nodes*sizeof(bool)); - gpuErrchk(cudaMemset(d_node_to_map, 0, n_used_source_nodes*sizeof(bool))); - // Allocate number of nodes to be mapped and initialize it to 0 - int *d_n_node_to_map; - CUDAMALLOCCTRL("&d_n_node_to_map",&d_n_node_to_map, sizeof(int)); - gpuErrchk(cudaMemset(d_n_node_to_map, 0, sizeof(int))); + bool* d_node_to_map; + CUDAMALLOCCTRL( "&d_node_to_map", &d_node_to_map, n_used_source_nodes * sizeof( bool ) ); + gpuErrchk( cudaMemset( d_node_to_map, 0, n_used_source_nodes * sizeof( bool ) ) ); + // Allocate number of nodes to be mapped and initialize it to 0 + uint* d_n_node_to_map; + CUDAMALLOCCTRL( "&d_n_node_to_map", &d_n_node_to_map, sizeof( uint ) ); + gpuErrchk( cudaMemset( d_n_node_to_map, 0, sizeof( uint ) ) ); // launch kernel that searches remote source nodes indexes in the map, // flags the nodes not yet mapped and counts them - searchNodeIndexNotInMapKernel<<<(n_used_source_nodes+1023)/1024, 1024>>> - (d_node_map, n_node_map, d_sorted_source_node_index, d_node_to_map, - d_n_node_to_map, n_used_source_nodes); + searchNodeIndexNotInMapKernel<<< ( n_used_source_nodes + 1023 ) / 1024, 1024 >>>( + d_node_map, n_node_map, d_sorted_source_node_index, d_node_to_map, d_n_node_to_map, n_used_source_nodes ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - int h_n_node_to_map; - - gpuErrchk(cudaMemcpy(&h_n_node_to_map, d_n_node_to_map, sizeof(int), - cudaMemcpyDeviceToHost)); + uint h_n_node_to_map; - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "n_node_to_map: " << h_n_node_to_map << "\n"; - - bool h_node_to_map[n_used_source_nodes]; - - gpuErrchk(cudaMemcpy(h_node_to_map, d_node_to_map, - n_used_source_nodes*sizeof(bool), - cudaMemcpyDeviceToHost)); - - for (int i=0; i0) { - 
CUDAFREECTRL("d_node_map",d_node_map); + if ( n_blocks > 0 ) + { + CUDAFREECTRL( "d_node_map", d_node_map ); } - // update number of blocks in the map + // update number of blocks in the map n_blocks = new_n_blocks; // reallocate d_node_map and get it from host - CUDAMALLOCCTRL("&d_node_map",&d_node_map, n_blocks*sizeof(int*)); - gpuErrchk(cudaMemcpy(d_node_map, - &h_local_source_node_map[target_host][0], - n_blocks*sizeof(int*), - cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&d_node_map", &d_node_map, n_blocks * sizeof( uint* ) ); + gpuErrchk( cudaMemcpy( + d_node_map, &h_local_source_node_map_[ target_host ][ 0 ], n_blocks * sizeof( uint* ), cudaMemcpyHostToDevice ) ); } - + // Map the not-yet-mapped source nodes using a kernel // similar to the one used for counting // In the target host unmapped remote source nodes must be mapped // to local nodes from n_nodes to n_nodes + n_node_to_map - - // Allocate the index of the nodes to be mapped and initialize it to 0 - int *d_i_node_to_map; - CUDAMALLOCCTRL("&d_i_node_to_map",&d_i_node_to_map, sizeof(int)); - gpuErrchk(cudaMemset(d_i_node_to_map, 0, sizeof(int))); + + // Allocate the index of the nodes to be mapped and initialize it to 0 + uint* d_i_node_to_map; + CUDAMALLOCCTRL( "&d_i_node_to_map", &d_i_node_to_map, sizeof( uint ) ); + gpuErrchk( cudaMemset( d_i_node_to_map, 0, sizeof( uint ) ) ); // launch kernel that checks if nodes are already in map // if not insert them in the map // In the source host, put in the mapsource_node_index - insertNodesInMapKernel<<<(n_used_source_nodes+1023)/1024, 1024>>> - (d_node_map, NULL, 0, - n_node_map, d_sorted_source_node_index, d_node_to_map, - d_i_node_to_map, n_used_source_nodes); + insertNodesInMapKernel<<< ( n_used_source_nodes + 1023 ) / 1024, 1024 >>>( + d_node_map, NULL, 0, n_node_map, d_sorted_source_node_index, d_node_to_map, d_i_node_to_map, n_used_source_nodes ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); // update number of 
elements in remote source node map n_node_map += h_n_node_to_map; - //std::cout << "ok1 nnm " << n_node_map << " th " << target_host << "\n"; - gpuErrchk(cudaMemcpy(&d_n_local_source_node_map[target_host], - &n_node_map, sizeof(int), cudaMemcpyHostToDevice)); - + // std::cout << "ok1 nnm " << n_node_map << " th " << target_host << "\n"; + gpuErrchk( + cudaMemcpy( &d_n_local_source_node_map_[ target_host ], &n_node_map, sizeof( uint ), cudaMemcpyHostToDevice ) ); + // check for consistency between number of elements // and number of blocks in the map - int tmp_n_blocks = (n_node_map - 1) / h_node_map_block_size + 1; - if (tmp_n_blocks != n_blocks) { - std::cerr << "Inconsistent number of elements " - << n_node_map << " and number of blocks " - << n_blocks << " in local_source_node_map\n"; - exit(-1); - } - - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "//////////////////////////////////////////////\n"; - std::cout << "UPDATED UNSORTED MAP\n"; - std::cout << "OF LOCAL-SOURCE_NODES\n"; - std::cout << "n_node_map: " << n_node_map << "\n"; - std::cout << "n_blocks: " << n_blocks << "\n"; - std::cout << "block_size: " << h_node_map_block_size << "\n"; - - int block_size = h_node_map_block_size; - int h_node_map_block[block_size]; - for (int ib=0; ib(key_subarray, n, + // copass_sort::sort(key_subarray, n, // aux_size, d_storage, storage_bytes); // Determine temporary storage requirements for copass_sort int64_t storage_bytes = 0; - void *d_storage = NULL; - copass_sort::sort - (&h_local_source_node_map[target_host][0], - n_node_map, h_node_map_block_size, d_storage, storage_bytes); - -#ifdef CHECKRC - printf("storage bytes for copass sort: %ld\n", storage_bytes); -#endif - + void* d_storage = NULL; + copass_sort::sort< uint >( + &h_local_source_node_map_[ target_host ][ 0 ], n_node_map, node_map_block_size_, d_storage, storage_bytes ); + // Allocate temporary storage - 
CUDAMALLOCCTRL("&d_storage",&d_storage, storage_bytes); + CUDAMALLOCCTRL( "&d_storage", &d_storage, storage_bytes ); // Run sorting operation - copass_sort::sort - (&h_local_source_node_map[target_host][0], - n_node_map, h_node_map_block_size, d_storage, storage_bytes); - CUDAFREECTRL("d_storage",d_storage); - - ////////////////////////////////////////////////////////////////////// -#ifdef CHECKRC - /// TEMPORARY for check - std::cout << "//////////////////////////////////////////////\n"; - std::cout << "UPDATED SORTED MAP\n"; - std::cout << "OF LOCAL-SOURCE_NODES\n"; - std::cout << "n_node_map: " << n_node_map << "\n"; - std::cout << "n_blocks: " << n_blocks << "\n"; - std::cout << "block_size: " << block_size << "\n"; - - for (int ib=0; ib( + &h_local_source_node_map_[ target_host ][ 0 ], n_node_map, node_map_block_size_, d_storage, storage_bytes ); + CUDAFREECTRL( "d_storage", d_storage ); // Remove temporary new connections in source host !!!!!!!!!!! // potential problem: check that number of blocks is an independent variable - // not calculated from NConn + // not calculated from n_conn_ // connect.cu riga 462. 
Corrected but better keep an eye - // also, hopefully the is no global device variable for NConn - NConn = old_n_conn; + // also, hopefully the is no global device variable for n_conn_ + n_conn_ = old_n_conn; return 0; } -__global__ void MapIndexToSpikeBufferKernel(int n_hosts, int *host_offset, - int *node_index); - - #endif // REMOTECONNECTH - diff --git a/src/remote_spike.cu b/src/remote_spike.cu index de3ec4a64..fd71c0675 100644 --- a/src/remote_spike.cu +++ b/src/remote_spike.cu @@ -28,404 +28,421 @@ __constant__ bool have_remote_spike_height; #include #include "cuda_error.h" -#include "utilities.h" -#include "spike_buffer.h" #include "getRealTime.h" +#include "spike_buffer.h" +#include "utilities.h" #include "remote_spike.h" +#include "nestgpu.h" +#include "remote_connect.h" #include "scan.h" #include "utilities.h" -#include "remote_connect.h" - // Simple kernel for pushing remote spikes in local spike buffers -// Version without spike multiplicity array (spike_height) -__global__ void PushSpikeFromRemote(int n_spikes, int *spike_buffer_id) +// Version without spike multiplicity array (spike_height) +__global__ void +PushSpikeFromRemote( uint n_spikes, uint* spike_buffer_id ) { - int i_spike = threadIdx.x + blockIdx.x * blockDim.x; - if (i_spike=MaxSpikePerHost) { - printf("Number of spikes larger than MaxSpikePerHost: %d\n", MaxSpikePerHost); + uint pos = atomicAdd( ExternalSpikeNum, 1 ); + if ( pos >= MaxSpikePerHost ) + { + printf( "Number of spikes larger than MaxSpikePerHost: %d\n", MaxSpikePerHost ); *ExternalSpikeNum = MaxSpikePerHost; return; } - ExternalSpikeSourceNode[pos] = i_source; - ExternalSpikeHeight[pos] = height; + ExternalSpikeSourceNode[ pos ] = i_source; + ExternalSpikeHeight[ pos ] = height; } // Push in a dedicated array the spikes that must be sent externally // (version without spike height) -__device__ void PushExternalSpike(int i_source) +__device__ void +PushExternalSpike( uint i_source ) { - int pos = atomicAdd(ExternalSpikeNum, 
1); - if (pos>=MaxSpikePerHost) { - printf("Number of spikes larger than MaxSpikePerHost: %d\n", MaxSpikePerHost); + uint pos = atomicAdd( ExternalSpikeNum, 1 ); + if ( pos >= MaxSpikePerHost ) + { + printf( "Number of spikes larger than MaxSpikePerHost: %d\n", MaxSpikePerHost ); *ExternalSpikeNum = MaxSpikePerHost; return; } - ExternalSpikeSourceNode[pos] = i_source; + ExternalSpikeSourceNode[ pos ] = i_source; } // Count the spikes that must be sent externally for each target host -__global__ void countExternalSpikesPerTargetHost() +__global__ void +countExternalSpikesPerTargetHost() { - const int i_spike = blockIdx.x; - if (i_spike < *ExternalSpikeNum) { - //printf("ExternalSpikeNum: %d\ti_spike: %d\n", *ExternalSpikeNum, i_spike); - int i_source = ExternalSpikeSourceNode[i_spike]; - //printf("i_source: %d\n", i_source); - int Nth = NExternalNodeTargetHost[i_source]; - //printf("Nth: %d\n", Nth); - - for (int ith=threadIdx.x; ith>>(n_hosts_, max_spike_per_host_, - d_ExternalSpikeNum, - d_ExternalSpikeSourceNode, - d_ExternalSpikeHeight, - d_ExternalTargetSpikeNum, - d_ExternalTargetSpikeIdx0, - d_ExternalTargetSpikeNodeId, - d_ExternalTargetSpikeHeight, - d_n_target_hosts, - d_node_target_hosts, - d_node_target_host_i_map - ); + + CUDAMALLOCCTRL( "&d_ExternalTargetSpikeNum", &d_ExternalTargetSpikeNum, n_hosts_ * sizeof( uint ) ); + + // printf("n_hosts, max_spike_per_host: %d %d\n", n_hosts, + // max_spike_per_host); + + CUDAMALLOCCTRL( + "&d_ExternalTargetSpikeNodeId", &d_ExternalTargetSpikeNodeId, max_remote_spike_num_ * sizeof( uint ) ); + + // CUDAMALLOCCTRL("&d_ExternalSourceSpikeNum",&d_ExternalSourceSpikeNum, + // n_hosts*sizeof(int)); + CUDAMALLOCCTRL( + "&d_ExternalSourceSpikeNodeId", &d_ExternalSourceSpikeNodeId, max_remote_spike_num_ * sizeof( uint ) ); + CUDAMALLOCCTRL( "&d_ExternalTargetSpikeIdx0", &d_ExternalTargetSpikeIdx0, ( n_hosts_ + 1 ) * sizeof( uint ) ); + + CUDAMALLOCCTRL( "&d_ExternalSourceSpikeIdx0", &d_ExternalSourceSpikeIdx0, ( n_hosts_ 
+ 1 ) * sizeof( uint ) ); + + // CUDAMALLOCCTRL("&d_NExternalNodeTargetHost",&d_NExternalNodeTargetHost, + // n_node*sizeof(uint)); + // CUDAMALLOCCTRL("&d_ExternalNodeTargetHostId",&d_ExternalNodeTargetHostId, + // n_node*sizeof(uint*)); + // CUDAMALLOCCTRL("&d_ExternalNodeId",&d_ExternalNodeId, + // n_node*sizeof(uint*)); + + if ( remote_spike_height_ ) + { + DeviceExternalSpikeInit<<< 1, 1 >>>( n_hosts_, + max_spike_per_host_, + d_ExternalSpikeNum, + d_ExternalSpikeSourceNode, + d_ExternalSpikeHeight, + d_ExternalTargetSpikeNum, + d_ExternalTargetSpikeIdx0, + d_ExternalTargetSpikeNodeId, + d_ExternalTargetSpikeHeight, + conn_->getDevNTargetHosts(), + conn_->getDevNodeTargetHosts(), + conn_->getDevNodeTargetHostIMap() ); } - else { - DeviceExternalSpikeInit<<<1,1>>>(n_hosts_, max_spike_per_host_, - d_ExternalSpikeNum, - d_ExternalSpikeSourceNode, - d_ExternalTargetSpikeNum, - d_ExternalTargetSpikeIdx0, - d_ExternalTargetSpikeNodeId, - d_n_target_hosts, - d_node_target_hosts, - d_node_target_host_i_map - ); + else + { + DeviceExternalSpikeInit<<< 1, 1 >>>( n_hosts_, + max_spike_per_host_, + d_ExternalSpikeNum, + d_ExternalSpikeSourceNode, + d_ExternalTargetSpikeNum, + d_ExternalTargetSpikeIdx0, + d_ExternalTargetSpikeNodeId, + conn_->getDevNTargetHosts(), + conn_->getDevNodeTargetHosts(), + conn_->getDevNodeTargetHostIMap() ); } - //delete[] h_NExternalNodeTargetHost; - //delete[] h_ExternalNodeTargetHostId; - //delete[] h_ExternalNodeId; + // delete[] h_NExternalNodeTargetHost; + // delete[] h_ExternalNodeTargetHostId; + // delete[] h_ExternalNodeId; return 0; } // initialize external spike array pointers in the GPU -__global__ void DeviceExternalSpikeInit(int n_hosts, - int max_spike_per_host, - int *ext_spike_num, - int *ext_spike_source_node, - float *ext_spike_height, - int *ext_target_spike_num, - int *ext_target_spike_idx0, - int *ext_target_spike_node_id, - float *ext_target_spike_height, - int *n_ext_node_target_host, - int **ext_node_target_host_id, - 
int **ext_node_id - ) - +__global__ void +DeviceExternalSpikeInit( uint n_hosts, + uint max_spike_per_host, + uint* ext_spike_num, + uint* ext_spike_source_node, + float* ext_spike_height, + uint* ext_target_spike_num, + uint* ext_target_spike_idx0, + uint* ext_target_spike_node_id, + float* ext_target_spike_height, + uint* n_ext_node_target_host, + uint** ext_node_target_host_id, + uint** ext_node_id ) + { NExternalTargetHost = n_hosts; - MaxSpikePerHost = max_spike_per_host; + MaxSpikePerHost = max_spike_per_host; ExternalSpikeNum = ext_spike_num; ExternalSpikeSourceNode = ext_spike_source_node; ExternalSpikeHeight = ext_spike_height; ExternalTargetSpikeNum = ext_target_spike_num; - ExternalTargetSpikeIdx0 = ext_target_spike_idx0, - ExternalTargetSpikeNodeId = ext_target_spike_node_id; + ExternalTargetSpikeIdx0 = ext_target_spike_idx0, ExternalTargetSpikeNodeId = ext_target_spike_node_id; ExternalTargetSpikeHeight = ext_target_spike_height; NExternalNodeTargetHost = n_ext_node_target_host; ExternalNodeTargetHostId = ext_node_target_host_id; ExternalNodeId = ext_node_id; *ExternalSpikeNum = 0; - for (int ith=0; ith>>(); + countExternalSpikesPerTargetHost<<< n_ext_spikes, 1024 >>>(); CUDASYNC; - prefix_scan(d_ExternalTargetSpikeIdx0, d_ExternalTargetSpikeNum, - n_hosts_+1, true); + prefix_scan( ( int* ) d_ExternalTargetSpikeIdx0, ( int* ) d_ExternalTargetSpikeNum, n_hosts_ + 1, true ); DBGCUDASYNC; - gpuErrchk(cudaMemset(d_ExternalTargetSpikeNum, 0, n_hosts_*sizeof(int))); - organizeExternalSpikesPerTargetHost<<>>(); + gpuErrchk( cudaMemset( d_ExternalTargetSpikeNum, 0, n_hosts_ * sizeof( uint ) ) ); + organizeExternalSpikesPerTargetHost<<< n_ext_spikes, 1024 >>>(); CUDASYNC; - - return 0; + + return 0; } // pack spikes received from remote hosts // and copy them to GPU memory -int NESTGPU::CopySpikeFromRemote() +int +NESTGPU::CopySpikeFromRemote() { int n_spike_tot = 0; - h_ExternalSourceSpikeIdx0[0] = 0; + h_ExternalSourceSpikeIdx0[ 0 ] = 0; // loop on hosts - 
for (int i_host=0; i_host= max_remote_spike_num_) { - throw ngpu_exception - (std::string("Number of spikes to be received remotely ") - + std::to_string(n_spike_tot) - + " larger than limit " + std::to_string(max_remote_spike_num_)); + + if ( n_spike_tot >= max_remote_spike_num_ ) + { + throw ngpu_exception( std::string( "Number of spikes to be received remotely " ) + std::to_string( n_spike_tot ) + + " larger than limit " + std::to_string( max_remote_spike_num_ ) ); } - - if (n_spike_tot>0) { + + if ( n_spike_tot > 0 ) + { double time_mark = getRealTime(); - // Memcopy will be synchronized + // Memcopy will be synchronized // copy to GPU memory cumulative sum of number of spikes per source host - gpuErrchk(cudaMemcpyAsync(d_ExternalSourceSpikeIdx0, - h_ExternalSourceSpikeIdx0, - (n_hosts_+1)*sizeof(int), cudaMemcpyHostToDevice)); + gpuErrchk( cudaMemcpyAsync( d_ExternalSourceSpikeIdx0, + h_ExternalSourceSpikeIdx0, + ( n_hosts_ + 1 ) * sizeof( uint ), + cudaMemcpyHostToDevice ) ); DBGCUDASYNC; // copy to GPU memory packed spikes from remote hosts - gpuErrchk(cudaMemcpyAsync(d_ExternalSourceSpikeNodeId, - h_ExternalSourceSpikeNodeId, - n_spike_tot*sizeof(int), cudaMemcpyHostToDevice)); + gpuErrchk( cudaMemcpyAsync( d_ExternalSourceSpikeNodeId, + h_ExternalSourceSpikeNodeId, + n_spike_tot * sizeof( uint ), + cudaMemcpyHostToDevice ) ); DBGCUDASYNC; - RecvSpikeFromRemote_CUDAcp_time_ += (getRealTime() - time_mark); + RecvSpikeFromRemote_CUDAcp_time_ += ( getRealTime() - time_mark ); // convert node map indexes to spike buffer indexes - MapIndexToSpikeBufferKernel<<>>(n_hosts_, - d_ExternalSourceSpikeIdx0, - d_ExternalSourceSpikeNodeId); + MapIndexToSpikeBufferKernel<<< n_hosts_, 1024 >>>( + n_hosts_, d_ExternalSourceSpikeIdx0, d_ExternalSourceSpikeNodeId ); DBGCUDASYNC; // convert node group indexes to spike buffer indexes - // by adding the index of the first node of the node group - //AddOffset<<<(n_spike_tot+1023)/1024, 1024>>> + // by adding the index of the 
first node of the node group + // AddOffset<<<(n_spike_tot+1023)/1024, 1024 >>> // (n_spike_tot, d_ExternalSourceSpikeNodeId, i_remote_node_0); - //gpuErrchk( cudaPeekAtLastError() ); - //cudaDeviceSynchronize(); + // gpuErrchk( cudaPeekAtLastError() ); + // cudaDeviceSynchronize(); // push remote spikes in local spike buffers - PushSpikeFromRemote<<<(n_spike_tot+1023)/1024, 1024>>> - (n_spike_tot, d_ExternalSourceSpikeNodeId); + PushSpikeFromRemote<<< ( n_spike_tot + 1023 ) / 1024, 1024 >>>( n_spike_tot, d_ExternalSourceSpikeNodeId ); DBGCUDASYNC; } - + return n_spike_tot; } - diff --git a/src/remote_spike.h b/src/remote_spike.h index 3356cc2ad..dbf9db66b 100644 --- a/src/remote_spike.h +++ b/src/remote_spike.h @@ -20,109 +20,100 @@ * */ - - - - #ifndef REMOTE_SPIKE_H #define REMOTE_SPIKE_H extern __constant__ bool have_remote_spike_height; -__global__ void PushSpikeFromRemote(int n_spikes, int *spike_buffer_id, - float *spike_height); +__global__ void PushSpikeFromRemote( uint n_spikes, uint* spike_buffer_id, float* spike_height ); -__global__ void PushSpikeFromRemote(int n_spikes, int *spike_buffer_id); +__global__ void PushSpikeFromRemote( uint n_spikes, uint* spike_buffer_id ); -extern __device__ int NExternalTargetHost; -extern __device__ int MaxSpikePerHost; +extern __device__ uint NExternalTargetHost; +extern __device__ uint MaxSpikePerHost; -extern int *d_ExternalSpikeNum; -extern __device__ int *ExternalSpikeNum; +extern uint* d_ExternalSpikeNum; +extern __device__ uint* ExternalSpikeNum; -extern int *d_ExternalSpikeSourceNode; // [MaxSpikeNum]; -extern __device__ int *ExternalSpikeSourceNode; +extern uint* d_ExternalSpikeSourceNode; // [MaxSpikeNum]; +extern __device__ uint* ExternalSpikeSourceNode; -extern float *d_ExternalSpikeHeight; // [MaxSpikeNum]; -extern __device__ float *ExternalSpikeHeight; +extern float* d_ExternalSpikeHeight; // [MaxSpikeNum]; +extern __device__ float* ExternalSpikeHeight; -extern int *d_ExternalTargetSpikeNum; -extern 
__device__ int *ExternalTargetSpikeNum; +extern uint* d_ExternalTargetSpikeNum; +extern __device__ uint* ExternalTargetSpikeNum; -extern int *d_ExternalTargetSpikeNodeId; -extern __device__ int *ExternalTargetSpikeNodeId; +extern uint* d_ExternalTargetSpikeNodeId; +extern __device__ uint* ExternalTargetSpikeNodeId; -extern float *d_ExternalTargetSpikeHeight; -extern __device__ float *ExternalTargetSpikeHeight; +extern float* d_ExternalTargetSpikeHeight; +extern __device__ float* ExternalTargetSpikeHeight; -//extern int *d_NExternalNodeTargetHost; -extern __device__ int *NExternalNodeTargetHost; +// extern uint *d_NExternalNodeTargetHost; +extern __device__ uint* NExternalNodeTargetHost; -//extern int **d_ExternalNodeTargetHostId; -extern __device__ int **ExternalNodeTargetHostId; +// extern uint **d_ExternalNodeTargetHostId; +extern __device__ uint** ExternalNodeTargetHostId; -//extern int **d_ExternalNodeId; -extern __device__ int **ExternalNodeId; +// extern uint **d_ExternalNodeId; +extern __device__ uint** ExternalNodeId; -//extern int *d_ExternalSourceSpikeNum; -//extern __device__ int *ExternalSourceSpikeNum; +// extern uint *d_ExternalSourceSpikeNum; +// extern __device__ uint *ExternalSourceSpikeNum; -extern int *d_ExternalSourceSpikeNodeId; -extern __device__ int *ExternalSourceSpikeNodeId; +extern uint* d_ExternalSourceSpikeNodeId; +extern __device__ uint* ExternalSourceSpikeNodeId; -extern float *d_ExternalSourceSpikeHeight; -extern __device__ float *ExternalSourceSpikeHeight; +extern float* d_ExternalSourceSpikeHeight; +extern __device__ float* ExternalSourceSpikeHeight; -extern int *d_ExternalTargetSpikeIdx0; -extern __device__ int *ExternalTargetSpikeIdx0; -extern int *h_ExternalTargetSpikeIdx0; +extern uint* d_ExternalTargetSpikeIdx0; +extern __device__ uint* ExternalTargetSpikeIdx0; +extern uint* h_ExternalTargetSpikeIdx0; -extern int *d_ExternalSourceSpikeIdx0; +extern uint* d_ExternalSourceSpikeIdx0; -extern int *h_ExternalTargetSpikeNum; -extern 
int *h_ExternalSourceSpikeNum; -extern int *h_ExternalSourceSpikeIdx0; -extern int *h_ExternalTargetSpikeNodeId; -extern int *h_ExternalSourceSpikeNodeId; +extern uint* h_ExternalTargetSpikeNum; +extern uint* h_ExternalSourceSpikeNum; +extern uint* h_ExternalSourceSpikeIdx0; +extern uint* h_ExternalTargetSpikeNodeId; +extern uint* h_ExternalSourceSpikeNodeId; -//extern int *h_ExternalSpikeNodeId; +// extern uint *h_ExternalSpikeNodeId; -extern float *h_ExternalSpikeHeight; +extern float* h_ExternalSpikeHeight; -__device__ void PushExternalSpike(int i_source, float height); +__device__ void PushExternalSpike( uint i_source, float height ); -__device__ void PushExternalSpike(int i_source); +__device__ void PushExternalSpike( uint i_source ); __global__ void countExternalSpikesPerTargetHost(); __global__ void organizeExternalSpikesPerTargetHost(); -__global__ void DeviceExternalSpikeInit(int n_hosts, - int max_spike_per_host, - int *ext_spike_num, - int *ext_spike_source_node, - float *ext_spike_height, - int *ext_target_spike_num, - int *ext_target_spike_idx0, - int *ext_target_spike_node_id, - float *ext_target_spike_height, - int *n_ext_node_target_host, - int **ext_node_target_host_id, - int **ext_node_id - ); - -__global__ void DeviceExternalSpikeInit(int n_hosts, - int max_spike_per_host, - int *ext_spike_num, - int *ext_spike_source_node, - int *ext_target_spike_num, - int *ext_target_spike_idx0, - int *ext_target_spike_node_id, - int *n_ext_node_target_host, - int **ext_node_target_host_id, - int **ext_node_id - ); - +__global__ void DeviceExternalSpikeInit( uint n_hosts, + uint max_spike_per_host, + uint* ext_spike_num, + uint* ext_spike_source_node, + float* ext_spike_height, + uint* ext_target_spike_num, + uint* ext_target_spike_idx0, + uint* ext_target_spike_node_id, + float* ext_target_spike_height, + uint* n_ext_node_target_host, + uint** ext_node_target_host_id, + uint** ext_node_id ); + +__global__ void DeviceExternalSpikeInit( uint n_hosts, + uint 
max_spike_per_host, + uint* ext_spike_num, + uint* ext_spike_source_node, + uint* ext_target_spike_num, + uint* ext_target_spike_idx0, + uint* ext_target_spike_node_id, + uint* n_ext_node_target_host, + uint** ext_node_target_host_id, + uint** ext_node_id ); #endif - diff --git a/src/rev_spike.cu b/src/rev_spike.cu index b72170539..6536369f2 100644 --- a/src/rev_spike.cu +++ b/src/rev_spike.cu @@ -20,175 +20,123 @@ * */ -#include -#include -#include "spike_buffer.h" +#include "connect.h" #include "cuda_error.h" +#include "spike_buffer.h" #include "syn_model.h" -#include "connect.h" +#include #include +#include #define SPIKE_TIME_DIFF_GUARD 15000 // must be less than 16384 -#define SPIKE_TIME_DIFF_THR 10000 // must be less than GUARD +#define SPIKE_TIME_DIFF_THR 10000 // must be less than GUARD extern __constant__ long long NESTGPUTimeIdx; -extern __constant__ float NESTGPUTimeResolution; - -unsigned int *d_RevSpikeNum; -unsigned int *d_RevSpikeTarget; -int *d_RevSpikeNConn; - -extern __device__ void SynapseUpdate(int syn_group, float *w, float Dt); - -__device__ unsigned int *RevSpikeNum; -__device__ unsigned int *RevSpikeTarget; -__device__ int *RevSpikeNConn; -int64_t h_NRevConn; - -int64_t *d_RevConnections; //[i] i=0,..., n_rev_conn - 1; -__device__ int64_t *RevConnections; - -int *d_TargetRevConnectionSize; //[i] i=0,..., n_neuron-1; -__device__ int *TargetRevConnectionSize; +extern __constant__ float NESTGPUTimeResolution; -int64_t **d_TargetRevConnection; //[i][j] j=0,...,RevConnectionSize[i]-1 -__device__ int64_t **TargetRevConnection; +extern __device__ void SynapseUpdate( int syn_group, float* w, float Dt ); +__device__ unsigned int* RevSpikeNum; -// Count number of reverse connections per target node -__global__ void CountRevConnectionsKernel -(int64_t n_conn, int64_t *target_rev_connection_size_64) -{ - int64_t i_conn = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_conn >= n_conn) return; +__device__ unsigned int* RevSpikeTarget; - uint 
i_block = (uint)(i_conn / ConnBlockSize); - int64_t i_block_conn = i_conn % ConnBlockSize; - connection_struct conn = ConnectionArray[i_block][i_block_conn]; +__device__ int* RevSpikeNConn; - // TO BE IMPROVED BY CHECKING IF THE SYNAPSE TYPE OF THE GROUP - // REQUIRES REVERSE CONNECTION - // - Check syn_group of all connections. - // - If syn_group>0 must create a reverse connection: - if ((conn.target_port_syn & SynMask) > 0) { - // First get target node index - uint target_port_syn = conn.target_port_syn; - int i_target = target_port_syn >> MaxPortSynNBits; - // (atomic)increase the number of reverse connections for target - atomicAdd((unsigned long long *)&target_rev_connection_size_64[i_target], - 1); - } -} +__device__ int64_t* RevConnections; +__device__ int* TargetRevConnectionSize; +__device__ int64_t** TargetRevConnection; -// Fill array of reverse connection indexes -__global__ void SetRevConnectionsIndexKernel -(int64_t n_conn, int *target_rev_connection_size, - int64_t **target_rev_connection) +__global__ void +setTargetRevConnectionsPtKernel( int n_spike_buffer, + int64_t* target_rev_connection_cumul, + int64_t** target_rev_connection, + int64_t* rev_connections ) { - int64_t i_conn = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (i_conn >= n_conn) return; - - uint i_block = (uint)(i_conn / ConnBlockSize); - int64_t i_block_conn = i_conn % ConnBlockSize; - connection_struct conn = ConnectionArray[i_block][i_block_conn]; - - // TO BE IMPROVED BY CHECKING IF THE SYNAPSE TYPE OF THE GROUP - // REQUIRES REVERSE CONNECTION - // - Check syn_group of all connections. 
- // - If syn_group>0 must create a reverse connection: - if ((conn.target_port_syn & SynMask) > 0) { - // First get target node index - uint target_port_syn = conn.target_port_syn; - int i_target = target_port_syn >> MaxPortSynNBits; - // (atomic)increase the number of reverse connections for target - int pos = atomicAdd(&target_rev_connection_size[i_target], 1); - // Evaluate the pointer to the rev connection position in the - // array of reverse connection indexes - int64_t *rev_conn_pt = target_rev_connection[i_target] + pos; - // Fill it with the connection index - *rev_conn_pt = i_conn; + int i_target = blockIdx.x * blockDim.x + threadIdx.x; + if ( i_target >= n_spike_buffer ) + { + return; } + target_rev_connection[ i_target ] = rev_connections + target_rev_connection_cumul[ i_target ]; } -__global__ void SetTargetRevConnectionsPtKernel - (int n_spike_buffer, int64_t *target_rev_connection_cumul, - int64_t **target_rev_connection, int64_t *rev_connections) -{ - int i_target = blockIdx.x * blockDim.x + threadIdx.x; - if (i_target >= n_spike_buffer) return; - target_rev_connection[i_target] = rev_connections - + target_rev_connection_cumul[i_target]; -} - -__global__ void RevConnectionInitKernel(int64_t *rev_conn, - int *target_rev_conn_size, - int64_t **target_rev_conn) +__global__ void +revConnectionInitKernel( int64_t* rev_conn, int* target_rev_conn_size, int64_t** target_rev_conn ) { RevConnections = rev_conn; TargetRevConnectionSize = target_rev_conn_size; TargetRevConnection = target_rev_conn; } - -__global__ void RevSpikeBufferUpdate(unsigned int n_node) +__global__ void +revSpikeBufferUpdate( unsigned int n_node ) { unsigned int i_node = threadIdx.x + blockIdx.x * blockDim.x; - if (i_node >= n_node) { + if ( i_node >= n_node ) + { return; } - long long target_spike_time_idx = LastRevSpikeTimeIdx[i_node]; + long long target_spike_time_idx = LastRevSpikeTimeIdx[ i_node ]; // Check if a spike reached the input synapses now - if 
(target_spike_time_idx!=NESTGPUTimeIdx) { + if ( target_spike_time_idx != NESTGPUTimeIdx ) + { return; } - int n_conn = TargetRevConnectionSize[i_node]; - if (n_conn>0) { - unsigned int pos = atomicAdd(RevSpikeNum, 1); - RevSpikeTarget[pos] = i_node; - RevSpikeNConn[pos] = n_conn; + int n_conn = TargetRevConnectionSize[ i_node ]; + if ( n_conn > 0 ) + { + unsigned int pos = atomicAdd( RevSpikeNum, 1 ); + RevSpikeTarget[ pos ] = i_node; + RevSpikeNConn[ pos ] = n_conn; } } -__global__ void SetConnectionSpikeTime(unsigned int n_conn, - unsigned short time_idx) +__global__ void +setConnectionSpikeTime( unsigned int n_conn, unsigned short time_idx ) { unsigned int i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) { + if ( i_conn >= n_conn ) + { return; } - ConnectionSpikeTime[i_conn] = time_idx; + ConnectionSpikeTime[ i_conn ] = time_idx; } -__global__ void ResetConnectionSpikeTimeUpKernel(unsigned int n_conn) +__global__ void +resetConnectionSpikeTimeUpKernel( unsigned int n_conn ) { unsigned int i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) { + if ( i_conn >= n_conn ) + { return; } - unsigned short spike_time = ConnectionSpikeTime[i_conn]; - if (spike_time >= 0x8000) { - ConnectionSpikeTime[i_conn] = 0; + unsigned short spike_time = ConnectionSpikeTime[ i_conn ]; + if ( spike_time >= 0x8000 ) + { + ConnectionSpikeTime[ i_conn ] = 0; } } -__global__ void ResetConnectionSpikeTimeDownKernel(unsigned int n_conn) +__global__ void +resetConnectionSpikeTimeDownKernel( unsigned int n_conn ) { unsigned int i_conn = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn>=n_conn) { + if ( i_conn >= n_conn ) + { return; } - unsigned short spike_time = ConnectionSpikeTime[i_conn]; - if (spike_time < 0x8000) { - ConnectionSpikeTime[i_conn] = 0x8000; + unsigned short spike_time = ConnectionSpikeTime[ i_conn ]; + if ( spike_time < 0x8000 ) + { + ConnectionSpikeTime[ i_conn ] = 0x8000; } } -__global__ void DeviceRevSpikeInit(unsigned int 
*rev_spike_num, - unsigned int *rev_spike_target, - int *rev_spike_n_conn) +__global__ void +deviceRevSpikeInit( unsigned int* rev_spike_num, unsigned int* rev_spike_target, int* rev_spike_n_conn ) { RevSpikeNum = rev_spike_num; RevSpikeTarget = rev_spike_target; @@ -196,131 +144,8 @@ __global__ void DeviceRevSpikeInit(unsigned int *rev_spike_num, *RevSpikeNum = 0; } -__global__ void RevSpikeReset() +__global__ void +revSpikeReset() { *RevSpikeNum = 0; } - - -int ResetConnectionSpikeTimeUp() -{ - ResetConnectionSpikeTimeUpKernel - <<<(NConn+1023)/1024, 1024>>> - (NConn); - gpuErrchk( cudaPeekAtLastError() ); - - return 0; -} - -int ResetConnectionSpikeTimeDown() -{ - ResetConnectionSpikeTimeDownKernel - <<<(NConn+1023)/1024, 1024>>> - (NConn); - gpuErrchk( cudaPeekAtLastError() ); - - return 0; -} - - - -int RevSpikeFree() -{ - CUDAFREECTRL("&d_RevSpikeNum",&d_RevSpikeNum); - CUDAFREECTRL("&d_RevSpikeTarget",&d_RevSpikeTarget); - CUDAFREECTRL("&d_RevSpikeNConn",&d_RevSpikeNConn); - - return 0; -} - -int RevSpikeInit(uint n_spike_buffers) -{ - //printf("n_spike_buffers: %d\n", n_spike_buffers); - - ////////////////////////////////////////////////////////////////////// - /////// Organize reverse connections (new version) - // CHECK THE GLOBAL VARIABLES THAT MUST BE CONVERTED TO 64 bit ARRAYS - ////////////////////////////////////////////////////////////////////// - // Alloc 64 bit array of number of reverse connections per target node - // and initialize it to 0 - int64_t *d_target_rev_conn_size_64; - int64_t *d_target_rev_conn_cumul; - CUDAMALLOCCTRL("&d_target_rev_conn_size_64",&d_target_rev_conn_size_64, - (n_spike_buffers+1)*sizeof(int64_t)); - gpuErrchk(cudaMemset(d_target_rev_conn_size_64, 0, - (n_spike_buffers+1)*sizeof(int64_t))); - // Count number of reverse connections per target node - CountRevConnectionsKernel<<<(NConn+1023)/1024, 1024>>> - (NConn, d_target_rev_conn_size_64); - // Evaluate exclusive sum of reverse connections per target node - // Allocate 
array for cumulative sum - CUDAMALLOCCTRL("&d_target_rev_conn_cumul",&d_target_rev_conn_cumul, - (n_spike_buffers+1)*sizeof(int64_t)); - // Determine temporary device storage requirements - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, - d_target_rev_conn_size_64, - d_target_rev_conn_cumul, - n_spike_buffers+1); - // Allocate temporary storage - CUDAMALLOCCTRL("&d_temp_storage",&d_temp_storage, temp_storage_bytes); - // Run exclusive prefix sum - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, - d_target_rev_conn_size_64, - d_target_rev_conn_cumul, - n_spike_buffers+1); - // The last element is the total number of reverse connections - gpuErrchk(cudaMemcpy(&h_NRevConn, &d_target_rev_conn_cumul[n_spike_buffers], - sizeof(int64_t), cudaMemcpyDeviceToHost)); - if (h_NRevConn > 0) { - // Allocate array of reverse connection indexes - // CHECK THAT d_RevConnections is of type int64_t array - CUDAMALLOCCTRL("&d_RevConnections",&d_RevConnections, h_NRevConn*sizeof(int64_t)); - // For each target node evaluate the pointer - // to its first reverse connection using the exclusive sum - // CHECK THAT d_TargetRevConnection is of type int64_t* pointer - CUDAMALLOCCTRL("&d_TargetRevConnection",&d_TargetRevConnection, n_spike_buffers - *sizeof(int64_t*)); - SetTargetRevConnectionsPtKernel<<<(n_spike_buffers+1023)/1024, 1024>>> - (n_spike_buffers, d_target_rev_conn_cumul, - d_TargetRevConnection, d_RevConnections); - - // alloc 32 bit array of number of reverse connections per target node - CUDAMALLOCCTRL("&d_TargetRevConnectionSize",&d_TargetRevConnectionSize, - n_spike_buffers*sizeof(int)); - // and initialize it to 0 - gpuErrchk(cudaMemset(d_TargetRevConnectionSize, 0, - n_spike_buffers*sizeof(int))); - // Fill array of reverse connection indexes - SetRevConnectionsIndexKernel<<<(NConn+1023)/1024, 1024>>> - (NConn, d_TargetRevConnectionSize, d_TargetRevConnection); - - 
RevConnectionInitKernel<<<1,1>>> - (d_RevConnections, d_TargetRevConnectionSize, d_TargetRevConnection); - - SetConnectionSpikeTime - <<<(NConn+1023)/1024, 1024>>> - (NConn, 0x8000); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - - CUDAMALLOCCTRL("&d_RevSpikeNum",&d_RevSpikeNum, sizeof(unsigned int)); - - CUDAMALLOCCTRL("&d_RevSpikeTarget",&d_RevSpikeTarget, - n_spike_buffers*sizeof(unsigned int)); - CUDAMALLOCCTRL("&d_RevSpikeNConn",&d_RevSpikeNConn, - n_spike_buffers*sizeof(int)); - - DeviceRevSpikeInit<<<1,1>>>(d_RevSpikeNum, d_RevSpikeTarget, - d_RevSpikeNConn); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaDeviceSynchronize() ); - } - - CUDAFREECTRL("d_temp_storage",d_temp_storage); - CUDAFREECTRL("d_target_rev_conn_size_64",d_target_rev_conn_size_64); - CUDAFREECTRL("d_target_rev_conn_cumul",d_target_rev_conn_cumul); - - return 0; -} diff --git a/src/rev_spike.h b/src/rev_spike.h index 0e2a319bc..9e5034660 100644 --- a/src/rev_spike.h +++ b/src/rev_spike.h @@ -23,68 +23,97 @@ #ifndef REVSPIKE_H #define REVSPIKE_H -//#include "connect.h" +#include "conn12b.h" +#include "conn16b.h" +#include "connect.h" +#include "get_spike.h" #include "spike_buffer.h" #include "syn_model.h" -#include "get_spike.h" -extern int64_t h_NRevConn; -extern unsigned int *d_RevSpikeNum; -extern unsigned int *d_RevSpikeTarget; -extern int *d_RevSpikeNConn; -extern __device__ unsigned int *RevSpikeTarget; -extern __device__ int64_t **TargetRevConnection; extern __constant__ long long NESTGPUTimeIdx; extern __constant__ float NESTGPUTimeResolution; -__global__ void RevSpikeReset(); +extern __device__ unsigned int* RevSpikeNum; +extern __device__ unsigned int* RevSpikeTarget; +extern __device__ int* RevSpikeNConn; + +extern int64_t* d_RevConnections; //[i] i=0,..., n_rev_conn - 1; +extern __device__ int64_t* RevConnections; + +extern int* d_TargetRevConnectionSize; //[i] i=0,..., n_neuron-1; +extern __device__ int* TargetRevConnectionSize; 
-__global__ void RevSpikeBufferUpdate(unsigned int n_node); +extern int64_t** d_TargetRevConnection; //[i][j] j=0,...,RevConnectionSize[i]-1 +extern __device__ int64_t** TargetRevConnection; -int RevSpikeInit(uint n_spike_buffers); +__global__ void revSpikeReset(); -int RevSpikeFree(); +__global__ void revSpikeBufferUpdate( unsigned int n_node ); -int ResetConnectionSpikeTimeDown(); +int revSpikeFree(); -int ResetConnectionSpikeTimeUp(); +int resetConnectionSpikeTimeDown(); -template -__device__ __forceinline__ void NestedLoopFunction(int i_spike, int i_syn); +int resetConnectionSpikeTimeUp(); ////////////////////////////////////////////////////////////////////// // This is the function called by the nested loop // that makes use of positive post-pre spike time difference -template<> -__device__ __forceinline__ void NestedLoopFunction<1> -(int i_spike, int i_target_rev_conn) +template < class ConnKeyT, class ConnStructT > +__device__ __forceinline__ void +NestedLoopFunction1( int i_spike, int i_target_rev_conn ) { - unsigned int target = RevSpikeTarget[i_spike]; - int64_t i_conn = TargetRevConnection[target][i_target_rev_conn]; - uint i_block = (uint)(i_conn / ConnBlockSize); + unsigned int target = RevSpikeTarget[ i_spike ]; + int64_t i_conn = TargetRevConnection[ target ][ i_target_rev_conn ]; + uint i_block = ( uint ) ( i_conn / ConnBlockSize ); int64_t i_block_conn = i_conn % ConnBlockSize; - connection_struct conn = ConnectionArray[i_block][i_block_conn]; - unsigned char syn_group = conn.target_port_syn & SynMask; - + // connection_struct conn = ConnectionArray[i_block][i_block_conn]; + // unsigned char syn_group = conn.target_port_syn & SynMask; + ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ]; + ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ]; + uint syn_group = getConnSyn< ConnKeyT, ConnStructT >( conn_key, conn_struct ); + // TO BE IMPROVED BY CHECKING IF THE SYNAPSE TYPE OF THE 
GROUP // REQUIRES AN UPDATE BASED ON POST-PRE SPIKE TIME DIFFERENCE - if (syn_group>0) { - unsigned short spike_time_idx = ConnectionSpikeTime[i_conn]; - unsigned short time_idx = (unsigned short)(NESTGPUTimeIdx & 0xffff); + if ( syn_group > 0 ) + { + unsigned short spike_time_idx = ConnectionSpikeTime[ i_conn ]; + unsigned short time_idx = ( unsigned short ) ( NESTGPUTimeIdx & 0xffff ); unsigned short Dt_int = time_idx - spike_time_idx; - //printf("rev spike target %d i_target_rev_conn %d " + // printf("rev spike target %d i_target_rev_conn %d " // "i_conn %lld weight %f syn_group %d " // "TimeIdx %lld CST %d Dt %d\n", // target, i_target_rev_conn, i_conn, conn.weight, syn_group, // NESTGPUTimeIdx, spike_time_idx, Dt_int); - - if (Dt_int +__device__ __forceinline__ void NestedLoopFunction( int i_spike, int i_syn ); + +////////////////////////////////////////////////////////////////////// +// This is the function called by the nested loop +// that makes use of positive post-pre spike time difference. 
+// Include more integer template specializations +// for different connection types +template <> +__device__ __forceinline__ void +NestedLoopFunction< 1 >( int i_spike, int i_target_rev_conn ) +{ + NestedLoopFunction1< conn12b_key, conn12b_struct >( i_spike, i_target_rev_conn ); +} + +template <> +__device__ __forceinline__ void +NestedLoopFunction< 3 >( int i_spike, int i_target_rev_conn ) +{ + NestedLoopFunction1< conn16b_key, conn16b_struct >( i_spike, i_target_rev_conn ); +} + #endif diff --git a/src/rk5.cu b/src/rk5.cu index b08c29bda..54b175acc 100644 --- a/src/rk5.cu +++ b/src/rk5.cu @@ -20,17 +20,13 @@ * */ - - - - -#include -#include -#include +#include "rk5.h" #include +#include #include #include -#include "rk5.h" +#include +#include __constant__ float c2 = 0.2; __constant__ float c3 = 0.3; @@ -38,31 +34,31 @@ __constant__ float c4 = 0.6; __constant__ float c5 = 1.0; __constant__ float c6 = 0.875; __constant__ float a21 = 0.2; -__constant__ float a31 = 3.0/40.0; -__constant__ float a32 = 9.0/40.0; +__constant__ float a31 = 3.0 / 40.0; +__constant__ float a32 = 9.0 / 40.0; __constant__ float a41 = 0.3; __constant__ float a42 = -0.9; __constant__ float a43 = 1.2; -__constant__ float a51 = -11.0/54.0; +__constant__ float a51 = -11.0 / 54.0; __constant__ float a52 = 2.5; -__constant__ float a53 = -70.0/27.0; -__constant__ float a54 = 35.0/27.0; -__constant__ float a61 = 1631.0/55296.0; -__constant__ float a62 = 175.0/512.0; -__constant__ float a63 = 575.0/13824.0; -__constant__ float a64 = 44275.0/110592.0; -__constant__ float a65 = 253.0/4096.0; +__constant__ float a53 = -70.0 / 27.0; +__constant__ float a54 = 35.0 / 27.0; +__constant__ float a61 = 1631.0 / 55296.0; +__constant__ float a62 = 175.0 / 512.0; +__constant__ float a63 = 575.0 / 13824.0; +__constant__ float a64 = 44275.0 / 110592.0; +__constant__ float a65 = 253.0 / 4096.0; -__constant__ float a71 = 37.0/378.0; -__constant__ float a73 = 250.0/621.0; -__constant__ float a74 = 125.0/594.0; 
-__constant__ float a76 = 512.0/1771.0; +__constant__ float a71 = 37.0 / 378.0; +__constant__ float a73 = 250.0 / 621.0; +__constant__ float a74 = 125.0 / 594.0; +__constant__ float a76 = 512.0 / 1771.0; -__constant__ float e1 = 37.0/378.0 - 2825.0/27648.0; -__constant__ float e3 = 250.0/621.0 - 18575.0/48384.0; -__constant__ float e4 = 125.0/594.0 - 13525.0/55296.0; -__constant__ float e5 = -277.00/14336.0; -__constant__ float e6 = 512.0/1771.0 - 0.25; +__constant__ float e1 = 37.0 / 378.0 - 2825.0 / 27648.0; +__constant__ float e3 = 250.0 / 621.0 - 18575.0 / 48384.0; +__constant__ float e4 = 125.0 / 594.0 - 13525.0 / 55296.0; +__constant__ float e5 = -277.00 / 14336.0; +__constant__ float e6 = 512.0 / 1771.0 - 0.25; __constant__ float eps = 1.0e-6; __constant__ float coeff = 0.9; @@ -71,10 +67,12 @@ __constant__ float exp_dec = -0.25; __constant__ float err_min = 1.889568e-4; //(5/coeff)^(1/exp_inc) __constant__ float scal_min = 1.0e-1; -__global__ void SetFloatArray(float *arr, int n_elem, int step, float val) +__global__ void +SetFloatArray( float* arr, int n_elem, int step, float val ) { int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - if (array_idx(b))?(a):(b)) - -__global__ void SetFloatArray(float *arr, int n_elem, int step, float val); - -template -__global__ -void ArrayInit(int array_size, int n_var, int n_param, double *x_arr, - float *h_arr, float *y_arr, float *par_arr, double x_min, - float h, DataStruct data_struct) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) +#define MAX( a, b ) ( ( ( a ) > ( b ) ) ? 
( a ) : ( b ) ) + +__global__ void SetFloatArray( float* arr, int n_elem, int step, float val ); + +template < class DataStruct > +__global__ void +ArrayInit( int array_size, + int n_var, + int n_param, + double* x_arr, + float* h_arr, + float* y_arr, + float* par_arr, + double x_min, + float h, + DataStruct data_struct ) { int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - if (array_idx -__global__ -void ArrayCalibrate(int array_size, int n_var, int n_param, double *x_arr, - float *h_arr, float *y_arr, float *par_arr, double x_min, - float h, DataStruct data_struct) +template < class DataStruct > +__global__ void +ArrayCalibrate( int array_size, + int n_var, + int n_param, + double* x_arr, + float* h_arr, + float* y_arr, + float* par_arr, + double x_min, + float h, + DataStruct data_struct ) { int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - if (array_idx -__device__ -void RK5Step(double &x, float *y, float &h, float h_min, float h_max, - float *param, DataStruct data_struct) +template < int NVAR, int NPARAM, class DataStruct > +__device__ void +RK5Step( double& x, float* y, float& h, float h_min, float h_max, float* param, DataStruct data_struct ) { - float y_new[NVAR]; - float k1[NVAR]; - float k2[NVAR]; - float k3[NVAR]; - float k4[NVAR]; - float k5[NVAR]; - float k6[NVAR]; - float y_scal[NVAR]; - - Derivatives(x, y, k1, param, data_struct); - for (int i=0; i( x, y, k1, param, data_struct ); + for ( int i = 0; i < NVAR; i++ ) + { + y_scal[ i ] = fabs( y[ i ] ) + fabs( k1[ i ] * h ) + scal_min; } - + float err; - for(;;) { - if (h > h_max) h = h_max; - if (h < h_min) h = h_min; - - for (int i=0; i h_max ) + { + h = h_max; + } + if ( h < h_min ) + { + h = h_min; } - Derivatives(x+c2*h, y_new, k2, param, - data_struct); - - for (int i=0; i(x+c3*h, y_new, k3, param, - data_struct); - for (int i=0; i( x + c2 * h, y_new, k2, param, data_struct ); + + for ( int i = 0; i < NVAR; i++ ) + { + y_new[ i ] = y[ i ] + h * ( a31 * k1[ i ] + a32 * k2[ i ] ); } - 
Derivatives(x+c4*h, y_new, k4, param, - data_struct); - - for (int i=0; i( x + c3 * h, y_new, k3, param, data_struct ); + + for ( int i = 0; i < NVAR; i++ ) + { + y_new[ i ] = y[ i ] + h * ( a41 * k1[ i ] + a42 * k2[ i ] + a43 * k3[ i ] ); } - Derivatives(x+c5*h, y_new, k5, param, - data_struct); - - for (int i=0; i( x + c4 * h, y_new, k4, param, data_struct ); + + for ( int i = 0; i < NVAR; i++ ) + { + y_new[ i ] = y[ i ] + h * ( a51 * k1[ i ] + a52 * k2[ i ] + a53 * k3[ i ] + a54 * k4[ i ] ); } - Derivatives(x+c6*h, y_new, k6, param, data_struct); - - for (int i=0; i( x + c5 * h, y_new, k5, param, data_struct ); + + for ( int i = 0; i < NVAR; i++ ) + { + y_new[ i ] = y[ i ] + h * ( a61 * k1[ i ] + a62 * k2[ i ] + a63 * k3[ i ] + a64 * k4[ i ] + a65 * k5[ i ] ); } - + Derivatives< NVAR, NPARAM >( x + c6 * h, y_new, k6, param, data_struct ); + + for ( int i = 0; i < NVAR; i++ ) + { + y_new[ i ] = y[ i ] + h * ( a71 * k1[ i ] + a73 * k3[ i ] + a74 * k4[ i ] + a76 * k6[ i ] ); + } + err = 0.0; - for (int i=0; i err_min) { - h = h*coeff*pow(err,exp_inc); + // if (h <= h_min) { + // h = h_min; + // } + // x_new = x + h; } - else { - h = 5.0*h; + + x += h; + + if ( err > err_min ) + { + h = h * coeff * pow( err, exp_inc ); } - - for (int i=0; i -__device__ -void RK5Update(double &x, float *y, double x1, float &h, float h_min, - float *param, DataStruct data_struct) +template < int NVAR, int NPARAM, class DataStruct > +__device__ void +RK5Update( double& x, float* y, double x1, float& h, float h_min, float* param, DataStruct data_struct ) { - bool end_time_step=false; - while(!end_time_step) { - float hmax=(float)(x1-x); - RK5Step(x, y, h, h_min, hmax, param, - data_struct); - end_time_step = (x >= x1-h_min); - ExternalUpdate(x, y, param, end_time_step, data_struct); + bool end_time_step = false; + while ( !end_time_step ) + { + float hmax = ( float ) ( x1 - x ); + RK5Step< NVAR, NPARAM, DataStruct >( x, y, h, h_min, hmax, param, data_struct ); + end_time_step = ( x >= 
x1 - h_min ); + ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } } -template -__global__ -void ArrayUpdate(int array_size, double *x_arr, float *h_arr, float *y_arr, - float *par_arr, double x1, float h_min, DataStruct data_struct) +template < int NVAR, int NPARAM, class DataStruct > +__global__ void +ArrayUpdate( int array_size, + double* x_arr, + float* h_arr, + float* y_arr, + float* par_arr, + double x1, + float h_min, + DataStruct data_struct ) { int ArrayIdx = threadIdx.x + blockIdx.x * blockDim.x; - if (ArrayIdx(x, y, x1, h, h_min, param, - data_struct); + RK5Update< NVAR, NPARAM, DataStruct >( x, y, x1, h, h_min, param, data_struct ); - x_arr[ArrayIdx] = x; - h_arr[ArrayIdx] = h; - for(int i=0; i +template < class DataStruct > class RungeKutta5 { int array_size_; int n_var_; int n_param_; - - double *d_XArr; - float *d_HArr; - float *d_YArr; - float *d_ParamArr; - public: + double* d_XArr; + float* d_HArr; + float* d_YArr; + float* d_ParamArr; +public: ~RungeKutta5(); - - double *GetXArr() {return d_XArr;} - float *GetHArr() {return d_HArr;} - float *GetYArr() {return d_YArr;} - float *GetParamArr() {return d_ParamArr;} - int Init(int array_size, int n_var, int n_param, double x_min, float h, - DataStruct data_struct); - int Calibrate(double x_min, float h, DataStruct data_struct); - int Free(); + double* + GetXArr() + { + return d_XArr; + } + float* + GetHArr() + { + return d_HArr; + } + float* + GetYArr() + { + return d_YArr; + } + float* + GetParamArr() + { + return d_ParamArr; + } + int Init( int array_size, int n_var, int n_param, double x_min, float h, DataStruct data_struct ); + int Calibrate( double x_min, float h, DataStruct data_struct ); - int GetX(int i_array, int n_elem, double *x); - int GetY(int i_var, int i_array, int n_elem, float *y); - int SetParam(int i_param, int i_array, int n_param, int n_elem, float val); - int SetVectParam(int i_param, int i_array, int n_param, int n_elem, - float *param, int vect_size); 
- template int Update(double x1, float h_min, - DataStruct data_struct); + int Free(); + int GetX( int i_array, int n_elem, double* x ); + int GetY( int i_var, int i_array, int n_elem, float* y ); + int SetParam( int i_param, int i_array, int n_param, int n_elem, float val ); + int SetVectParam( int i_param, int i_array, int n_param, int n_elem, float* param, int vect_size ); + template < int NVAR, int NPARAM > + int Update( double x1, float h_min, DataStruct data_struct ); }; - -template -template - int RungeKutta5::Update(double x1, float h_min, - DataStruct data_struct) +template < class DataStruct > +template < int NVAR, int NPARAM > +int +RungeKutta5< DataStruct >::Update( double x1, float h_min, DataStruct data_struct ) { - ArrayUpdate<<<(array_size_+1023)/1024, 1024>>> - (array_size_, d_XArr, d_HArr, d_YArr, d_ParamArr, x1, h_min, data_struct); - //gpuErrchk( cudaPeekAtLastError() ); - //gpuErrchk( cudaDeviceSynchronize() ); + ArrayUpdate< NVAR, NPARAM, DataStruct > <<< ( array_size_ + 1023 ) / 1024, 1024 >>>( + array_size_, d_XArr, d_HArr, d_YArr, d_ParamArr, x1, h_min, data_struct ); + // gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk( cudaDeviceSynchronize() ); return 0; } -template -RungeKutta5::~RungeKutta5() +template < class DataStruct > +RungeKutta5< DataStruct >::~RungeKutta5() { Free(); } -template -int RungeKutta5::Free() +template < class DataStruct > +int +RungeKutta5< DataStruct >::Free() { - cudaFree(d_XArr); - cudaFree(d_HArr); - cudaFree(d_YArr); - cudaFree(d_ParamArr); + CUDAFREECTRL( "d_XArr", d_XArr ); + CUDAFREECTRL( "d_HArr", d_HArr ); + CUDAFREECTRL( "d_YArr", d_YArr ); + CUDAFREECTRL( "d_ParamArr", d_ParamArr ); return 0; } -template -int RungeKutta5::Init(int array_size, int n_var, int n_param, - double x_min, float h, - DataStruct data_struct) +template < class DataStruct > +int +RungeKutta5< DataStruct >::Init( int array_size, int n_var, int n_param, double x_min, float h, DataStruct data_struct ) { array_size_ = array_size; 
n_var_ = n_var; - n_param_ = n_param; + n_param_ = n_param; - CUDAMALLOCCTRL("&d_XArr",&d_XArr, array_size_*sizeof(double)); - CUDAMALLOCCTRL("&d_HArr",&d_HArr, array_size_*sizeof(float)); - CUDAMALLOCCTRL("&d_YArr",&d_YArr, array_size_*n_var_*sizeof(float)); - CUDAMALLOCCTRL("&d_ParamArr",&d_ParamArr, array_size_*n_param_*sizeof(float)); + CUDAMALLOCCTRL( "&d_XArr", &d_XArr, array_size_ * sizeof( double ) ); + CUDAMALLOCCTRL( "&d_HArr", &d_HArr, array_size_ * sizeof( float ) ); + CUDAMALLOCCTRL( "&d_YArr", &d_YArr, array_size_ * n_var_ * sizeof( float ) ); + CUDAMALLOCCTRL( "&d_ParamArr", &d_ParamArr, array_size_ * n_param_ * sizeof( float ) ); - ArrayInit<<<(array_size+1023)/1024, 1024>>> - (array_size_, n_var, n_param, d_XArr, d_HArr, d_YArr, d_ParamArr, - x_min, h, data_struct); + ArrayInit< DataStruct > <<< ( array_size + 1023 ) / 1024, 1024 >>>( + array_size_, n_var, n_param, d_XArr, d_HArr, d_YArr, d_ParamArr, x_min, h, data_struct ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - + return 0; } -template -int RungeKutta5::Calibrate(double x_min, float h, - DataStruct data_struct) +template < class DataStruct > +int +RungeKutta5< DataStruct >::Calibrate( double x_min, float h, DataStruct data_struct ) { - ArrayCalibrate<<<(array_size_+1023)/1024, 1024>>> - (array_size_, n_var_, n_param_, d_XArr, d_HArr, d_YArr, d_ParamArr, - x_min, h, data_struct); + ArrayCalibrate< DataStruct > <<< ( array_size_ + 1023 ) / 1024, 1024 >>>( + array_size_, n_var_, n_param_, d_XArr, d_HArr, d_YArr, d_ParamArr, x_min, h, data_struct ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); return 0; } -template -int RungeKutta5::GetX(int i_array, int n_elem, double *x) +template < class DataStruct > +int +RungeKutta5< DataStruct >::GetX( int i_array, int n_elem, double* x ) { - cudaMemcpy(x, &d_XArr[i_array], n_elem*sizeof(double), - cudaMemcpyDeviceToHost); + cudaMemcpy( x, &d_XArr[ i_array ], n_elem * sizeof( double ), 
cudaMemcpyDeviceToHost ); return 0; } -template -int RungeKutta5::GetY(int i_var, int i_array, int n_elem, float *y) +template < class DataStruct > +int +RungeKutta5< DataStruct >::GetY( int i_var, int i_array, int n_elem, float* y ) { - cudaMemcpy(y, &d_YArr[i_array*n_var_ + i_var], n_elem*sizeof(float), - cudaMemcpyDeviceToHost); + cudaMemcpy( y, &d_YArr[ i_array * n_var_ + i_var ], n_elem * sizeof( float ), cudaMemcpyDeviceToHost ); return 0; } -template -int RungeKutta5::SetParam(int i_param, int i_array, int n_param, - int n_elem, float val) +template < class DataStruct > +int +RungeKutta5< DataStruct >::SetParam( int i_param, int i_array, int n_param, int n_elem, float val ) { - SetFloatArray<<<(n_elem+1023)/1024, 1024>>> - (&d_ParamArr[i_array*n_param_ + i_param], n_elem, n_param, val); + SetFloatArray<<< ( n_elem + 1023 ) / 1024, 1024 >>>( + &d_ParamArr[ i_array * n_param_ + i_param ], n_elem, n_param, val ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - + return 0; } diff --git a/src/rk5_const.h b/src/rk5_const.h index 4bf94a89f..bdc4284fe 100644 --- a/src/rk5_const.h +++ b/src/rk5_const.h @@ -23,7 +23,6 @@ #ifndef RK5CONST_H #define RK5CONST_H - extern __constant__ float c2; extern __constant__ float c3; extern __constant__ float c4; diff --git a/src/rk5_interface.h b/src/rk5_interface.h index be5e61152..8d5ded7f2 100644 --- a/src/rk5_interface.h +++ b/src/rk5_interface.h @@ -20,15 +20,10 @@ * */ - - - - #ifndef RK5INTERFACE_H #define RK5INTERFACE_H #include "aeif_cond_beta_multisynapse_rk5.h" #include "aeif_psc_exp_multisynapse_rk5.h" - #endif diff --git a/src/scan.cu b/src/scan.cu index 937996eab..64e16cbd9 100644 --- a/src/scan.cu +++ b/src/scan.cu @@ -1,335 +1,376 @@ /* - Matt Dean - https://github.com/mattdean1/cuda + Matt Dean - https://github.com/mattdean1/cuda */ +#include "cuda_error.h" +#include "scan.h" #include #include -#include "scan.h" -#define checkCudaError(o, l) _checkCudaError(o, l, __func__) +#define 
checkCudaError( o, l ) _checkCudaError( o, l, __func__ ) #define SHARED_MEMORY_BANKS 32 #define LOG_MEM_BANKS 5 -#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_MEM_BANKS) +#define CONFLICT_FREE_OFFSET( n ) ( ( n ) >> LOG_MEM_BANKS ) int THREADS_PER_BLOCK = 512; int ELEMENTS_PER_BLOCK = THREADS_PER_BLOCK * 2; -__global__ void prescan_arbitrary(int *output, int *input, int n, int powerOfTwo) +__global__ void +prescan_arbitrary( int* output, int* input, int n, int powerOfTwo ) { - extern __shared__ int temp[];// allocated on invocation - int threadID = threadIdx.x; - - int ai = threadID; - int bi = threadID + (n / 2); - int bankOffsetA = CONFLICT_FREE_OFFSET(ai); - int bankOffsetB = CONFLICT_FREE_OFFSET(bi); - - - if (threadID < n) { - temp[ai + bankOffsetA] = input[ai]; - temp[bi + bankOffsetB] = input[bi]; - } - else { - temp[ai + bankOffsetA] = 0; - temp[bi + bankOffsetB] = 0; - } - - - int offset = 1; - for (int d = powerOfTwo >> 1; d > 0; d >>= 1) // build sum in place up the tree - { - __syncthreads(); - if (threadID < d) - { - int ai = offset * (2 * threadID + 1) - 1; - int bi = offset * (2 * threadID + 2) - 1; - ai += CONFLICT_FREE_OFFSET(ai); - bi += CONFLICT_FREE_OFFSET(bi); - - temp[bi] += temp[ai]; - } - offset *= 2; - } - - if (threadID == 0) { - temp[powerOfTwo - 1 + CONFLICT_FREE_OFFSET(powerOfTwo - 1)] = 0; // clear the last element - } - - for (int d = 1; d < powerOfTwo; d *= 2) // traverse down tree & build scan - { - offset >>= 1; - __syncthreads(); - if (threadID < d) - { - int ai = offset * (2 * threadID + 1) - 1; - int bi = offset * (2 * threadID + 2) - 1; - ai += CONFLICT_FREE_OFFSET(ai); - bi += CONFLICT_FREE_OFFSET(bi); - - int t = temp[ai]; - temp[ai] = temp[bi]; - temp[bi] += t; - } - } - __syncthreads(); - - if (threadID < n) { - output[ai] = temp[ai + bankOffsetA]; - output[bi] = temp[bi + bankOffsetB]; - } -} + extern __shared__ int temp[]; // allocated on invocation + int threadID = threadIdx.x; + + int ai = threadID; + int bi = threadID + 
( n / 2 ); + int bankOffsetA = CONFLICT_FREE_OFFSET( ai ); + int bankOffsetB = CONFLICT_FREE_OFFSET( bi ); + + if ( threadID < n ) + { + temp[ ai + bankOffsetA ] = input[ ai ]; + temp[ bi + bankOffsetB ] = input[ bi ]; + } + else + { + temp[ ai + bankOffsetA ] = 0; + temp[ bi + bankOffsetB ] = 0; + } -__global__ void prescan_arbitrary_unoptimized(int *output, int *input, int n, int powerOfTwo) { - extern __shared__ int temp[];// allocated on invocation - int threadID = threadIdx.x; - - if (threadID < n) { - temp[2 * threadID] = input[2 * threadID]; // load input into shared memory - temp[2 * threadID + 1] = input[2 * threadID + 1]; - } - else { - temp[2 * threadID] = 0; - temp[2 * threadID + 1] = 0; - } - - - int offset = 1; - for (int d = powerOfTwo >> 1; d > 0; d >>= 1) // build sum in place up the tree - { - __syncthreads(); - if (threadID < d) - { - int ai = offset * (2 * threadID + 1) - 1; - int bi = offset * (2 * threadID + 2) - 1; - temp[bi] += temp[ai]; - } - offset *= 2; - } - - if (threadID == 0) { temp[powerOfTwo - 1] = 0; } // clear the last element - - for (int d = 1; d < powerOfTwo; d *= 2) // traverse down tree & build scan - { - offset >>= 1; - __syncthreads(); - if (threadID < d) - { - int ai = offset * (2 * threadID + 1) - 1; - int bi = offset * (2 * threadID + 2) - 1; - int t = temp[ai]; - temp[ai] = temp[bi]; - temp[bi] += t; - } - } - __syncthreads(); - - if (threadID < n) { - output[2 * threadID] = temp[2 * threadID]; // write results to device memory - output[2 * threadID + 1] = temp[2 * threadID + 1]; - } -} + int offset = 1; + for ( int d = powerOfTwo >> 1; d > 0; d >>= 1 ) // build sum in place up the tree + { + __syncthreads(); + if ( threadID < d ) + { + int ai = offset * ( 2 * threadID + 1 ) - 1; + int bi = offset * ( 2 * threadID + 2 ) - 1; + ai += CONFLICT_FREE_OFFSET( ai ); + bi += CONFLICT_FREE_OFFSET( bi ); + + temp[ bi ] += temp[ ai ]; + } + offset *= 2; + } + if ( threadID == 0 ) + { + temp[ powerOfTwo - 1 + CONFLICT_FREE_OFFSET( 
powerOfTwo - 1 ) ] = 0; // clear the last element + } -__global__ void prescan_large(int *output, int *input, int n, int *sums) { - extern __shared__ int temp[]; - - int blockID = blockIdx.x; - int threadID = threadIdx.x; - int blockOffset = blockID * n; - - int ai = threadID; - int bi = threadID + (n / 2); - int bankOffsetA = CONFLICT_FREE_OFFSET(ai); - int bankOffsetB = CONFLICT_FREE_OFFSET(bi); - temp[ai + bankOffsetA] = input[blockOffset + ai]; - temp[bi + bankOffsetB] = input[blockOffset + bi]; - - int offset = 1; - for (int d = n >> 1; d > 0; d >>= 1) // build sum in place up the tree - { - __syncthreads(); - if (threadID < d) - { - int ai = offset * (2 * threadID + 1) - 1; - int bi = offset * (2 * threadID + 2) - 1; - ai += CONFLICT_FREE_OFFSET(ai); - bi += CONFLICT_FREE_OFFSET(bi); - - temp[bi] += temp[ai]; - } - offset *= 2; - } - __syncthreads(); - - - if (threadID == 0) { - sums[blockID] = temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)]; - temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0; - } - - for (int d = 1; d < n; d *= 2) // traverse down tree & build scan - { - offset >>= 1; - __syncthreads(); - if (threadID < d) - { - int ai = offset * (2 * threadID + 1) - 1; - int bi = offset * (2 * threadID + 2) - 1; - ai += CONFLICT_FREE_OFFSET(ai); - bi += CONFLICT_FREE_OFFSET(bi); - - int t = temp[ai]; - temp[ai] = temp[bi]; - temp[bi] += t; - } - } - __syncthreads(); - - output[blockOffset + ai] = temp[ai + bankOffsetA]; - output[blockOffset + bi] = temp[bi + bankOffsetB]; -} + for ( int d = 1; d < powerOfTwo; d *= 2 ) // traverse down tree & build scan + { + offset >>= 1; + __syncthreads(); + if ( threadID < d ) + { + int ai = offset * ( 2 * threadID + 1 ) - 1; + int bi = offset * ( 2 * threadID + 2 ) - 1; + ai += CONFLICT_FREE_OFFSET( ai ); + bi += CONFLICT_FREE_OFFSET( bi ); + + int t = temp[ ai ]; + temp[ ai ] = temp[ bi ]; + temp[ bi ] += t; + } + } + __syncthreads(); -__global__ void prescan_large_unoptimized(int *output, int *input, int n, int *sums) { - int 
blockID = blockIdx.x; - int threadID = threadIdx.x; - int blockOffset = blockID * n; - - extern __shared__ int temp[]; - temp[2 * threadID] = input[blockOffset + (2 * threadID)]; - temp[2 * threadID + 1] = input[blockOffset + (2 * threadID) + 1]; - - int offset = 1; - for (int d = n >> 1; d > 0; d >>= 1) // build sum in place up the tree - { - __syncthreads(); - if (threadID < d) - { - int ai = offset * (2 * threadID + 1) - 1; - int bi = offset * (2 * threadID + 2) - 1; - temp[bi] += temp[ai]; - } - offset *= 2; - } - __syncthreads(); - - - if (threadID == 0) { - sums[blockID] = temp[n - 1]; - temp[n - 1] = 0; - } - - for (int d = 1; d < n; d *= 2) // traverse down tree & build scan - { - offset >>= 1; - __syncthreads(); - if (threadID < d) - { - int ai = offset * (2 * threadID + 1) - 1; - int bi = offset * (2 * threadID + 2) - 1; - int t = temp[ai]; - temp[ai] = temp[bi]; - temp[bi] += t; - } - } - __syncthreads(); - - output[blockOffset + (2 * threadID)] = temp[2 * threadID]; - output[blockOffset + (2 * threadID) + 1] = temp[2 * threadID + 1]; + if ( threadID < n ) + { + output[ ai ] = temp[ ai + bankOffsetA ]; + output[ bi ] = temp[ bi + bankOffsetB ]; + } } +__global__ void +prescan_arbitrary_unoptimized( int* output, int* input, int n, int powerOfTwo ) +{ + extern __shared__ int temp[]; // allocated on invocation + int threadID = threadIdx.x; -__global__ void add(int *output, int length, int *n) { - int blockID = blockIdx.x; - int threadID = threadIdx.x; - int blockOffset = blockID * length; + if ( threadID < n ) + { + temp[ 2 * threadID ] = input[ 2 * threadID ]; // load input into shared memory + temp[ 2 * threadID + 1 ] = input[ 2 * threadID + 1 ]; + } + else + { + temp[ 2 * threadID ] = 0; + temp[ 2 * threadID + 1 ] = 0; + } - output[blockOffset + threadID] += n[blockID]; + int offset = 1; + for ( int d = powerOfTwo >> 1; d > 0; d >>= 1 ) // build sum in place up the tree + { + __syncthreads(); + if ( threadID < d ) + { + int ai = offset * ( 2 * threadID + 
1 ) - 1; + int bi = offset * ( 2 * threadID + 2 ) - 1; + temp[ bi ] += temp[ ai ]; + } + offset *= 2; + } + + if ( threadID == 0 ) + { + temp[ powerOfTwo - 1 ] = 0; + } // clear the last element + + for ( int d = 1; d < powerOfTwo; d *= 2 ) // traverse down tree & build scan + { + offset >>= 1; + __syncthreads(); + if ( threadID < d ) + { + int ai = offset * ( 2 * threadID + 1 ) - 1; + int bi = offset * ( 2 * threadID + 2 ) - 1; + int t = temp[ ai ]; + temp[ ai ] = temp[ bi ]; + temp[ bi ] += t; + } + } + __syncthreads(); + + if ( threadID < n ) + { + output[ 2 * threadID ] = temp[ 2 * threadID ]; // write results to device memory + output[ 2 * threadID + 1 ] = temp[ 2 * threadID + 1 ]; + } } -__global__ void add(int *output, int length, int *n1, int *n2) { - int blockID = blockIdx.x; - int threadID = threadIdx.x; - int blockOffset = blockID * length; +__global__ void +prescan_large( int* output, int* input, int n, int* sums ) +{ + extern __shared__ int temp[]; + + int blockID = blockIdx.x; + int threadID = threadIdx.x; + int blockOffset = blockID * n; + + int ai = threadID; + int bi = threadID + ( n / 2 ); + int bankOffsetA = CONFLICT_FREE_OFFSET( ai ); + int bankOffsetB = CONFLICT_FREE_OFFSET( bi ); + temp[ ai + bankOffsetA ] = input[ blockOffset + ai ]; + temp[ bi + bankOffsetB ] = input[ blockOffset + bi ]; + + int offset = 1; + for ( int d = n >> 1; d > 0; d >>= 1 ) // build sum in place up the tree + { + __syncthreads(); + if ( threadID < d ) + { + int ai = offset * ( 2 * threadID + 1 ) - 1; + int bi = offset * ( 2 * threadID + 2 ) - 1; + ai += CONFLICT_FREE_OFFSET( ai ); + bi += CONFLICT_FREE_OFFSET( bi ); + + temp[ bi ] += temp[ ai ]; + } + offset *= 2; + } + __syncthreads(); + + if ( threadID == 0 ) + { + sums[ blockID ] = temp[ n - 1 + CONFLICT_FREE_OFFSET( n - 1 ) ]; + temp[ n - 1 + CONFLICT_FREE_OFFSET( n - 1 ) ] = 0; + } + + for ( int d = 1; d < n; d *= 2 ) // traverse down tree & build scan + { + offset >>= 1; + __syncthreads(); + if ( threadID < d ) 
+ { + int ai = offset * ( 2 * threadID + 1 ) - 1; + int bi = offset * ( 2 * threadID + 2 ) - 1; + ai += CONFLICT_FREE_OFFSET( ai ); + bi += CONFLICT_FREE_OFFSET( bi ); + + int t = temp[ ai ]; + temp[ ai ] = temp[ bi ]; + temp[ bi ] += t; + } + } + __syncthreads(); - output[blockOffset + threadID] += n1[blockID] + n2[blockID]; + output[ blockOffset + ai ] = temp[ ai + bankOffsetA ]; + output[ blockOffset + bi ] = temp[ bi + bankOffsetB ]; } -void prefix_scan(int *d_out, int *d_in, int length, bool bcao) { - if (length > ELEMENTS_PER_BLOCK) { - scanLargeDeviceArray(d_out, d_in, length, bcao); +__global__ void +prescan_large_unoptimized( int* output, int* input, int n, int* sums ) +{ + int blockID = blockIdx.x; + int threadID = threadIdx.x; + int blockOffset = blockID * n; + + extern __shared__ int temp[]; + temp[ 2 * threadID ] = input[ blockOffset + ( 2 * threadID ) ]; + temp[ 2 * threadID + 1 ] = input[ blockOffset + ( 2 * threadID ) + 1 ]; + + int offset = 1; + for ( int d = n >> 1; d > 0; d >>= 1 ) // build sum in place up the tree + { + __syncthreads(); + if ( threadID < d ) + { + int ai = offset * ( 2 * threadID + 1 ) - 1; + int bi = offset * ( 2 * threadID + 2 ) - 1; + temp[ bi ] += temp[ ai ]; + } + offset *= 2; } - else { - scanSmallDeviceArray(d_out, d_in, length, bcao); + __syncthreads(); + + if ( threadID == 0 ) + { + sums[ blockID ] = temp[ n - 1 ]; + temp[ n - 1 ] = 0; + } + + for ( int d = 1; d < n; d *= 2 ) // traverse down tree & build scan + { + offset >>= 1; + __syncthreads(); + if ( threadID < d ) + { + int ai = offset * ( 2 * threadID + 1 ) - 1; + int bi = offset * ( 2 * threadID + 2 ) - 1; + int t = temp[ ai ]; + temp[ ai ] = temp[ bi ]; + temp[ bi ] += t; + } } + __syncthreads(); + + output[ blockOffset + ( 2 * threadID ) ] = temp[ 2 * threadID ]; + output[ blockOffset + ( 2 * threadID ) + 1 ] = temp[ 2 * threadID + 1 ]; } +__global__ void +add( int* output, int length, int* n ) +{ + int blockID = blockIdx.x; + int threadID = threadIdx.x; + int 
blockOffset = blockID * length; -void scanLargeDeviceArray(int *d_out, int *d_in, int length, bool bcao) { - int remainder = length % (ELEMENTS_PER_BLOCK); - if (remainder == 0) { - scanLargeEvenDeviceArray(d_out, d_in, length, bcao); - } - else { - // perform a large scan on a compatible multiple of elements - int lengthMultiple = length - remainder; - scanLargeEvenDeviceArray(d_out, d_in, lengthMultiple, bcao); + output[ blockOffset + threadID ] += n[ blockID ]; +} - // scan the remaining elements and add the (inclusive) last element of the large scan to this - int *startOfOutputArray = &(d_out[lengthMultiple]); - scanSmallDeviceArray(startOfOutputArray, &(d_in[lengthMultiple]), remainder, bcao); +__global__ void +add( int* output, int length, int* n1, int* n2 ) +{ + int blockID = blockIdx.x; + int threadID = threadIdx.x; + int blockOffset = blockID * length; - add<<<1, remainder>>>(startOfOutputArray, remainder, &(d_in[lengthMultiple - 1]), &(d_out[lengthMultiple - 1])); - } + output[ blockOffset + threadID ] += n1[ blockID ] + n2[ blockID ]; } -void scanSmallDeviceArray(int *d_out, int *d_in, int length, bool bcao) { - int powerOfTwo = nextPowerOfTwo(length); +void +prefix_scan( int* d_out, int* d_in, int length, bool bcao ) +{ + if ( length > ELEMENTS_PER_BLOCK ) + { + scanLargeDeviceArray( d_out, d_in, length, bcao ); + } + else + { + scanSmallDeviceArray( d_out, d_in, length, bcao ); + } +} - if (bcao) { - prescan_arbitrary << <1, (length + 1) / 2, 2 * powerOfTwo * sizeof(int) >> >(d_out, d_in, length, powerOfTwo); - } - else { - prescan_arbitrary_unoptimized<< <1, (length + 1) / 2, 2 * powerOfTwo * sizeof(int) >> >(d_out, d_in, length, powerOfTwo); - } +void +scanLargeDeviceArray( int* d_out, int* d_in, int length, bool bcao ) +{ + int remainder = length % ( ELEMENTS_PER_BLOCK ); + if ( remainder == 0 ) + { + scanLargeEvenDeviceArray( d_out, d_in, length, bcao ); + } + else + { + // perform a large scan on a compatible multiple of elements + int 
lengthMultiple = length - remainder; + scanLargeEvenDeviceArray( d_out, d_in, lengthMultiple, bcao ); + + // scan the remaining elements and add the (inclusive) last element of the + // large scan to this + int* startOfOutputArray = &( d_out[ lengthMultiple ] ); + scanSmallDeviceArray( startOfOutputArray, &( d_in[ lengthMultiple ] ), remainder, bcao ); + + add<<< 1, remainder>>>( + startOfOutputArray, remainder, &( d_in[ lengthMultiple - 1 ] ), &( d_out[ lengthMultiple - 1 ] ) ); + } } -void scanLargeEvenDeviceArray(int *d_out, int *d_in, int length, bool bcao) { - const int blocks = length / ELEMENTS_PER_BLOCK; - const int sharedMemArraySize = ELEMENTS_PER_BLOCK * sizeof(int); - - int *d_sums, *d_incr; - cudaMalloc((void **)&d_sums, blocks * sizeof(int)); - cudaMalloc((void **)&d_incr, blocks * sizeof(int)); - - if (bcao) { - prescan_large<<>>(d_out, d_in, ELEMENTS_PER_BLOCK, d_sums); - } - else { - prescan_large_unoptimized<<>>(d_out, d_in, ELEMENTS_PER_BLOCK, d_sums); - } - - const int sumsArrThreadsNeeded = (blocks + 1) / 2; - if (sumsArrThreadsNeeded > THREADS_PER_BLOCK) { - // perform a large scan on the sums arr - scanLargeDeviceArray(d_incr, d_sums, blocks, bcao); - } - else { - // only need one block to scan sums arr so can use small scan - scanSmallDeviceArray(d_incr, d_sums, blocks, bcao); - } - - add<<>>(d_out, ELEMENTS_PER_BLOCK, d_incr); - - cudaFree(d_sums); - cudaFree(d_incr); +void +scanSmallDeviceArray( int* d_out, int* d_in, int length, bool bcao ) +{ + int powerOfTwo = nextPowerOfTwo( length ); + + if ( bcao ) + { + prescan_arbitrary <<< 1, ( length + 1 ) / 2, 2 * powerOfTwo * sizeof( int ) >>>( d_out, d_in, length, powerOfTwo ); + } + else + { + prescan_arbitrary_unoptimized<<< 1, ( length + 1 ) / 2, 2 * powerOfTwo * sizeof( int ) >>>( + d_out, d_in, length, powerOfTwo ); + } +} + +void +scanLargeEvenDeviceArray( int* d_out, int* d_in, int length, bool bcao ) +{ + const int blocks = length / ELEMENTS_PER_BLOCK; + const int sharedMemArraySize = 
ELEMENTS_PER_BLOCK * sizeof( int ); + + int *d_sums, *d_incr; + CUDAMALLOCCTRL( "&d_sums", ( void** ) &d_sums, blocks * sizeof( int ) ); + CUDAMALLOCCTRL( "&d_sums", ( void** ) &d_incr, blocks * sizeof( int ) ); + + if ( bcao ) + { + prescan_large<<< blocks, THREADS_PER_BLOCK, 2 * sharedMemArraySize>>>( d_out, d_in, ELEMENTS_PER_BLOCK, d_sums ); + } + else + { + prescan_large_unoptimized<<< blocks, THREADS_PER_BLOCK, 2 * sharedMemArraySize>>>( + d_out, d_in, ELEMENTS_PER_BLOCK, d_sums ); + } + + const int sumsArrThreadsNeeded = ( blocks + 1 ) / 2; + if ( sumsArrThreadsNeeded > THREADS_PER_BLOCK ) + { + // perform a large scan on the sums arr + scanLargeDeviceArray( d_incr, d_sums, blocks, bcao ); + } + else + { + // only need one block to scan sums arr so can use small scan + scanSmallDeviceArray( d_incr, d_sums, blocks, bcao ); + } + + add<<< blocks, ELEMENTS_PER_BLOCK>>>( d_out, ELEMENTS_PER_BLOCK, d_incr ); + + CUDAFREECTRL( "d_sums", d_sums ); + CUDAFREECTRL( "d_incr", d_incr ); } // from https://stackoverflow.com/a/12506181 -int nextPowerOfTwo(int x) { - int power = 1; - while (power < x) { - power *= 2; - } - return power; +int +nextPowerOfTwo( int x ) +{ + int power = 1; + while ( power < x ) + { + power *= 2; + } + return power; } diff --git a/src/scan.h b/src/scan.h index 2e63c7017..78960ec2b 100644 --- a/src/scan.h +++ b/src/scan.h @@ -1,31 +1,31 @@ /* - Matt Dean - https://github.com/mattdean1/cuda + Matt Dean - https://github.com/mattdean1/cuda */ #ifndef SCANCUH #define SCANCUH -__global__ void prescan_arbitrary(int *g_odata, int *g_idata, int n, int powerOfTwo); -__global__ void prescan_arbitrary_unoptimized(int *g_odata, int *g_idata, int n, int powerOfTwo); +__global__ void prescan_arbitrary( int* g_odata, int* g_idata, int n, int powerOfTwo ); +__global__ void prescan_arbitrary_unoptimized( int* g_odata, int* g_idata, int n, int powerOfTwo ); -__global__ void prescan_large(int *g_odata, int *g_idata, int n, int* sums); -__global__ void 
prescan_large_unoptimized(int *output, int *input, int n, int *sums); +__global__ void prescan_large( int* g_odata, int* g_idata, int n, int* sums ); +__global__ void prescan_large_unoptimized( int* output, int* input, int n, int* sums ); -__global__ void add(int *output, int length, int *n1); -__global__ void add(int *output, int length, int *n1, int *n2); +__global__ void add( int* output, int length, int* n1 ); +__global__ void add( int* output, int length, int* n1, int* n2 ); -void _checkCudaError(const char *message, cudaError_t err, const char *caller); -void printResult(const char* prefix, int result, long nanoseconds); -void printResult(const char* prefix, int result, float milliseconds); +void _checkCudaError( const char* message, cudaError_t err, const char* caller ); +void printResult( const char* prefix, int result, long nanoseconds ); +void printResult( const char* prefix, int result, float milliseconds ); -bool isPowerOfTwo(int x); -int nextPowerOfTwo(int x); +bool isPowerOfTwo( int x ); +int nextPowerOfTwo( int x ); long get_nanos(); -void prefix_scan(int *output, int *input, int length, bool bcao); -void scanLargeDeviceArray(int *output, int *input, int length, bool bcao); -void scanSmallDeviceArray(int *d_out, int *d_in, int length, bool bcao); -void scanLargeEvenDeviceArray(int *output, int *input, int length, bool bcao); +void prefix_scan( int* output, int* input, int length, bool bcao ); +void scanLargeDeviceArray( int* output, int* input, int length, bool bcao ); +void scanSmallDeviceArray( int* d_out, int* d_in, int length, bool bcao ); +void scanLargeEvenDeviceArray( int* output, int* input, int length, bool bcao ); #endif diff --git a/src/send_spike.cu b/src/send_spike.cu index b4586f1ed..16ba35a40 100644 --- a/src/send_spike.cu +++ b/src/send_spike.cu @@ -20,47 +20,47 @@ * */ - - - - +#include "cuda_error.h" +#include "send_spike.h" #include #include -#include "send_spike.h" -#include "cuda_error.h" -int *d_SpikeNum; -int *d_SpikeSourceIdx; 
-int *d_SpikeConnIdx; -float *d_SpikeHeight; -int *d_SpikeTargetNum; +int* d_SpikeNum; +int* d_SpikeSourceIdx; +int* d_SpikeConnIdx; +float* d_SpikeHeight; +int* d_SpikeTargetNum; __device__ int MaxSpikeNum; -__device__ int *SpikeNum; -__device__ int *SpikeSourceIdx; -__device__ int *SpikeConnIdx; -__device__ float *SpikeHeight; -__device__ int *SpikeTargetNum; +__device__ int* SpikeNum; +__device__ int* SpikeSourceIdx; +__device__ int* SpikeConnIdx; +__device__ float* SpikeHeight; +__device__ int* SpikeTargetNum; -__device__ void SendSpike(int i_source, int i_conn, float height, - int target_num) +__device__ void +SendSpike( int i_source, int i_conn, float height, int target_num ) { - int pos = atomicAdd(SpikeNum, 1); - if (pos>=MaxSpikeNum) { - printf("Number of spikes larger than MaxSpikeNum: %d\n", MaxSpikeNum); + int pos = atomicAdd( SpikeNum, 1 ); + if ( pos >= MaxSpikeNum ) + { + printf( "Number of spikes larger than MaxSpikeNum: %d\n", MaxSpikeNum ); *SpikeNum = MaxSpikeNum; return; } - SpikeSourceIdx[pos] = i_source; - SpikeConnIdx[pos] = i_conn; - SpikeHeight[pos] = height; - SpikeTargetNum[pos] = target_num; + SpikeSourceIdx[ pos ] = i_source; + SpikeConnIdx[ pos ] = i_conn; + SpikeHeight[ pos ] = height; + SpikeTargetNum[ pos ] = target_num; } -__global__ void DeviceSpikeInit(int *spike_num, int *spike_source_idx, - int *spike_conn_idx, float *spike_height, - int *spike_target_num, - int max_spike_num) +__global__ void +DeviceSpikeInit( int* spike_num, + int* spike_source_idx, + int* spike_conn_idx, + float* spike_height, + int* spike_target_num, + int max_spike_num ) { SpikeNum = spike_num; SpikeSourceIdx = spike_source_idx; @@ -71,23 +71,24 @@ __global__ void DeviceSpikeInit(int *spike_num, int *spike_source_idx, *SpikeNum = 0; } - -void SpikeInit(int max_spike_num) +void +SpikeInit( int max_spike_num ) { - //h_SpikeTargetNum = new int[PrefixScan::AllocSize]; + // h_SpikeTargetNum = new int[PrefixScan::AllocSize]; - 
CUDAMALLOCCTRL("&d_SpikeNum",&d_SpikeNum, sizeof(int)); - CUDAMALLOCCTRL("&d_SpikeSourceIdx",&d_SpikeSourceIdx, max_spike_num*sizeof(int)); - CUDAMALLOCCTRL("&d_SpikeConnIdx",&d_SpikeConnIdx, max_spike_num*sizeof(int)); - CUDAMALLOCCTRL("&d_SpikeHeight",&d_SpikeHeight, max_spike_num*sizeof(float)); - CUDAMALLOCCTRL("&d_SpikeTargetNum",&d_SpikeTargetNum, max_spike_num*sizeof(int)); - //printf("here: SpikeTargetNum size: %d", max_spike_num); - DeviceSpikeInit<<<1,1>>>(d_SpikeNum, d_SpikeSourceIdx, d_SpikeConnIdx, - d_SpikeHeight, d_SpikeTargetNum, max_spike_num); + CUDAMALLOCCTRL( "&d_SpikeNum", &d_SpikeNum, sizeof( int ) ); + CUDAMALLOCCTRL( "&d_SpikeSourceIdx", &d_SpikeSourceIdx, max_spike_num * sizeof( int ) ); + CUDAMALLOCCTRL( "&d_SpikeConnIdx", &d_SpikeConnIdx, max_spike_num * sizeof( int ) ); + CUDAMALLOCCTRL( "&d_SpikeHeight", &d_SpikeHeight, max_spike_num * sizeof( float ) ); + CUDAMALLOCCTRL( "&d_SpikeTargetNum", &d_SpikeTargetNum, max_spike_num * sizeof( int ) ); + // printf("here: SpikeTargetNum size: %d", max_spike_num); + DeviceSpikeInit<<< 1, 1 >>>( + d_SpikeNum, d_SpikeSourceIdx, d_SpikeConnIdx, d_SpikeHeight, d_SpikeTargetNum, max_spike_num ); gpuErrchk( cudaPeekAtLastError() ); } -__global__ void SpikeReset() +__global__ void +SpikeReset() { *SpikeNum = 0; } diff --git a/src/send_spike.h b/src/send_spike.h index 39e41055f..c7a97eb3a 100644 --- a/src/send_spike.h +++ b/src/send_spike.h @@ -20,34 +20,32 @@ * */ - - - - #ifndef SENDSPIKE_H #define SENDSPIKE_H -extern int *d_SpikeNum; -extern int *d_SpikeSourceIdx; -extern int *d_SpikeConnIdx; -extern float *d_SpikeHeight; -extern int *d_SpikeTargetNum; +extern int* d_SpikeNum; +extern int* d_SpikeSourceIdx; +extern int* d_SpikeConnIdx; +extern float* d_SpikeHeight; +extern int* d_SpikeTargetNum; extern __device__ int MaxSpikeNum; -extern __device__ int *SpikeNum; -extern __device__ int *SpikeSourceIdx; -extern __device__ int *SpikeConnIdx; -extern __device__ float *SpikeHeight; -extern __device__ int 
*SpikeTargetNum; - -__global__ void DeviceSpikeInit(int *spike_num, int *spike_source_idx, - int *spike_conn_idx, float *spike_height, - int *spike_target_num, int max_spike_num); - -__device__ void SendSpike(int i_source, int i_conn, float height, - int target_num); - -void SpikeInit(int max_spike_num); +extern __device__ int* SpikeNum; +extern __device__ int* SpikeSourceIdx; +extern __device__ int* SpikeConnIdx; +extern __device__ float* SpikeHeight; +extern __device__ int* SpikeTargetNum; + +__global__ void DeviceSpikeInit( int* spike_num, + int* spike_source_idx, + int* spike_conn_idx, + float* spike_height, + int* spike_target_num, + int max_spike_num ); + +__device__ void SendSpike( int i_source, int i_conn, float height, int target_num ); + +void SpikeInit( int max_spike_num ); __global__ void SpikeReset(); diff --git a/src/spike_buffer.cu b/src/spike_buffer.cu index bbadfffe0..9098cd52e 100644 --- a/src/spike_buffer.cu +++ b/src/spike_buffer.cu @@ -20,7 +20,7 @@ * */ -//#define OPTIMIZE_FOR_MEMORY +// #define OPTIMIZE_FOR_MEMORY #include #include @@ -29,11 +29,11 @@ #include "cuda_error.h" #include "spike_buffer.h" -//#include "connect.h" -#include "send_spike.h" -#include "node_group.h" +// #include "connect.h" #include "connect.h" +#include "node_group.h" #include "remote_spike.h" +#include "send_spike.h" #define LAST_SPIKE_TIME_GUARD 0x70000000 @@ -41,7 +41,7 @@ extern __constant__ double NESTGPUTime; extern __constant__ long long NESTGPUTimeIdx; extern __constant__ float NESTGPUTimeResolution; extern __constant__ NodeGroupStruct NodeGroupArray[]; -extern __device__ int16_t *NodeGroupMap; +extern __device__ int16_t* NodeGroupMap; __constant__ bool ExternalSpikeFlag; __device__ int MaxSpikeBufferSize; @@ -49,332 +49,334 @@ __device__ int NSpikeBuffer; __device__ int MaxDelayNum; int h_NSpikeBuffer; -bool ConnectionSpikeTimeFlag; - -float *d_LastSpikeHeight; // [NSpikeBuffer]; -__device__ float *LastSpikeHeight; // -long long *d_LastSpikeTimeIdx; // 
[NSpikeBuffer]; -__device__ long long *LastSpikeTimeIdx; // +float* d_LastSpikeHeight; // [NSpikeBuffer]; +__device__ float* LastSpikeHeight; // -long long *d_LastRevSpikeTimeIdx; // [NSpikeBuffer]; -__device__ long long *LastRevSpikeTimeIdx; // +long long* d_LastSpikeTimeIdx; // [NSpikeBuffer]; +__device__ long long* LastSpikeTimeIdx; // -unsigned short *d_ConnectionSpikeTime; // [NConnection]; -__device__ unsigned short *ConnectionSpikeTime; // +long long* d_LastRevSpikeTimeIdx; // [NSpikeBuffer]; +__device__ long long* LastRevSpikeTimeIdx; // extern __constant__ int n_local_nodes; ////////////////////////////////////////////////////////////////////// -int *d_SpikeBufferSize; // [NSpikeBuffer]; -__device__ int *SpikeBufferSize; // [NSpikeBuffer]; +int* d_SpikeBufferSize; // [NSpikeBuffer]; +__device__ int* SpikeBufferSize; // [NSpikeBuffer]; // SpikeBufferSize[i_spike_buffer]; // where i_spike_buffer is the source node index // number of spikes stored in the buffer -int *d_SpikeBufferIdx0; // [NSpikeBuffer]; -__device__ int *SpikeBufferIdx0; // [NSpikeBuffer]; +int* d_SpikeBufferIdx0; // [NSpikeBuffer]; +__device__ int* SpikeBufferIdx0; // [NSpikeBuffer]; // SpikeBufferIdx0[i_spike_buffer]; // where i_spike_buffer is the source node index // index of most recent spike stored in the buffer -int *d_SpikeBufferTimeIdx; // [NSpikeBuffer*MaxSpikeBufferNum]; -__device__ int *SpikeBufferTimeIdx; // [NSpikeBuffer*MaxSpikeBufferNum]; +int* d_SpikeBufferTimeIdx; // [NSpikeBuffer*MaxSpikeBufferNum]; +__device__ int* SpikeBufferTimeIdx; // [NSpikeBuffer*MaxSpikeBufferNum]; // SpikeBufferTimeIdx[i_spike*NSpikeBuffer+i_spike_buffer]; // time index of the spike -int *d_SpikeBufferConnIdx; // [NSpikeBuffer*MaxSpikeBufferNum]; -__device__ int *SpikeBufferConnIdx; // [NSpikeBuffer*MaxSpikeBufferNum]; +int* d_SpikeBufferConnIdx; // [NSpikeBuffer*MaxSpikeBufferNum]; +__device__ int* SpikeBufferConnIdx; // [NSpikeBuffer*MaxSpikeBufferNum]; // 
SpikeBufferConnIdx[i_spike*NSpikeBuffer+i_spike_buffer]; // index of the next connection group that will emit this spike -float *d_SpikeBufferHeight; // [NSpikeBuffer*MaxSpikeBufferNum]; -__device__ float *SpikeBufferHeight; // [NSpikeBuffer*MaxSpikeBufferNum]; +float* d_SpikeBufferHeight; // [NSpikeBuffer*MaxSpikeBufferNum]; +__device__ float* SpikeBufferHeight; // [NSpikeBuffer*MaxSpikeBufferNum]; // SpikeBufferHeight[i_spike*NSpikeBuffer+i_spike_buffer]; // spike height - //////////////////////////////////////////////////////////// // push a new spike in spike buffer of a node //////////////////////////////////////////////////////////// // i_spike_buffer : node index // height: spike multiplicity //////////////////////////////////////////////////////////// -__device__ void PushSpike(int i_spike_buffer, float height) +__device__ void +PushSpike( int i_spike_buffer, float height ) { int den_delay_idx = 0; - if (i_spike_buffer=max_n_rec_spike_times-1) { - printf("Maximum number of recorded spike times exceeded" - " for spike buffer %d\n", i_spike_buffer); + int max_n_rec_spike_times = NodeGroupArray[ i_group ].max_n_rec_spike_times_; + if ( max_n_rec_spike_times != 0 ) + { + int i_node_rel = i_spike_buffer - NodeGroupArray[ i_group ].i_node_0_; + int n_rec_spike_times = NodeGroupArray[ i_group ].n_rec_spike_times_[ i_node_rel ]; + if ( n_rec_spike_times >= max_n_rec_spike_times - 1 ) + { + printf( + "Maximum number of recorded spike times exceeded" + " for spike buffer %d\n", + i_spike_buffer ); } - else { // record spike time - NodeGroupArray[i_group].rec_spike_times_ - [i_node_rel*max_n_rec_spike_times + n_rec_spike_times] - = NESTGPUTime; - NodeGroupArray[i_group].n_rec_spike_times_[i_node_rel]++; + else + { // record spike time + NodeGroupArray[ i_group ].rec_spike_times_[ i_node_rel * max_n_rec_spike_times + n_rec_spike_times ] = + NESTGPUTime; + NodeGroupArray[ i_group ].n_rec_spike_times_[ i_node_rel ]++; } } } // spike should be stored if there are output 
connections // or if dendritic delay is > 0 - uint conn_group_num = ConnGroupIdx0[i_spike_buffer + 1] - - ConnGroupIdx0[i_spike_buffer]; - if (conn_group_num>0 || den_delay_idx>0) { - int Ns = SpikeBufferSize[i_spike_buffer]; // n. of spikes in buffer - if (Ns>=MaxSpikeBufferSize) { - printf("Maximum number of spikes in spike buffer exceeded" - " for spike buffer %d\n", i_spike_buffer); - //exit(0); + uint conn_group_num = ConnGroupIdx0[ i_spike_buffer + 1 ] - ConnGroupIdx0[ i_spike_buffer ]; + if ( conn_group_num > 0 || den_delay_idx > 0 ) + { + int Ns = SpikeBufferSize[ i_spike_buffer ]; // n. of spikes in buffer + if ( Ns >= MaxSpikeBufferSize ) + { + printf( + "Maximum number of spikes in spike buffer exceeded" + " for spike buffer %d\n", + i_spike_buffer ); + // exit(0); return; } /////////////////////////////////// // push_front new spike in buffer ////////////////////////////////// - SpikeBufferSize[i_spike_buffer]++; // increase n. of spikes in buffer + SpikeBufferSize[ i_spike_buffer ]++; // increase n. of spikes in buffer // the index of the most recent spike is0 should be decreased by 1 - int is0 = (SpikeBufferIdx0[i_spike_buffer] + MaxSpikeBufferSize - 1) - % MaxSpikeBufferSize; - SpikeBufferIdx0[i_spike_buffer] = is0; - int i_arr = is0*NSpikeBuffer+i_spike_buffer; // spike index in array - SpikeBufferTimeIdx[i_arr] = 0; // time index is initialized to 0 - SpikeBufferConnIdx[i_arr] = 0; // connect. group index is initialized to 0 - SpikeBufferHeight[i_arr] = height; // spike multiplicity + int is0 = ( SpikeBufferIdx0[ i_spike_buffer ] + MaxSpikeBufferSize - 1 ) % MaxSpikeBufferSize; + SpikeBufferIdx0[ i_spike_buffer ] = is0; + int i_arr = is0 * NSpikeBuffer + i_spike_buffer; // spike index in array + SpikeBufferTimeIdx[ i_arr ] = 0; // time index is initialized to 0 + SpikeBufferConnIdx[ i_arr ] = 0; // connect. 
group index is initialized to 0 + SpikeBufferHeight[ i_arr ] = height; // spike multiplicity } } //////////////////////////////////////////////////////////// // Update spike buffer of a node //////////////////////////////////////////////////////////// -__global__ void SpikeBufferUpdate() +__global__ void +SpikeBufferUpdate() { int i_spike_buffer = threadIdx.x + blockIdx.x * blockDim.x; - if (i_spike_buffer>=NSpikeBuffer) return; + if ( i_spike_buffer >= NSpikeBuffer ) + { + return; + } int den_delay_idx = 0; - if (i_spike_buffer=ConnGroupNum[i_spike_buffer] - if (is==Ns-1 && i_conn>=conn_group_num - && spike_time_idx+1>=den_delay_idx) { + // if (is==Ns-1 && i_conn>=ConnGroupNum[i_spike_buffer] + if ( is == Ns - 1 && i_conn >= conn_group_num && spike_time_idx + 1 >= den_delay_idx ) + { // in this case we don't need any more to keep track of the oldest spike - SpikeBufferSize[i_spike_buffer]--; // so remove it from buffer + SpikeBufferSize[ i_spike_buffer ]--; // so remove it from buffer } - else { - SpikeBufferTimeIdx[i_arr]++; + else + { + SpikeBufferTimeIdx[ i_arr ]++; // increase time index } } - - if (rev_spike) { - LastRevSpikeTimeIdx[i_spike_buffer] = NESTGPUTimeIdx+1; + + if ( rev_spike ) + { + LastRevSpikeTimeIdx[ i_spike_buffer ] = NESTGPUTimeIdx + 1; } } -__global__ void InitLastSpikeTimeIdx(unsigned int n_spike_buffers, - int spike_time_idx) +__global__ void +InitLastSpikeTimeIdx( unsigned int n_spike_buffers, int spike_time_idx ) { unsigned int i_spike_buffer = threadIdx.x + blockIdx.x * blockDim.x; - if (i_spike_buffer>=n_spike_buffers) { + if ( i_spike_buffer >= n_spike_buffers ) + { return; } - LastSpikeTimeIdx[i_spike_buffer] = spike_time_idx; - LastRevSpikeTimeIdx[i_spike_buffer] = spike_time_idx; + LastSpikeTimeIdx[ i_spike_buffer ] = spike_time_idx; + LastRevSpikeTimeIdx[ i_spike_buffer ] = spike_time_idx; } - -int SpikeBufferInit(uint n_spike_buffers, int max_spike_buffer_size) +int +spikeBufferInit( uint n_spike_buffers, int 
max_spike_buffer_size, int max_delay_num ) { - //unsigned int n_spike_buffers = net_connection->connection_.size(); + // unsigned int n_spike_buffers = net_connection->connection_.size(); h_NSpikeBuffer = n_spike_buffers; - int max_delay_num = h_MaxDelayNum; - //printf("mdn: %d\n", max_delay_num); - - CUDAMALLOCCTRL("&d_LastSpikeTimeIdx",&d_LastSpikeTimeIdx, n_spike_buffers*sizeof(long long)); - CUDAMALLOCCTRL("&d_LastSpikeHeight",&d_LastSpikeHeight, n_spike_buffers*sizeof(float)); - CUDAMALLOCCTRL("&d_LastRevSpikeTimeIdx",&d_LastRevSpikeTimeIdx, n_spike_buffers - *sizeof(long long)); - - CUDAMALLOCCTRL("&d_SpikeBufferSize",&d_SpikeBufferSize, n_spike_buffers*sizeof(int)); - CUDAMALLOCCTRL("&d_SpikeBufferIdx0",&d_SpikeBufferIdx0, n_spike_buffers*sizeof(int)); - CUDAMALLOCCTRL("&d_SpikeBufferTimeIdx",&d_SpikeBufferTimeIdx, - n_spike_buffers*max_spike_buffer_size*sizeof(int)); - CUDAMALLOCCTRL("&d_SpikeBufferConnIdx",&d_SpikeBufferConnIdx, - n_spike_buffers*max_spike_buffer_size*sizeof(int)); - CUDAMALLOCCTRL("&d_SpikeBufferHeight",&d_SpikeBufferHeight, - n_spike_buffers*max_spike_buffer_size*sizeof(float)); - gpuErrchk(cudaMemsetAsync(d_SpikeBufferSize, 0, n_spike_buffers*sizeof(int))); - gpuErrchk(cudaMemsetAsync(d_SpikeBufferIdx0, 0, n_spike_buffers*sizeof(int))); - - if (ConnectionSpikeTimeFlag){ - //h_conn_spike_time = new unsigned short[n_conn]; - CUDAMALLOCCTRL("&d_ConnectionSpikeTime",&d_ConnectionSpikeTime, - NConn*sizeof(unsigned short)); - //gpuErrchk(cudaMemset(d_ConnectionSpikeTime, 0, - // n_conn*sizeof(unsigned short))); - } + // int max_delay_num = h_MaxDelayNum; + // printf("mdn: %d\n", max_delay_num); - /* - if(ConnectionSpikeTimeFlag) { - cudaMemcpyAsync(d_ConnectionGroupTargetSpikeTime, - h_ConnectionGroupTargetSpikeTime, - n_spike_buffers*max_delay_num*sizeof(unsigned short*), - cudaMemcpyHostToDevice); - } - */ - - DeviceSpikeBufferInit<<<1,1>>>(n_spike_buffers, max_delay_num, - max_spike_buffer_size, - d_LastSpikeTimeIdx, d_LastSpikeHeight, - 
d_ConnectionSpikeTime, - d_SpikeBufferSize, d_SpikeBufferIdx0, - d_SpikeBufferTimeIdx, - d_SpikeBufferConnIdx, d_SpikeBufferHeight, - d_LastRevSpikeTimeIdx - ); + CUDAMALLOCCTRL( "&d_LastSpikeTimeIdx", &d_LastSpikeTimeIdx, n_spike_buffers * sizeof( long long ) ); + CUDAMALLOCCTRL( "&d_LastSpikeHeight", &d_LastSpikeHeight, n_spike_buffers * sizeof( float ) ); + CUDAMALLOCCTRL( "&d_LastRevSpikeTimeIdx", &d_LastRevSpikeTimeIdx, n_spike_buffers * sizeof( long long ) ); + + CUDAMALLOCCTRL( "&d_SpikeBufferSize", &d_SpikeBufferSize, n_spike_buffers * sizeof( int ) ); + CUDAMALLOCCTRL( "&d_SpikeBufferIdx0", &d_SpikeBufferIdx0, n_spike_buffers * sizeof( int ) ); + CUDAMALLOCCTRL( + "&d_SpikeBufferTimeIdx", &d_SpikeBufferTimeIdx, n_spike_buffers * max_spike_buffer_size * sizeof( int ) ); + CUDAMALLOCCTRL( + "&d_SpikeBufferConnIdx", &d_SpikeBufferConnIdx, n_spike_buffers * max_spike_buffer_size * sizeof( int ) ); + CUDAMALLOCCTRL( + "&d_SpikeBufferHeight", &d_SpikeBufferHeight, n_spike_buffers * max_spike_buffer_size * sizeof( float ) ); + gpuErrchk( cudaMemsetAsync( d_SpikeBufferSize, 0, n_spike_buffers * sizeof( int ) ) ); + gpuErrchk( cudaMemsetAsync( d_SpikeBufferIdx0, 0, n_spike_buffers * sizeof( int ) ) ); + + DeviceSpikeBufferInit<<< 1, 1 >>>( n_spike_buffers, + max_delay_num, + max_spike_buffer_size, + d_LastSpikeTimeIdx, + d_LastSpikeHeight, + d_SpikeBufferSize, + d_SpikeBufferIdx0, + d_SpikeBufferTimeIdx, + d_SpikeBufferConnIdx, + d_SpikeBufferHeight, + d_LastRevSpikeTimeIdx ); gpuErrchk( cudaPeekAtLastError() ); - - InitLastSpikeTimeIdx - <<<(n_spike_buffers+1023)/1024, 1024>>> - (n_spike_buffers, LAST_SPIKE_TIME_GUARD); + + InitLastSpikeTimeIdx <<< ( n_spike_buffers + 1023 ) / 1024, 1024 >>>( n_spike_buffers, LAST_SPIKE_TIME_GUARD ); gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk(cudaMemsetAsync(d_LastSpikeHeight, 0, - n_spike_buffers*sizeof(unsigned short))); - + gpuErrchk( cudaMemsetAsync( d_LastSpikeHeight, 0, n_spike_buffers * sizeof( unsigned short ) ) ); + 
return 0; } -__global__ void DeviceSpikeBufferInit(int n_spike_buffers, int max_delay_num, - int max_spike_buffer_size, - long long *last_spike_time_idx, - float *last_spike_height, - unsigned short *conn_spike_time, - int *spike_buffer_size, int *spike_buffer_idx0, - int *spike_buffer_time, - int *spike_buffer_conn, - float *spike_buffer_height, - long long *last_rev_spike_time_idx) +__global__ void +DeviceSpikeBufferInit( int n_spike_buffers, + int max_delay_num, + int max_spike_buffer_size, + long long* last_spike_time_idx, + float* last_spike_height, + int* spike_buffer_size, + int* spike_buffer_idx0, + int* spike_buffer_time, + int* spike_buffer_conn, + float* spike_buffer_height, + long long* last_rev_spike_time_idx ) { NSpikeBuffer = n_spike_buffers; MaxDelayNum = max_delay_num; MaxSpikeBufferSize = max_spike_buffer_size; LastSpikeTimeIdx = last_spike_time_idx; LastSpikeHeight = last_spike_height; - ConnectionSpikeTime = conn_spike_time; SpikeBufferSize = spike_buffer_size; SpikeBufferIdx0 = spike_buffer_idx0; SpikeBufferTimeIdx = spike_buffer_time; diff --git a/src/spike_buffer.h b/src/spike_buffer.h index d7af16276..f8c6c6bcc 100644 --- a/src/spike_buffer.h +++ b/src/spike_buffer.h @@ -20,13 +20,9 @@ * */ - - - - #ifndef SPIKEBUFFER_H #define SPIKEBUFFER_H -//#include "connect.h" +// #include "connect.h" extern __constant__ bool ExternalSpikeFlag; extern __device__ int MaxSpikeBufferSize; @@ -34,57 +30,55 @@ extern __device__ int NSpikeBuffer; extern __device__ int MaxDelayNum; extern int h_NSpikeBuffer; -extern bool ConnectionSpikeTimeFlag; - -extern float *d_LastSpikeHeight; // [NSpikeBuffer]; -extern __device__ float *LastSpikeHeight; // -extern long long *d_LastSpikeTimeIdx; // [NSpikeBuffer]; -extern __device__ long long *LastSpikeTimeIdx; // +extern float* d_LastSpikeHeight; // [NSpikeBuffer]; +extern __device__ float* LastSpikeHeight; // -extern long long *d_LastRevSpikeTimeIdx; // [NSpikeBuffer]; -extern __device__ long long *LastRevSpikeTimeIdx; 
// +extern long long* d_LastSpikeTimeIdx; // [NSpikeBuffer]; +extern __device__ long long* LastSpikeTimeIdx; // -extern unsigned short *d_ConnectionSpikeTime; // [NConnection]; -extern __device__ unsigned short *ConnectionSpikeTime; // +extern long long* d_LastRevSpikeTimeIdx; // [NSpikeBuffer]; +extern __device__ long long* LastRevSpikeTimeIdx; // +extern unsigned short* d_ConnectionSpikeTime; // [NConnection]; +extern __device__ unsigned short* ConnectionSpikeTime; // -extern int *d_SpikeBufferSize; -extern __device__ int *SpikeBufferSize; +extern int* d_SpikeBufferSize; +extern __device__ int* SpikeBufferSize; // number of spikes stored in the buffer -extern int *d_SpikeBufferIdx0; -extern __device__ int *SpikeBufferIdx0; +extern int* d_SpikeBufferIdx0; +extern __device__ int* SpikeBufferIdx0; // index of most recent spike stored in the buffer -extern int *d_SpikeBufferTimeIdx; -extern __device__ int *SpikeBufferTimeIdx; +extern int* d_SpikeBufferTimeIdx; +extern __device__ int* SpikeBufferTimeIdx; // time index of the spike -extern int *d_SpikeBufferConnIdx; -extern __device__ int *SpikeBufferConnIdx; +extern int* d_SpikeBufferConnIdx; +extern __device__ int* SpikeBufferConnIdx; // index of the next connection group that will emit this spike -extern float *d_SpikeBufferHeight; -extern __device__ float *SpikeBufferHeight; +extern float* d_SpikeBufferHeight; +extern __device__ float* SpikeBufferHeight; // spike height - -__device__ void PushSpike(int i_spike_buffer, float height); +__device__ void PushSpike( int i_spike_buffer, float height ); __global__ void SpikeBufferUpdate(); -__global__ void DeviceSpikeBufferInit(int n_spike_buffers, int max_delay_num, - int max_spike_buffer_size, - long long *last_spike_time_idx, - float *last_spike_height, - unsigned short *conn_spike_time, - int *spike_buffer_size, int *spike_buffer_idx0, - int *spike_buffer_time, - int *spike_buffer_conn, - float *spike_buffer_height, - long long *last_rev_spike_time_idx); - -int 
SpikeBufferInit(uint n_spike_buffers, int max_spike_buffer_size); +__global__ void DeviceSpikeBufferInit( int n_spike_buffers, + int max_delay_num, + int max_spike_buffer_size, + long long* last_spike_time_idx, + float* last_spike_height, + int* spike_buffer_size, + int* spike_buffer_idx0, + int* spike_buffer_time, + int* spike_buffer_conn, + float* spike_buffer_height, + long long* last_rev_spike_time_idx ); + +int spikeBufferInit( uint n_spike_buffers, int max_spike_buffer_size, int max_delay_num ); #endif diff --git a/src/spike_detector.cu b/src/spike_detector.cu index 6fddf83e8..045a0d319 100644 --- a/src/spike_detector.cu +++ b/src/spike_detector.cu @@ -20,58 +20,52 @@ * */ - - - - -#include #include +#include #include #include -//#include +// #include #include "cuda_error.h" #include "nestgpu.h" #include "neuron_models.h" #include "spike_detector.h" - //#include "spike_buffer.h" -//#include "parrot_neuron_variables.h" +// #include "spike_buffer.h" +// #include "parrot_neuron_variables.h" -enum { - i_spike_detector_hold_spike_height=0, +enum +{ + i_spike_detector_hold_spike_height = 0, N_SPIKE_DETECTOR_SCAL_PARAM }; -const std::string spike_detector_scal_param_name[N_SPIKE_DETECTOR_SCAL_PARAM] -= {"hold_spike_height"}; +const std::string spike_detector_scal_param_name[ N_SPIKE_DETECTOR_SCAL_PARAM ] = { "hold_spike_height" }; -enum { - i_spike_detector_input_spike_height=0, +enum +{ + i_spike_detector_input_spike_height = 0, i_spike_detector_spike_height, N_SPIKE_DETECTOR_SCAL_VAR }; -const std::string spike_detector_scal_var_name[N_SPIKE_DETECTOR_SCAL_VAR] -= {"input_spike_height", "spike_height"}; - +const std::string spike_detector_scal_var_name[ N_SPIKE_DETECTOR_SCAL_VAR ] = { "input_spike_height", "spike_height" }; -__global__ -void spike_detector_UpdateKernel(int i_node_0, int n_node, float *var_arr, - float *param_arr, int n_var, int n_param) +__global__ void +spike_detector_UpdateKernel( int i_node_0, int n_node, float* var_arr, float* param_arr, int 
n_var, int n_param ) { int irel_node = threadIdx.x + blockIdx.x * blockDim.x; - if (irel_node < n_node) { - float *input_spike_height_pt = var_arr + irel_node*n_var - + i_spike_detector_input_spike_height; - float *spike_height_pt = var_arr + irel_node*n_var - + i_spike_detector_spike_height; - float *hold_spike_height_pt = param_arr + irel_node*n_param + - i_spike_detector_hold_spike_height; - //int i_node = i_node_0 + irel_node; + if ( irel_node < n_node ) + { + float* input_spike_height_pt = var_arr + irel_node * n_var + i_spike_detector_input_spike_height; + float* spike_height_pt = var_arr + irel_node * n_var + i_spike_detector_spike_height; + float* hold_spike_height_pt = param_arr + irel_node * n_param + i_spike_detector_hold_spike_height; + // int i_node = i_node_0 + irel_node; float spike_height = *input_spike_height_pt; - if (spike_height != 0.0) { - if (*hold_spike_height_pt==0.0) { - spike_height = 1.0; + if ( spike_height != 0.0 ) + { + if ( *hold_spike_height_pt == 0.0 ) + { + spike_height = 1.0; } *input_spike_height_pt = 0; } @@ -79,68 +73,69 @@ void spike_detector_UpdateKernel(int i_node_0, int n_node, float *var_arr, } } - -int spike_detector::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +spike_detector::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_spike_detector_model; n_scal_var_ = N_SPIKE_DETECTOR_SCAL_VAR; n_var_ = n_scal_var_; scal_var_name_ = spike_detector_scal_var_name; - + n_scal_param_ = N_SPIKE_DETECTOR_SCAL_PARAM; n_param_ = n_scal_param_; scal_param_name_ = spike_detector_scal_param_name; - CUDAMALLOCCTRL("&var_arr_",&var_arr_, n_node_*n_var_*sizeof(float)); + CUDAMALLOCCTRL( "&var_arr_", &var_arr_, n_node_ * n_var_ * sizeof( float ) ); - CUDAMALLOCCTRL("¶m_arr_",¶m_arr_, n_node_*n_param_*sizeof(float)); + CUDAMALLOCCTRL( "¶m_arr_", ¶m_arr_, n_node_ * 
n_param_ * sizeof( float ) ); - SetScalParam(0, n_node, "hold_spike_height", 1.0); + SetScalParam( 0, n_node, "hold_spike_height", 1.0 ); - SetScalVar(0, n_node, "input_spike_height", 0.0); + SetScalVar( 0, n_node, "input_spike_height", 0.0 ); - SetScalVar(0, n_node, "spike_height", 0.0); + SetScalVar( 0, n_node, "spike_height", 0.0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input signal is stored in input_spike_height - port_input_arr_ = GetVarArr() + GetScalVarIdx("input_spike_height"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "input_spike_height" ); port_input_arr_step_ = n_var_; port_input_port_step_ = n_port_var_; - + return 0; } -int spike_detector::Update(long long /*i_time*/, double /*t1*/) +int +spike_detector::Update( long long /*i_time*/, double /*t1*/ ) { - spike_detector_UpdateKernel<<<(n_node_+1023)/1024, 1024>>> - (i_node_0_, n_node_, var_arr_, param_arr_, n_var_, n_param_); - //gpuErrchk( cudaPeekAtLastError() ); - //gpuErrchk( cudaDeviceSynchronize() ); + spike_detector_UpdateKernel<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + i_node_0_, n_node_, var_arr_, param_arr_, n_var_, n_param_ ); + // gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk( cudaDeviceSynchronize() ); return 0; } -int spike_detector::Free() +int +spike_detector::Free() { - CUDAFREECTRL("var_arr_",var_arr_); - CUDAFREECTRL("param_arr_",param_arr_); + CUDAFREECTRL( "var_arr_", var_arr_ ); + CUDAFREECTRL( "param_arr_", param_arr_ ); return 0; } spike_detector::~spike_detector() { - if (n_node_>0) { + if ( n_node_ > 0 ) + 
{ Free(); } } diff --git a/src/spike_detector.h b/src/spike_detector.h index 33050859a..7513e4e9d 100644 --- a/src/spike_detector.h +++ b/src/spike_detector.h @@ -20,16 +20,12 @@ * */ - - - - #ifndef SPIKEDETECTOR_H #define SPIKEDETECTOR_H #include #include -//#include "node_group.h" +// #include "node_group.h" #include "base_neuron.h" /* BeginUserDocs: device, recorder, spike @@ -46,7 +42,7 @@ The ``spike_detector`` collects and records all spikes it receives from neurons that are connected to it. Any neuron from which spikes have to be recorded must be connected to -the spike recorder using the standard ``Connect`` command. +the spike recorder using the standard ``Connect`` command. .. warning:: @@ -65,22 +61,23 @@ Here follows an example: neuron = nestgpu.Create("aeif_cond_beta", 3) spike_det = nestgpu.Create("spike_detector") - nestgpu.Connect([neuron[0]], spike_det, {"rule": "one_to_one"}, {"weight": 1.0, "delay": 1.0, "receptor":0}) + nestgpu.Connect([neuron[0]], spike_det, {"rule": "one_to_one"}, +{"weight": 1.0, "delay": 1.0, "receptor":0}) recorder = nestgpu.CreateRecord("", ["spike_height"], [spike_det[0]], [0]) nestgpu.Simulate() - + recorded_data = nestgpu.GetRecordData(record) time = [row[0] for row in recorded_data] spike_height = [row[1] for row in recorded_data] The output is thus a continuous variable, which is 0 when no spikes are emitted -by the neuron, and is ``weights`` when a spike is emitted. +by the neuron, and is ``weights`` when a spike is emitted. .. note:: - A faster implementation for spike recording, which is also similar to + A faster implementation for spike recording, which is also similar to the one of NEST in terms of output, is described in the guide of :doc:`how to record spikes <../guides/how_to_record_spikes>`. 
@@ -94,16 +91,14 @@ EndUserDocs */ class spike_detector : public BaseNeuron { - public: +public: ~spike_detector(); - int Init(int i_node_0, int n_node, int n_port, int i_group); + int Init( int i_node_0, int n_node, int n_port, int i_group ); int Free(); - - int Update(long long it, double t1); + int Update( long long it, double t1 ); }; - #endif diff --git a/src/spike_generator.cu b/src/spike_generator.cu index 3d08f8f12..6108797cb 100644 --- a/src/spike_generator.cu +++ b/src/spike_generator.cu @@ -20,278 +20,308 @@ * */ - - - - -#include #include +#include #include -//#include +// #include +#include "cuda_error.h" #include "nestgpu.h" #include "neuron_models.h" -#include "spike_generator.h" #include "spike_buffer.h" -#include "cuda_error.h" -//#include "spike_generator_variables.h" +#include "spike_generator.h" +// #include "spike_generator_variables.h" const int N_SPIKE_GEN_SCAL_PARAM = 0; -const std::string *spike_gen_scal_param_name = NULL; -enum { - i_SPIKE_TIME_ARRAY_PARAM=0, +const std::string* spike_gen_scal_param_name = nullptr; +enum +{ + i_SPIKE_TIME_ARRAY_PARAM = 0, i_SPIKE_HEIGHT_ARRAY_PARAM, N_SPIKE_GEN_ARRAY_PARAM }; -const std::string spike_gen_array_param_name[N_SPIKE_GEN_ARRAY_PARAM] -= {"spike_times", "spike_heights"}; +const std::string spike_gen_array_param_name[ N_SPIKE_GEN_ARRAY_PARAM ] = { "spike_times", "spike_heights" }; -__global__ -void spike_generatorUpdate(int i_node_0, int n_node, long long i_time, - int *n_spikes, int *i_spike, int **spike_time_idx, - float **spike_height) +__global__ void +spike_generatorUpdate( int i_node_0, + int n_node, + long long i_time, + int* n_spikes, + int* i_spike, + int** spike_time_idx, + float** spike_height ) { int irel_node = threadIdx.x + blockIdx.x * blockDim.x; - if (irel_node < n_node) { - if (n_spikes[irel_node] > 0) { - int is = i_spike[irel_node]; - if (is 0 ) + { + int is = i_spike[ irel_node ]; + if ( is < n_spikes[ irel_node ] && spike_time_idx[ irel_node ][ is ] == i_time ) + { + int 
i_node = i_node_0 + irel_node; + float height = spike_height[ irel_node ][ is ]; + PushSpike( i_node, height ); + i_spike[ irel_node ]++; } } } } -int spike_generator::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +spike_generator::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 0 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 0 /*n_port*/, i_group ); node_type_ = i_spike_generator_model; n_scal_param_ = N_SPIKE_GEN_SCAL_PARAM; n_param_ = n_scal_param_; scal_param_name_ = spike_gen_scal_param_name; - for (int i=0; i empty_vect; + for ( int i = 0; i < N_SPIKE_GEN_ARRAY_PARAM; i++ ) + { + array_param_name_.push_back( spike_gen_array_param_name[ i ] ); + } + std::vector< float > empty_vect; spike_time_vect_.clear(); - spike_time_vect_.insert(spike_time_vect_.begin(), n_node, empty_vect); + spike_time_vect_.insert( spike_time_vect_.begin(), n_node, empty_vect ); spike_height_vect_.clear(); - spike_height_vect_.insert(spike_height_vect_.begin(), n_node, empty_vect); - - CUDAMALLOCCTRL("¶m_arr_",¶m_arr_, n_node_*n_param_*sizeof(float)); - - //SetScalParam(0, n_node, "origin", 0.0); - - h_spike_time_idx_ = new int*[n_node_]; - h_spike_height_ = new float*[n_node_]; - for (int i_node=0; i_node0) { + if ( n_node_ > 0 ) + { Free(); } } -int spike_generator::Update(long long i_time, double /*t1*/) +int +spike_generator::Update( long long i_time, double /*t1*/ ) { - spike_generatorUpdate<<<(n_node_+1023)/1024, 1024>>> - (i_node_0_, n_node_, i_time, d_n_spikes_, d_i_spike_, d_spike_time_idx_, - d_spike_height_); - //gpuErrchk( cudaPeekAtLastError() ); - //gpuErrchk( cudaDeviceSynchronize() ); + spike_generatorUpdate<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + i_node_0_, n_node_, i_time, d_n_spikes_, d_i_spike_, d_spike_time_idx_, d_spike_height_ ); + // gpuErrchk( cudaPeekAtLastError() ); + // gpuErrchk( cudaDeviceSynchronize() ); return 0; } -int spike_generator::SetArrayParam(int i_neuron, 
int n_neuron, - std::string param_name, float *array, - int array_size) +int +spike_generator::SetArrayParam( int i_neuron, int n_neuron, std::string param_name, float* array, int array_size ) { - CheckNeuronIdx(i_neuron); - CheckNeuronIdx(i_neuron + n_neuron - 1); + CheckNeuronIdx( i_neuron ); + CheckNeuronIdx( i_neuron + n_neuron - 1 ); - if (param_name==array_param_name_[i_SPIKE_TIME_ARRAY_PARAM]) { - for (int in=i_neuron; in(array, array+array_size); + if ( param_name == array_param_name_[ i_SPIKE_TIME_ARRAY_PARAM ] ) + { + for ( int in = i_neuron; in < i_neuron + n_neuron; in++ ) + { + spike_time_vect_[ in ] = std::vector< float >( array, array + array_size ); } } - else if (param_name==array_param_name_[i_SPIKE_HEIGHT_ARRAY_PARAM]) { - for (int in=i_neuron; in(array, array+array_size); + else if ( param_name == array_param_name_[ i_SPIKE_HEIGHT_ARRAY_PARAM ] ) + { + for ( int in = i_neuron; in < i_neuron + n_neuron; in++ ) + { + spike_height_vect_[ in ] = std::vector< float >( array, array + array_size ); } } - else { - throw ngpu_exception(std::string("Unrecognized array parameter ") - + param_name); + else + { + throw ngpu_exception( std::string( "Unrecognized array parameter " ) + param_name ); } return 0; } - -int spike_generator::SetArrayParam(int *i_neuron, int n_neuron, - std::string param_name, float *array, - int array_size) + +int +spike_generator::SetArrayParam( int* i_neuron, int n_neuron, std::string param_name, float* array, int array_size ) { - if (param_name==array_param_name_[i_SPIKE_TIME_ARRAY_PARAM]) { - for (int i=0; i(array, array+array_size); + if ( param_name == array_param_name_[ i_SPIKE_TIME_ARRAY_PARAM ] ) + { + for ( int i = 0; i < n_neuron; i++ ) + { + int in = i_neuron[ i ]; + CheckNeuronIdx( in ); + spike_time_vect_[ in ] = std::vector< float >( array, array + array_size ); } } - else if (param_name==array_param_name_[i_SPIKE_HEIGHT_ARRAY_PARAM]) { - for (int i=0; i(array, array+array_size); + else if ( param_name == 
array_param_name_[ i_SPIKE_HEIGHT_ARRAY_PARAM ] ) + { + for ( int i = 0; i < n_neuron; i++ ) + { + int in = i_neuron[ i ]; + CheckNeuronIdx( in ); + spike_height_vect_[ in ] = std::vector< float >( array, array + array_size ); } } - else { - throw ngpu_exception(std::string("Unrecognized array parameter ") - + param_name); + else + { + throw ngpu_exception( std::string( "Unrecognized array parameter " ) + param_name ); } return 0; } -int spike_generator::Calibrate(double time_min, float time_resolution) +int +spike_generator::Calibrate( double time_min, float time_resolution ) { - for (int in=0; in0) { - if (spike_height_vect_[in].size()==0) { - spike_height_vect_[in].insert(spike_height_vect_[in].begin(), - n_spikes, 1.0); + for ( int in = 0; in < n_node_; in++ ) + { + unsigned int n_spikes = spike_time_vect_[ in ].size(); + if ( n_spikes > 0 ) + { + if ( spike_height_vect_[ in ].size() == 0 ) + { + spike_height_vect_[ in ].insert( spike_height_vect_[ in ].begin(), n_spikes, 1.0 ); } - else if (spike_height_vect_[in].size()!=n_spikes) { - throw ngpu_exception("spike time array and spike height array " - "must have the same size in spike generator"); + else if ( spike_height_vect_[ in ].size() != n_spikes ) + { + throw ngpu_exception( + "spike time array and spike height array " + "must have the same size in spike generator" ); } - SetSpikes(in, n_spikes, spike_time_vect_[in].data(), - spike_height_vect_[in].data(), (float)time_min, time_resolution); + SetSpikes( in, + n_spikes, + spike_time_vect_[ in ].data(), + spike_height_vect_[ in ].data(), + ( float ) time_min, + time_resolution ); } } - + return 0; } - - -int spike_generator::SetSpikes(int irel_node, int n_spikes, float *spike_time, - float *spike_height, float time_min, - float time_resolution) +int +spike_generator::SetSpikes( int irel_node, + int n_spikes, + float* spike_time, + float* spike_height, + float time_min, + float time_resolution ) { - if (n_spikes <=0) { - throw ngpu_exception("Number of 
spikes must be greater than 0 " - "in spike generator setting"); + if ( n_spikes <= 0 ) + { + throw ngpu_exception( + "Number of spikes must be greater than 0 " + "in spike generator setting" ); } - - cudaMemcpy(&d_n_spikes_[irel_node], &n_spikes, sizeof(int), - cudaMemcpyHostToDevice); - if (h_spike_time_idx_[irel_node] != 0) { - CUDAFREECTRL("h_spike_time_idx_[irel_node]",h_spike_time_idx_[irel_node]); - CUDAFREECTRL("h_spike_height_[irel_node]",h_spike_height_[irel_node]); + + cudaMemcpy( &d_n_spikes_[ irel_node ], &n_spikes, sizeof( int ), cudaMemcpyHostToDevice ); + if ( h_spike_time_idx_[ irel_node ] != nullptr ) + { + CUDAFREECTRL( "h_spike_time_idx_[irel_node]", h_spike_time_idx_[ irel_node ] ); + CUDAFREECTRL( "h_spike_height_[irel_node]", h_spike_height_[ irel_node ] ); } - CUDAMALLOCCTRL("&h_spike_time_idx_[irel_node]",&h_spike_time_idx_[irel_node], n_spikes*sizeof(int)); - CUDAMALLOCCTRL("&h_spike_height_[irel_node]",&h_spike_height_[irel_node], n_spikes*sizeof(float)); - - cudaMemcpy(&d_spike_time_idx_[irel_node], &h_spike_time_idx_[irel_node], - sizeof(int*), cudaMemcpyHostToDevice); - cudaMemcpy(&d_spike_height_[irel_node], &h_spike_height_[irel_node], sizeof(float*), - cudaMemcpyHostToDevice); - - int *spike_time_idx = new int[n_spikes]; - for(int i=0; i0 && spike_time_idx[i]<=spike_time_idx[i-1]) { - throw ngpu_exception("Spike times must be ordered, and the difference " - "between\nconsecutive spikes must be >= the " - "time resolution"); + CUDAMALLOCCTRL( "&h_spike_time_idx_[irel_node]", &h_spike_time_idx_[ irel_node ], n_spikes * sizeof( int ) ); + CUDAMALLOCCTRL( "&h_spike_height_[irel_node]", &h_spike_height_[ irel_node ], n_spikes * sizeof( float ) ); + + cudaMemcpy( + &d_spike_time_idx_[ irel_node ], &h_spike_time_idx_[ irel_node ], sizeof( int* ), cudaMemcpyHostToDevice ); + cudaMemcpy( &d_spike_height_[ irel_node ], &h_spike_height_[ irel_node ], sizeof( float* ), cudaMemcpyHostToDevice ); + + int* spike_time_idx = new int[ n_spikes ]; + 
for ( int i = 0; i < n_spikes; i++ ) + { + spike_time_idx[ i ] = ( int ) round( ( spike_time[ i ] - time_min ) / time_resolution ); + if ( i > 0 && spike_time_idx[ i ] <= spike_time_idx[ i - 1 ] ) + { + throw ngpu_exception( + "Spike times must be ordered, and the difference " + "between\nconsecutive spikes must be >= the " + "time resolution" ); } - //cout << "ti " << spike_time_idx[i] << endl; - //cout << spike_time[i] << " " << time_min << endl; - + // cout << "ti " << spike_time_idx[i] << endl; + // cout << spike_time[i] << " " << time_min << endl; } - - cudaMemcpy(h_spike_time_idx_[irel_node], spike_time_idx, n_spikes*sizeof(int), - cudaMemcpyHostToDevice); - cudaMemcpy(h_spike_height_[irel_node], spike_height, n_spikes*sizeof(float), - cudaMemcpyHostToDevice); + + cudaMemcpy( h_spike_time_idx_[ irel_node ], spike_time_idx, n_spikes * sizeof( int ), cudaMemcpyHostToDevice ); + cudaMemcpy( h_spike_height_[ irel_node ], spike_height, n_spikes * sizeof( float ), cudaMemcpyHostToDevice ); return 0; } -int spike_generator::GetArrayParamSize(int i_neuron, std::string param_name) +int +spike_generator::GetArrayParamSize( int i_neuron, std::string param_name ) { - if (param_name==array_param_name_[i_SPIKE_TIME_ARRAY_PARAM]) { - return spike_time_vect_[i_neuron].size(); + if ( param_name == array_param_name_[ i_SPIKE_TIME_ARRAY_PARAM ] ) + { + return spike_time_vect_[ i_neuron ].size(); } - else if (param_name==array_param_name_[i_SPIKE_HEIGHT_ARRAY_PARAM]) { - return spike_height_vect_[i_neuron].size(); + else if ( param_name == array_param_name_[ i_SPIKE_HEIGHT_ARRAY_PARAM ] ) + { + return spike_height_vect_[ i_neuron ].size(); } - else { - throw ngpu_exception(std::string("Unrecognized parameter ") - + param_name); + else + { + throw ngpu_exception( std::string( "Unrecognized parameter " ) + param_name ); } } -float *spike_generator::GetArrayParam(int i_neuron, std::string param_name) +float* +spike_generator::GetArrayParam( int i_neuron, std::string param_name ) { 
- if (param_name==array_param_name_[i_SPIKE_TIME_ARRAY_PARAM]) { - return spike_time_vect_[i_neuron].data(); + if ( param_name == array_param_name_[ i_SPIKE_TIME_ARRAY_PARAM ] ) + { + return spike_time_vect_[ i_neuron ].data(); } - else if (param_name==array_param_name_[i_SPIKE_HEIGHT_ARRAY_PARAM]) { - return spike_height_vect_[i_neuron].data(); + else if ( param_name == array_param_name_[ i_SPIKE_HEIGHT_ARRAY_PARAM ] ) + { + return spike_height_vect_[ i_neuron ].data(); } - else { - throw ngpu_exception(std::string("Unrecognized parameter ") - + param_name); + else + { + throw ngpu_exception( std::string( "Unrecognized parameter " ) + param_name ); } } diff --git a/src/spike_generator.h b/src/spike_generator.h index f5ae12ab2..f0af9ba29 100644 --- a/src/spike_generator.h +++ b/src/spike_generator.h @@ -20,17 +20,13 @@ * */ - - - - #ifndef SPIKEGENERATOR_H #define SPIKEGENERATOR_H +#include "cuda_error.h" #include #include -#include "cuda_error.h" - //#include "node_group.h" +// #include "node_group.h" #include "base_neuron.h" #include "neuron_models.h" @@ -58,7 +54,7 @@ The following parameters can be set in the status dictionary. Spike times are given in milliseconds, and must be sorted with the earliest spike first. All spike times must be strictly in the future -(i.e. greater than the current time step). +(i.e. greater than the current time step). It is possible that spike times do not coincide with a time step, i.e., are not a multiple of the simulation resolution. 
@@ -82,40 +78,40 @@ EndUserDocs class spike_generator : public BaseNeuron { - int *d_n_spikes_; - int *d_i_spike_; - int **d_spike_time_idx_; - float **d_spike_height_; - int **h_spike_time_idx_; - float ** h_spike_height_; - std::vector > spike_time_vect_; - std::vector > spike_height_vect_; - - int SetSpikes(int irel_node, int n_spikes, float *spike_time, - float *spike_height, float time_min, float time_resolution); - - public: + int* d_n_spikes_; + int* d_i_spike_; + int** d_spike_time_idx_; + float** d_spike_height_; + int** h_spike_time_idx_; + float** h_spike_height_; + std::vector< std::vector< float > > spike_time_vect_; + std::vector< std::vector< float > > spike_height_vect_; + + int SetSpikes( int irel_node, + int n_spikes, + float* spike_time, + float* spike_height, + float time_min, + float time_resolution ); + +public: ~spike_generator(); - - int Init(int i_node_0, int n_node, int n_port, int i_group); + + int Init( int i_node_0, int n_node, int n_port, int i_group ); int Free(); - - int Update(long long i_time, double t1); - int Calibrate(double time_min, float time_resolution); + int Update( long long i_time, double t1 ); - int SetArrayParam(int i_neuron, int n_neuron, std::string param_name, - float *array, int array_size); - - int SetArrayParam(int *i_neuron, int n_neuron, std::string param_name, - float *array, int array_size); - - int GetArrayParamSize(int i_neuron, std::string param_name); + int Calibrate( double time_min, float time_resolution ); - float *GetArrayParam(int i_neuron, std::string param_name); + int SetArrayParam( int i_neuron, int n_neuron, std::string param_name, float* array, int array_size ); -}; + int SetArrayParam( int* i_neuron, int n_neuron, std::string param_name, float* array, int array_size ); + int GetArrayParamSize( int i_neuron, std::string param_name ); + + float* GetArrayParam( int i_neuron, std::string param_name ); +}; #endif diff --git a/src/stdp.cu b/src/stdp.cu index 6bc43377a..84083cafc 100644 --- 
a/src/stdp.cu +++ b/src/stdp.cu @@ -20,34 +20,31 @@ * */ - - - - -#include -#include -#include -#include "ngpu_exception.h" #include "cuda_error.h" +#include "ngpu_exception.h" #include "stdp.h" #include "syn_model.h" +#include +#include +#include using namespace stdp_ns; -int STDP::Init() +int +STDP::_Init() { type_ = i_stdp_model; n_param_ = N_PARAM; param_name_ = stdp_param_name; - CUDAMALLOCCTRL("&d_param_arr_",&d_param_arr_, n_param_*sizeof(float)); - SetParam("tau_plus", 20.0); - SetParam("tau_minus", 20.0); - SetParam("lambda", 1.0e-4); - SetParam("alpha", 1.0); - SetParam("mu_plus", 1.0); - SetParam("mu_minus", 1.0); - SetParam("Wmax", 100.0); - //SetParam("den_delay", 0.0); + CUDAMALLOCCTRL( "&d_param_arr_", &d_param_arr_, n_param_ * sizeof( float ) ); + SetParam( "tau_plus", 20.0 ); + SetParam( "tau_minus", 20.0 ); + SetParam( "lambda", 1.0e-4 ); + SetParam( "alpha", 1.0 ); + SetParam( "mu_plus", 1.0 ); + SetParam( "mu_minus", 1.0 ); + SetParam( "Wmax", 100.0 ); + // SetParam("den_delay", 0.0); return 0; } diff --git a/src/stdp.h b/src/stdp.h index 50937b1aa..2d0c09af3 100644 --- a/src/stdp.h +++ b/src/stdp.h @@ -20,7 +20,6 @@ * */ - #ifndef STDP_H #define STDP_H #include @@ -37,7 +36,7 @@ Description The STDP class is a type of synapse model used to create synapses that enable spike timing dependent plasticity -(as defined in [1]_). +(as defined in [1]_). Here the weight dependence exponent can be set separately for potentiation and depression. 
@@ -69,49 +68,60 @@ EndUserDocs */ namespace stdp_ns { - enum ParamIndexes { - i_tau_plus = 0, i_tau_minus, i_lambda, i_alpha, i_mu_plus, i_mu_minus, - i_Wmax, // i_den_delay, - N_PARAM - }; - - const std::string stdp_param_name[N_PARAM] = { - "tau_plus", "tau_minus", "lambda", "alpha", "mu_plus", "mu_minus", "Wmax" - //, "den_delay" - }; - - - - __device__ __forceinline__ void STDPUpdate(float *weight_pt, float Dt, - float *param) +enum ParamIndexes +{ + i_tau_plus = 0, + i_tau_minus, + i_lambda, + i_alpha, + i_mu_plus, + i_mu_minus, + i_Wmax, // i_den_delay, + N_PARAM +}; + +const std::string stdp_param_name[ N_PARAM ] = { + "tau_plus", + "tau_minus", + "lambda", + "alpha", + "mu_plus", + "mu_minus", + "Wmax" + //, "den_delay" +}; + +__device__ __forceinline__ void +STDPUpdate( float* weight_pt, float Dt, float* param ) +{ + // printf("Dt: %f\n", Dt); + double tau_plus = param[ i_tau_plus ]; + double tau_minus = param[ i_tau_minus ]; + double lambda = param[ i_lambda ]; + double alpha = param[ i_alpha ]; + double mu_plus = param[ i_mu_plus ]; + double mu_minus = param[ i_mu_minus ]; + double Wmax = param[ i_Wmax ]; + // double den_delay = param[i_den_delay]; + + double w = *weight_pt; + double w1; + // Dt += den_delay; + if ( Dt >= 0 ) { - //printf("Dt: %f\n", Dt); - double tau_plus = param[i_tau_plus]; - double tau_minus = param[i_tau_minus]; - double lambda = param[i_lambda]; - double alpha = param[i_alpha]; - double mu_plus = param[i_mu_plus]; - double mu_minus = param[i_mu_minus]; - double Wmax = param[i_Wmax]; - //double den_delay = param[i_den_delay]; - - double w = *weight_pt; - double w1; - //Dt += den_delay; - if (Dt>=0) { - double fact = lambda*exp(-(double)Dt/tau_plus); - w1 = w + fact*Wmax*pow(1.0 - w/Wmax, mu_plus); - } - else { - double fact = -alpha*lambda*exp((double)Dt/tau_minus); - w1 = w + fact*Wmax*pow(w/Wmax, mu_minus); - } - - w1 = w1 >0.0 ? w1 : 0.0; - w1 = w1 < Wmax ? 
w1 : Wmax; - *weight_pt = (float)w1; + double fact = lambda * exp( -( double ) Dt / tau_plus ); + w1 = w + fact * Wmax * pow( 1.0 - w / Wmax, mu_plus ); + } + else + { + double fact = -alpha * lambda * exp( ( double ) Dt / tau_minus ); + w1 = w + fact * Wmax * pow( w / Wmax, mu_minus ); } -} + w1 = w1 > 0.0 ? w1 : 0.0; + w1 = w1 < Wmax ? w1 : Wmax; + *weight_pt = ( float ) w1; +} +} // namespace stdp_ns #endif diff --git a/src/syn_model.cu b/src/syn_model.cu index 6dd24c51b..2dc1f8766 100644 --- a/src/syn_model.cu +++ b/src/syn_model.cu @@ -20,200 +20,219 @@ * */ - - - - -#include -#include -#include "ngpu_exception.h" #include "cuda_error.h" #include "nestgpu.h" +#include "ngpu_exception.h" +#include "stdp.h" #include "syn_model.h" #include "test_syn_model.h" -#include "stdp.h" +#include +#include -int *d_SynGroupTypeMap; -__device__ int *SynGroupTypeMap; +int* d_SynGroupTypeMap; +__device__ int* SynGroupTypeMap; -float **d_SynGroupParamMap; -__device__ float **SynGroupParamMap; +float** d_SynGroupParamMap; +__device__ float** SynGroupParamMap; -__device__ void TestSynModelUpdate(float *w, float Dt, float *param); +__device__ void TestSynModelUpdate( float* w, float Dt, float* param ); -__global__ void SynGroupInit(int *syn_group_type_map, - float **syn_group_param_map) +__global__ void +SynGroupInit( int* syn_group_type_map, float** syn_group_param_map ) { SynGroupTypeMap = syn_group_type_map; SynGroupParamMap = syn_group_param_map; - } -int SynModel::GetNParam() +int +SynModel::GetNParam() { return n_param_; } -std::vector SynModel::GetParamNames() +std::vector< std::string > +SynModel::GetParamNames() { - std::vector param_name_vect; - for (int i=0; i param_name_vect; + for ( int i = 0; i < n_param_; i++ ) + { + param_name_vect.push_back( param_name_[ i ] ); } - + return param_name_vect; } -bool SynModel::IsParam(std::string param_name) +bool +SynModel::IsParam( std::string param_name ) { int i_param; - for (i_param=0; i_param(int)syn_group_vect_.size()) { - 
throw ngpu_exception("Unrecognized synapse group"); + if ( syn_group < 1 || syn_group > ( int ) syn_group_vect_.size() ) + { + throw ngpu_exception( "Unrecognized synapse group" ); } - return syn_group_vect_[syn_group-1]->GetNParam(); + return syn_group_vect_[ syn_group - 1 ]->GetNParam(); } -std::vector NESTGPU::GetSynGroupParamNames(int syn_group) +std::vector< std::string > +NESTGPU::GetSynGroupParamNames( int syn_group ) { - if (syn_group<1 || syn_group>(int)syn_group_vect_.size()) { - throw ngpu_exception("Unrecognized synapse group"); + if ( syn_group < 1 || syn_group > ( int ) syn_group_vect_.size() ) + { + throw ngpu_exception( "Unrecognized synapse group" ); } - return syn_group_vect_[syn_group-1]->GetParamNames(); + return syn_group_vect_[ syn_group - 1 ]->GetParamNames(); } -bool NESTGPU::IsSynGroupParam(int syn_group, std::string param_name) +bool +NESTGPU::IsSynGroupParam( int syn_group, std::string param_name ) { - if (syn_group<1 || syn_group>(int)syn_group_vect_.size()) { - throw ngpu_exception("Unrecognized synapse group"); + if ( syn_group < 1 || syn_group > ( int ) syn_group_vect_.size() ) + { + throw ngpu_exception( "Unrecognized synapse group" ); } - return syn_group_vect_[syn_group-1]->IsParam(param_name); + return syn_group_vect_[ syn_group - 1 ]->IsParam( param_name ); } -int NESTGPU::GetSynGroupParamIdx(int syn_group, std::string param_name) +int +NESTGPU::GetSynGroupParamIdx( int syn_group, std::string param_name ) { - if (syn_group<1 || syn_group>(int)syn_group_vect_.size()) { - throw ngpu_exception("Unrecognized synapse group"); + if ( syn_group < 1 || syn_group > ( int ) syn_group_vect_.size() ) + { + throw ngpu_exception( "Unrecognized synapse group" ); } - return syn_group_vect_[syn_group-1]->GetParamIdx(param_name); + return syn_group_vect_[ syn_group - 1 ]->GetParamIdx( param_name ); } -float NESTGPU::GetSynGroupParam(int syn_group, std::string param_name) +float +NESTGPU::GetSynGroupParam( int syn_group, std::string param_name ) { 
- if (syn_group<1 || syn_group>(int)syn_group_vect_.size()) { - throw ngpu_exception("Unrecognized synapse group"); + if ( syn_group < 1 || syn_group > ( int ) syn_group_vect_.size() ) + { + throw ngpu_exception( "Unrecognized synapse group" ); } - return syn_group_vect_[syn_group-1]->GetParam(param_name); + return syn_group_vect_[ syn_group - 1 ]->GetParam( param_name ); } -int NESTGPU::SetSynGroupParam(int syn_group, std::string param_name, - float val) +int +NESTGPU::SetSynGroupParam( int syn_group, std::string param_name, float val ) { - if (syn_group<1 || syn_group>(int)syn_group_vect_.size()) { - throw ngpu_exception("Unrecognized synapse group"); + if ( syn_group < 1 || syn_group > ( int ) syn_group_vect_.size() ) + { + throw ngpu_exception( "Unrecognized synapse group" ); } - return syn_group_vect_[syn_group-1]->SetParam(param_name, val); + return syn_group_vect_[ syn_group - 1 ]->SetParam( param_name, val ); } - -int NESTGPU::SynGroupCalibrate() +int +NESTGPU::SynGroupCalibrate() { int n_group = syn_group_vect_.size(); - int *h_SynGroupTypeMap = new int[n_group]; - float **h_SynGroupParamMap = new float*[n_group]; + int* h_SynGroupTypeMap = new int[ n_group ]; + float** h_SynGroupParamMap = new float*[ n_group ]; - for (int syn_group=1; syn_group<=n_group; syn_group++) { - h_SynGroupTypeMap[syn_group-1] = syn_group_vect_[syn_group-1]->type_; - h_SynGroupParamMap[syn_group-1] - = syn_group_vect_[syn_group-1]->d_param_arr_; + for ( int syn_group = 1; syn_group <= n_group; syn_group++ ) + { + h_SynGroupTypeMap[ syn_group - 1 ] = syn_group_vect_[ syn_group - 1 ]->type_; + h_SynGroupParamMap[ syn_group - 1 ] = syn_group_vect_[ syn_group - 1 ]->d_param_arr_; } - CUDAMALLOCCTRL("&d_SynGroupTypeMap",&d_SynGroupTypeMap, n_group*sizeof(int)); - CUDAMALLOCCTRL("&d_SynGroupParamMap",&d_SynGroupParamMap, n_group*sizeof(float*)); + CUDAMALLOCCTRL( "&d_SynGroupTypeMap", &d_SynGroupTypeMap, n_group * sizeof( int ) ); + CUDAMALLOCCTRL( "&d_SynGroupParamMap", 
&d_SynGroupParamMap, n_group * sizeof( float* ) ); // Memcopies will be synchronised with SynGroupInit kernel - gpuErrchk(cudaMemcpyAsync(d_SynGroupTypeMap, h_SynGroupTypeMap, - n_group*sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpyAsync(d_SynGroupParamMap, h_SynGroupParamMap, - n_group*sizeof(float*), cudaMemcpyHostToDevice)); + gpuErrchk( cudaMemcpyAsync( d_SynGroupTypeMap, h_SynGroupTypeMap, n_group * sizeof( int ), cudaMemcpyHostToDevice ) ); + gpuErrchk( + cudaMemcpyAsync( d_SynGroupParamMap, h_SynGroupParamMap, n_group * sizeof( float* ), cudaMemcpyHostToDevice ) ); - SynGroupInit<<<1,1>>>(d_SynGroupTypeMap, d_SynGroupParamMap); + SynGroupInit<<< 1, 1 >>>( d_SynGroupTypeMap, d_SynGroupParamMap ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); - + delete[] h_SynGroupTypeMap; delete[] h_SynGroupParamMap; diff --git a/src/syn_model.h b/src/syn_model.h index e476fff2b..267da136c 100644 --- a/src/syn_model.h +++ b/src/syn_model.h @@ -20,72 +20,85 @@ * */ - #ifndef SYNMODEL_H #define SYNMODEL_H +#include "stdp.h" #include #include -#include "stdp.h" #define MAX_SYN_DT 16384 -extern __device__ int *SynGroupTypeMap; -extern __device__ float **SynGroupParamMap; +extern __device__ int* SynGroupTypeMap; +extern __device__ float** SynGroupParamMap; -__device__ void TestSynModelUpdate(float *w, float Dt, float *param); +__device__ void TestSynModelUpdate( float* w, float Dt, float* param ); -enum SynModels { - i_null_syn_model = 0, i_test_syn_model, i_stdp_model, +enum SynModels +{ + i_null_syn_model = 0, + i_test_syn_model, + i_stdp_model, N_SYN_MODELS }; -__device__ __forceinline__ void SynapseUpdate(int syn_group, float *w, float Dt) +__device__ __forceinline__ void +SynapseUpdate( int syn_group, float* w, float Dt ) { - int syn_type = SynGroupTypeMap[syn_group-1]; - float *param = SynGroupParamMap[syn_group-1]; - switch(syn_type) { + int syn_type = SynGroupTypeMap[ syn_group - 1 ]; + float* param = SynGroupParamMap[ syn_group 
- 1 ]; + switch ( syn_type ) + { case i_test_syn_model: - TestSynModelUpdate(w, Dt, param); + TestSynModelUpdate( w, Dt, param ); break; case i_stdp_model: - stdp_ns::STDPUpdate(w, Dt, param); + stdp_ns::STDPUpdate( w, Dt, param ); break; } } - -const std::string syn_model_name[N_SYN_MODELS] = { - "", "test_syn_model", "stdp" -}; +const std::string syn_model_name[ N_SYN_MODELS ] = { "", "test_syn_model", "stdp" }; class SynModel { - protected: +protected: int type_; int n_param_; - const std::string *param_name_; - float *d_param_arr_; - public: - virtual int Init() {return 0;} + const std::string* param_name_; + float* d_param_arr_; + +public: + virtual int + Init() + { + return 0; + } int GetNParam(); - std::vector GetParamNames(); - bool IsParam(std::string param_name); - int GetParamIdx(std::string param_name); - virtual float GetParam(std::string param_name); - virtual int SetParam(std::string param_name, float val); + std::vector< std::string > GetParamNames(); + bool IsParam( std::string param_name ); + int GetParamIdx( std::string param_name ); + virtual float GetParam( std::string param_name ); + virtual int SetParam( std::string param_name, float val ); friend class NESTGPU; }; - class STDP : public SynModel { - public: - STDP() {Init();} - int Init(); -}; + int _Init(); +public: + STDP() + { + _Init(); + } + int + Init() + { + return _Init(); + } +}; #endif diff --git a/src/test_syn_model.cu b/src/test_syn_model.cu index 615eb055b..6599f3802 100644 --- a/src/test_syn_model.cu +++ b/src/test_syn_model.cu @@ -20,34 +20,32 @@ * */ - - - - -#include -#include -#include -#include "ngpu_exception.h" #include "cuda_error.h" +#include "ngpu_exception.h" #include "test_syn_model.h" +#include +#include +#include using namespace test_syn_model_ns; -__device__ void TestSynModelUpdate(float *w, float Dt, float *param) +__device__ void +TestSynModelUpdate( float* w, float Dt, float* param ) { - float fact = param[0]; - float offset = param[1]; - *w += offset + 
fact*Dt; + float fact = param[ 0 ]; + float offset = param[ 1 ]; + *w += offset + fact * Dt; } -int TestSynModel::Init() +int +TestSynModel::_Init() { type_ = i_test_syn_model; n_param_ = N_PARAM; param_name_ = test_syn_model_param_name; - CUDAMALLOCCTRL("&d_param_arr_",&d_param_arr_, n_param_*sizeof(float)); - SetParam("fact", 0.1); - SetParam("offset", 0.0); + CUDAMALLOCCTRL( "&d_param_arr_", &d_param_arr_, n_param_ * sizeof( float ) ); + SetParam( "fact", 0.1 ); + SetParam( "offset", 0.0 ); return 0; } diff --git a/src/test_syn_model.h b/src/test_syn_model.h index d610021f8..ba3c51552 100644 --- a/src/test_syn_model.h +++ b/src/test_syn_model.h @@ -20,10 +20,6 @@ * */ - - - - #ifndef TESTSYNMODEL_H #define TESTSYNMODEL_H @@ -31,22 +27,31 @@ class TestSynModel : public SynModel { - public: - TestSynModel() {Init();} - int Init(); + int _Init(); + +public: + TestSynModel() + { + _Init(); + } + int + Init() + { + return _Init(); + } }; namespace test_syn_model_ns { - enum ParamIndexes { - i_fact = 0, i_offset, - N_PARAM - }; +enum ParamIndexes +{ + i_fact = 0, + i_offset, + N_PARAM +}; - const std::string test_syn_model_param_name[N_PARAM] = { - "fact", "offset" - }; +const std::string test_syn_model_param_name[ N_PARAM ] = { "fact", "offset" }; -} +} // namespace test_syn_model_ns #endif diff --git a/src/tmp.cu b/src/tmp.cu deleted file mode 100644 index a37cb0736..000000000 --- a/src/tmp.cu +++ /dev/null @@ -1,182 +0,0 @@ - - -int poiss_gen::OrganizeConnections() -{ - typedef uint key_t; - typedef regular_block_array array_t; - - uint k = KeySubarray.size(); - int64_t n = NConn; - int64_t block_size = h_ConnBlockSize; - - key_t **key_subarray = KeySubarray.data(); - - array_t h_key_array; - array_t d_key_array; - - h_key_array.data_pt = key_subarray; - h_key_array.block_size = block_size; - h_key_array.offset = 0; - h_key_array.size = n; - - key_t **d_key_array_data_pt = NULL; - CUDAMALLOCCTRL("&d_key_array_data_pt",&d_key_array_data_pt, k*sizeof(key_t*)); - 
gpuErrchk(cudaMemcpy(d_key_array_data_pt, key_subarray, - k*sizeof(key_t*), cudaMemcpyHostToDevice)); - - d_key_array.data_pt = d_key_array_data_pt; //key_subarray; - d_key_array.block_size = block_size; - d_key_array.offset = 0; - d_key_array.size = n; - - array_t h_subarray[k]; - for (uint i=0; i - (d_subarray, k, &d_thresh[0], d_num0, d_sum[0]); - CUDASYNC - - search_multi_down - (d_subarray, k, &d_thresh[1], d_num1, d_sum[1]); - CUDASYNC - - gpuErrchk(cudaMemcpy(h_num, d_num, 2*k*sizeof(int64_t), - cudaMemcpyDeviceToHost)); - int64_t n_conn; - int64_t i_conn0 = 0; - int64_t i_conn1 = 0; - uint ib0 = 0; - uint ib1 = 0; - uint nb; - for (uint i=0; i0) { - key_t *d_poiss_key_array; - CUDAMALLOCCTRL("&d_poiss_key_array",&d_poiss_key_array, n_conn*sizeof(key_t)); - - int64_t offset = 0; - for (uint ib=ib0; ib<=ib1; ib++) { - if (ib==ib0 && ib==ib1) { - gpuErrchk(cudaMemcpy(d_poiss_key_array, key_subarray[ib] + h_num0[ib], - n_conn*sizeof(key_t), cudaMemcpyDeviceToDevice)); - break; - } - else if (ib==ib0) { - offset = block_size - h_num0[ib]; - gpuErrchk(cudaMemcpy(d_poiss_key_array, key_subarray[ib] + h_num0[ib], - offset*sizeof(key_t), - cudaMemcpyDeviceToDevice)); - } - else if (ib==ib1) { - gpuErrchk(cudaMemcpy(d_poiss_key_array + offset, - key_subarray[ib] + h_num0[ib], - h_num1[i]*sizeof(key_t), - cudaMemcpyDeviceToDevice)); - } - else { - gpuErrchk(cudaMemcpy(d_poiss_key_array + offset, - key_subarray[ib], - block_size*sizeof(key_t), - cudaMemcpyDeviceToDevice)); - } - } - key_t *h_poiss_key_array = new key_t[n_conn]; - gpuErrchk(cudaMemcpy(h_poiss_key_array, d_poiss_key_array, - n_conn*sizeof(key_t), - cudaMemcpyDeviceToHost)); - printf("i_conn0: %ld\ti_conn1: %ld\tn_conn: %ld\n", i_conn0, i_conn1, - n_conn); - int i_min = h_poiss_key_array[0] >> MaxPortNBits; - int d_min = h_poiss_key_array[0] & PortMask; - int i_max = h_poiss_key_array[n_conn - 1] >> MaxPortNBits; - int d_max = h_poiss_key_array[n_conn - 1] & PortMask; - printf("i_min: %d\ti_max: 
%d\td_min: %d\td_max: %d\n" - i_min, i_max, d_min, d_max); - } - - CUDAFREECTRL("d_key_array_data_pt",d_key_array_data_pt); - CUDAFREECTRL("d_subarray",d_subarray); - CUDAFREECTRL("d_num",d_num); - CUDAFREECTRL("d_sum",d_sum); - CUDAFREECTRL("d_thresh",d_thresh); - - return 0; -} - -__global__ void SendDirectSpikes(int64_t n_conn, int64_t i_conn_0, - int64_t block_size, int n_node, - float *rate_arr, int max_delay_num, - float time_resolution) -{ - i_conn_rel = threadIdx.x + blockIdx.x * blockDim.x; - if (i_conn_rel >= n_conn) { - return 0; - } - uint source_delay = PoissKeyArray[i_conn_rel]; - int i_source = source_delay >> MaxPortNBits; - int i_delay = source_delay & PortMask; - int id = (NESTGPUTimeIdx - i_delay + 1) % max_delay_num; - float r = rate_arr[id*n_node + i_source]; - float height = r*time_resolution; - - int64_t i_conn = i_conn_0 + i_conn_rel; - int i_block = (int)(i_conn / block_size); - int64_t i_block_conn = i_conn % block_size; - connection_struct conn = ConnectionArray[i_block][i_block_conn]; - uint target_port = conn.target_port; - int i_target = target_port >> MaxPortNBits; - uint port = target_port & PortMask; - float weight = conn.weight; - - int i_group=NodeGroupMap[i_target]; - int i = port*NodeGroupArray[i_group].n_node_ + i_target - - NodeGroupArray[i_group].i_node_0_; - double d_val = (double)(height*weight); - atomicAddDouble(&NodeGroupArray[i_group].get_spike_array_[i], d_val); - -} diff --git a/src/user_m1.cu b/src/user_m1.cu index c8b3e3b75..6e0bee62a 100644 --- a/src/user_m1.cu +++ b/src/user_m1.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include -#include "user_m1_kernel.h" #include "rk5.h" #include "user_m1.h" +#include "user_m1_kernel.h" +#include +#include +#include namespace user_m1_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m1_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ) { - 
//int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,72 +49,73 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; - for (int i = 0; i -int user_m1::UpdateNR<0>(long long it, double t1) +int +user_m1::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int user_m1::Update(long long it, double t1) { - UpdateNR(it, t1); +int +user_m1::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/user_m1.h b/src/user_m1.h index 5416de7b3..f15f6cef4 100644 --- a/src/user_m1.h +++ b/src/user_m1.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM1_H #define USERM1_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,30 +40,32 @@ struct user_m1_rk5 class user_m1 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m1_rk5 > rk5_; float h_min_; float h_; user_m1_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, 
i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/user_m1_cond_alpha.cu b/src/user_m1_cond_alpha.cu index de23b1216..2afb4fe0e 100644 --- a/src/user_m1_cond_alpha.cu +++ b/src/user_m1_cond_alpha.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include -#include "user_m1_kernel.h" #include "rk5.h" #include "user_m1.h" +#include "user_m1_kernel.h" +#include +#include +#include namespace user_m1_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m1_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,54 +49,54 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; - for (int i = 0; i -int user_m1::UpdateNR<0>(long long it, double t1) +int +user_m1::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int user_m1::Update(long long it, double t1) { - UpdateNR(it, t1); +int +user_m1::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/user_m1_cond_alpha.h b/src/user_m1_cond_alpha.h index 1b07c1806..f6195f99f 100644 --- a/src/user_m1_cond_alpha.h +++ b/src/user_m1_cond_alpha.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM1CONDALPHA_H #define USERM1CONDALPHA_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include 
"cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,30 +40,32 @@ struct user_m1_rk5 class user_m1 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m1_rk5 > rk5_; float h_min_; float h_; user_m1_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/user_m1_cond_alpha_kernel.h b/src/user_m1_cond_alpha_kernel.h index 15f0d8cf5..affc7d0ac 100644 --- a/src/user_m1_cond_alpha_kernel.h +++ b/src/user_m1_cond_alpha_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef USERM1CONDALPHAKERNEL_H #define USERM1CONDALPHAKERNEL_H #include - //#include -#include "spike_buffer.h" +// #include #include "node_group.h" +#include "spike_buffer.h" #include "user_m1.h" -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? 
( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m1_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_g = 0, i_g1, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -69,31 +68,26 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_E_rev = 0, i_tau_syn, i_g0, N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. relative to time resolution N_GROUP_PARAM }; -const std::string user_m1_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; +const std::string user_m1_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m1_port_var_name[N_PORT_VAR] = { - "g", - "g1" -}; +const std::string user_m1_port_var_name[ N_PORT_VAR ] = { "g", "g1" }; -const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m1_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -106,164 +100,157 @@ const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string user_m1_port_param_name[N_PORT_PARAM] = { - "E_rev", - "tau_syn", - "g0" -}; +const std::string user_m1_port_param_name[ N_PORT_PARAM ] = { "E_rev", "tau_syn", "g0" }; -const std::string user_m1_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string user_m1_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define w y[i_w] -#define g(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define g1(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dgdt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define dg1dt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define E_rev(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_E_rev] -#define tau_syn(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_syn] -#define g0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_g0] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define g( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define g1( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dgdt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define dg1dt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define E_rev( i ) param[ N_SCAL_PARAM + 
N_PORT_PARAM * i + i_E_rev ] +#define tau_syn( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_syn ] +#define g0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_g0 ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn = 0.0; - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn += g( i ) * ( E_rev( i ) - V ); } - float V_spike = Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)::round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) ::round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace user_m1_ns template <> -int user_m1::UpdateNR<0>(long long it, double t1); +int user_m1::UpdateNR< 0 >( long long it, double t1 ); -template -int user_m1::UpdateNR(long long it, double t1) +template < int N_PORT > +int +user_m1::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = user_m1_ns::N_SCAL_VAR - + user_m1_ns::N_PORT_VAR*N_PORT; - const int NPARAM = user_m1_ns::N_SCAL_PARAM - + 
user_m1_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = user_m1_ns::N_SCAL_VAR + user_m1_ns::N_PORT_VAR * N_PORT; + const int NPARAM = user_m1_ns::N_SCAL_PARAM + user_m1_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ) { - user_m1_ns::Derivatives(x, y, dydx, param, - data_struct); + user_m1_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - user_m1_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m1_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/user_m1_cond_alpha_rk5.h b/src/user_m1_cond_alpha_rk5.h index b7e2613c8..71c867ea6 100644 --- a/src/user_m1_cond_alpha_rk5.h +++ b/src/user_m1_cond_alpha_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM1CONDALPHARK5_H #define USERM1CONDALPHARK5_H struct user_m1_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void 
ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); #endif diff --git a/src/user_m1_cond_beta.cu b/src/user_m1_cond_beta.cu index 4288800fb..30c486b30 100644 --- a/src/user_m1_cond_beta.cu +++ b/src/user_m1_cond_beta.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include -#include "user_m1_kernel.h" #include "rk5.h" #include "user_m1.h" +#include "user_m1_kernel.h" +#include +#include +#include namespace user_m1_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m1_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,72 +49,73 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; - for (int i = 0; i -int user_m1::UpdateNR<0>(long long it, double t1) +int +user_m1::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int user_m1::Update(long long it, double t1) { - UpdateNR(it, t1); +int +user_m1::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/user_m1_cond_beta.h b/src/user_m1_cond_beta.h index 4333719bd..de3b5e4ad 100644 
--- a/src/user_m1_cond_beta.h +++ b/src/user_m1_cond_beta.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM1CONDBETA_H #define USERM1CONDBETA_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,30 +40,32 @@ struct user_m1_rk5 class user_m1 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m1_rk5 > rk5_; float h_min_; float h_; user_m1_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/user_m1_cond_beta_kernel.h b/src/user_m1_cond_beta_kernel.h index 2d607b4a8..a843902f8 100644 --- a/src/user_m1_cond_beta_kernel.h +++ b/src/user_m1_cond_beta_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef USERM1CONDBETAKERNEL_H #define USERM1CONDBETAKERNEL_H -#include -#include -#include "spike_buffer.h" #include "node_group.h" +#include "spike_buffer.h" #include "user_m1.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? 
( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m1_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_g = 0, i_g1, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -69,7 +68,8 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_E_rev = 0, i_tau_rise, i_tau_decay, @@ -77,25 +77,18 @@ enum PortParamIndexes { N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. relative to time resolution N_GROUP_PARAM }; +const std::string user_m1_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m1_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; - -const std::string user_m1_port_var_name[N_PORT_VAR] = { - "g", - "g1" -}; +const std::string user_m1_port_var_name[ N_PORT_VAR ] = { "g", "g1" }; -const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m1_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -108,165 +101,157 @@ const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string user_m1_port_param_name[N_PORT_PARAM] = { - "E_rev", - "tau_rise", - "tau_decay", - "g0" -}; +const std::string user_m1_port_param_name[ N_PORT_PARAM ] = { "E_rev", "tau_rise", "tau_decay", "g0" }; -const std::string user_m1_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string user_m1_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following 
equations much more readable. // For every rule there is some exceptions! // -#define V_m y[i_V_m] -#define w y[i_w] -#define g(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define g1(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dgdt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define dg1dt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define E_rev(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_E_rev] -#define tau_rise(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_rise] -#define tau_decay(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_decay] -#define g0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_g0] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define g( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define g1( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dgdt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define dg1dt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ 
i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define E_rev( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_E_rev ] +#define tau_rise( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_rise ] +#define tau_decay( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_decay ] +#define g0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_g0 ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn = 0.0; - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn += g( i ) * ( E_rev( i ) - V ); } - float V_spike = Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace user_m1_ns template <> -int user_m1::UpdateNR<0>(long long it, double t1); +int user_m1::UpdateNR< 0 >( long long it, double t1 ); -template -int user_m1::UpdateNR(long long it, double t1) +template < int N_PORT > +int +user_m1::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = user_m1_ns::N_SCAL_VAR - + user_m1_ns::N_PORT_VAR*N_PORT; - const int NPARAM = user_m1_ns::N_SCAL_PARAM - + 
user_m1_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = user_m1_ns::N_SCAL_VAR + user_m1_ns::N_PORT_VAR * N_PORT; + const int NPARAM = user_m1_ns::N_SCAL_PARAM + user_m1_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ) { - user_m1_ns::Derivatives(x, y, dydx, param, - data_struct); + user_m1_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - user_m1_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m1_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/user_m1_cond_beta_rk5.h b/src/user_m1_cond_beta_rk5.h index d5c9763f3..f91b8362e 100644 --- a/src/user_m1_cond_beta_rk5.h +++ b/src/user_m1_cond_beta_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM1CONDBETARK5_H #define USERM1CONDBETARK5_H struct user_m1_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void ExternalUpdate( 
double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); #endif diff --git a/src/user_m1_iaf_psc_exp.cu b/src/user_m1_iaf_psc_exp.cu index 891d1866c..c7d818327 100644 --- a/src/user_m1_iaf_psc_exp.cu +++ b/src/user_m1_iaf_psc_exp.cu @@ -20,91 +20,91 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/user_m1.cpp -#include -#include -#include -#include "user_m1.h" #include "propagator_stability.h" #include "spike_buffer.h" +#include "user_m1.h" +#include +#include +#include using namespace user_m1_ns; extern __constant__ float NESTGPUTimeResolution; -extern __device__ double propagator_32(double, double, double, double); - -#define I_syn_ex var[i_I_syn_ex] -#define I_syn_in var[i_I_syn_in] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] - -#define tau_m param[i_tau_m] -#define C_m param[i_C_m] -#define E_L param[i_E_L] -#define I_e param[i_I_e] -#define Theta_rel param[i_Theta_rel] -#define V_reset_rel param[i_V_reset_rel] -#define tau_ex param[i_tau_ex] -#define tau_in param[i_tau_in] -//#define rho param[i_rho] -//#define delta param[i_delta] -#define t_ref param[i_t_ref] -#define den_delay param[i_den_delay] - -#define P20 param[i_P20] -#define P11ex param[i_P11ex] -#define P11in param[i_P11in] -#define P21ex param[i_P21ex] -#define P21in param[i_P21in] -#define P22 param[i_P22] - -__global__ void user_m1_Calibrate(int n_node, float *param_arr, - int n_param, float h) +extern __device__ double propagator_32( double, 
double, double, double ); + +#define I_syn_ex var[ i_I_syn_ex ] +#define I_syn_in var[ i_I_syn_in ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] + +#define tau_m param[ i_tau_m ] +#define C_m param[ i_C_m ] +#define E_L param[ i_E_L ] +#define I_e param[ i_I_e ] +#define Theta_rel param[ i_Theta_rel ] +#define V_reset_rel param[ i_V_reset_rel ] +#define tau_ex param[ i_tau_ex ] +#define tau_in param[ i_tau_in ] +// #define rho param[i_rho] +// #define delta param[i_delta] +#define t_ref param[ i_t_ref ] +#define den_delay param[ i_den_delay ] + +#define P20 param[ i_P20 ] +#define P11ex param[ i_P11ex ] +#define P11in param[ i_P11in ] +#define P21ex param[ i_P21ex ] +#define P21in param[ i_P21in ] +#define P22 param[ i_P22 ] + +__global__ void +user_m1_Calibrate( int n_node, float* param_arr, int n_param, float h ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn_ex * P21ex + I_syn_in * P21in + I_e * P20; } // exponential decaying PSCs I_syn_ex *= P11ex; I_syn_in *= P11in; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - } + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + } } } @@ -114,87 +114,87 @@ user_m1::~user_m1() FreeParamArr(); } -int user_m1::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +user_m1::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - 
BaseNeuron::Init(i_node_0, n_node, 2 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 2 /*n_port*/, i_group ); node_type_ = i_user_m1_model; n_scal_var_ = N_SCAL_VAR; n_var_ = n_scal_var_; n_scal_param_ = N_SCAL_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); scal_var_name_ = user_m1_scal_var_name; scal_param_name_ = user_m1_scal_param_name; - SetScalParam(0, n_node, "tau_m", 10.0 ); // in ms - SetScalParam(0, n_node, "C_m", 250.0 ); // in pF - SetScalParam(0, n_node, "E_L", -70.0 ); // in mV - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA - SetScalParam(0, n_node, "Theta_rel", -55.0 - (-70.0) ); // relative to E_L_ - SetScalParam(0, n_node, "V_reset_rel", -70.0 - (-70.0) ); // relative to E_L_ - SetScalParam(0, n_node, "tau_ex", 2.0 ); // in ms - SetScalParam(0, n_node, "tau_in", 2.0 ); // in ms + SetScalParam( 0, n_node, "tau_m", 10.0 ); // in ms + SetScalParam( 0, n_node, "C_m", 250.0 ); // in pF + SetScalParam( 0, n_node, "E_L", -70.0 ); // in mV + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "Theta_rel", -55.0 - ( -70.0 ) ); // relative to E_L_ + SetScalParam( 0, n_node, "V_reset_rel", -70.0 - ( -70.0 ) ); // relative to E_L_ + SetScalParam( 0, n_node, "tau_ex", 2.0 ); // in ms + SetScalParam( 0, n_node, "tau_in", 2.0 ); // in ms // SetScalParam(0, n_node, "rho", 0.01 ); // in 1/s // SetScalParam(0, n_node, "delta", 0.0 ); // in mV - SetScalParam(0, n_node, "t_ref", 2.0 ); // in ms - SetScalParam(0, n_node, "den_delay", 0.0); // in ms - SetScalParam(0, n_node, "P20", 0.0); - SetScalParam(0, n_node, "P11ex", 0.0); - SetScalParam(0, n_node, "P11in", 0.0); - SetScalParam(0, n_node, "P21ex", 0.0); - SetScalParam(0, n_node, "P21in", 0.0); - SetScalParam(0, n_node, "P22", 0.0); - - SetScalVar(0, n_node, "I_syn_ex", 0.0 ); - SetScalVar(0, n_node, "I_syn_in", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", -70.0 - (-70.0) ); // in mV, relative to E_L - SetScalVar(0, n_node, "refractory_step", 0 ); + 
SetScalParam( 0, n_node, "t_ref", 2.0 ); // in ms + SetScalParam( 0, n_node, "den_delay", 0.0 ); // in ms + SetScalParam( 0, n_node, "P20", 0.0 ); + SetScalParam( 0, n_node, "P11ex", 0.0 ); + SetScalParam( 0, n_node, "P11in", 0.0 ); + SetScalParam( 0, n_node, "P21ex", 0.0 ); + SetScalParam( 0, n_node, "P21in", 0.0 ); + SetScalParam( 0, n_node, "P22", 0.0 ); + + SetScalVar( 0, n_node, "I_syn_ex", 0.0 ); + SetScalVar( 0, n_node, "I_syn_in", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", -70.0 - ( -70.0 ) ); // in mV, relative to E_L + SetScalVar( 0, n_node, "refractory_step", 0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn_ex, I_syn_in - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn_ex"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn_ex" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 1; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); - + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); + return 0; } -int user_m1::Update(long long it, double t1) +int +user_m1::Update( long long it, double t1 ) { // std::cout << "user_m1 neuron update\n"; - user_m1_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_); + user_m1_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_ ); // gpuErrchk( cudaDeviceSynchronize() ); - + return 0; } -int user_m1::Free() +int +user_m1::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); - + 
return 0; } -int user_m1::Calibrate(double, float time_resolution) +int +user_m1::Calibrate( double, float time_resolution ) { - user_m1_Calibrate<<<(n_node_+1023)/1024, 1024>>> - (n_node_, param_arr_, n_param_, time_resolution); + user_m1_Calibrate<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( n_node_, param_arr_, n_param_, time_resolution ); return 0; } diff --git a/src/user_m1_iaf_psc_exp.h b/src/user_m1_iaf_psc_exp.h index 65a0e288d..5dd27ae55 100644 --- a/src/user_m1_iaf_psc_exp.h +++ b/src/user_m1_iaf_psc_exp.h @@ -20,48 +20,44 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/user_m1.h - #ifndef USERM1IAFPSCEXP_H #define USERM1IAFPSCEXP_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include namespace user_m1_ns { -enum ScalVarIndexes { - i_I_syn_ex = 0, // postsynaptic current for exc. inputs - i_I_syn_in, // postsynaptic current for inh. inputs - i_V_m_rel, // membrane potential - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn_ex = 0, // postsynaptic current for exc. inputs + i_I_syn_in, // postsynaptic current for inh. inputs + i_V_m_rel, // membrane potential + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_tau_m = 0, // Membrane time constant in ms - i_C_m, // Membrane capacitance in pF - i_E_L, // Resting potential in mV - i_I_e, // External current in pA - i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTAIL(!) - // i.e. 
the real threshold is (E_L_+Theta_rel_) - i_V_reset_rel, // relative reset value of the membrane potential - i_tau_ex, // Time constant of excitatory synaptic current in ms - i_tau_in, // Time constant of inhibitory synaptic current in ms +enum ScalParamIndexes +{ + i_tau_m = 0, // Membrane time constant in ms + i_C_m, // Membrane capacitance in pF + i_E_L, // Resting potential in mV + i_I_e, // External current in pA + i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTAIL(!) + // i.e. the real threshold is (E_L_+Theta_rel_) + i_V_reset_rel, // relative reset value of the membrane potential + i_tau_ex, // Time constant of excitatory synaptic current in ms + i_tau_in, // Time constant of inhibitory synaptic current in ms // i_rho, // Stochastic firing intensity at threshold in 1/s // i_delta, // Width of threshold region in mV - i_t_ref, // Refractory period in ms + i_t_ref, // Refractory period in ms i_den_delay, // dendritic backpropagation delay // time evolution operator i_P20, @@ -73,17 +69,9 @@ enum ScalParamIndexes { N_SCAL_PARAM }; - -const std::string user_m1_scal_var_name[N_SCAL_VAR] = { - "I_syn_ex", - "I_syn_in", - "V_m_rel", - "refractory_step" -}; - +const std::string user_m1_scal_var_name[ N_SCAL_VAR ] = { "I_syn_ex", "I_syn_in", "V_m_rel", "refractory_step" }; -const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { - "tau_m", +const std::string user_m1_scal_param_name[ N_SCAL_PARAM ] = { "tau_m", "C_m", "E_L", "I_e", @@ -100,26 +88,22 @@ const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { "P11in", "P21ex", "P21in", - "P22" -}; + "P22" }; + +} // namespace user_m1_ns -} // namespace - class user_m1 : public BaseNeuron { - public: +public: ~user_m1(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double, float time_resolution); - - int Update(long long it, double t1); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); - int Free(); + int Calibrate( double, float time_resolution ); 
-}; + int Update( long long it, double t1 ); + int Free(); +}; #endif diff --git a/src/user_m1_iaf_psc_exp_g.cu b/src/user_m1_iaf_psc_exp_g.cu index 0329fe916..39c0852a4 100644 --- a/src/user_m1_iaf_psc_exp_g.cu +++ b/src/user_m1_iaf_psc_exp_g.cu @@ -20,74 +20,82 @@ * */ - - - - -#include +#include "spike_buffer.h" +#include "user_m1.h" #include +#include #include -#include "user_m1.h" -#include "spike_buffer.h" using namespace user_m1_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn var[i_I_syn] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] - -#define tau_m_ group_param_[i_tau_m] -#define C_m_ group_param_[i_C_m] -#define E_L_ group_param_[i_E_L] -#define Theta_rel_ group_param_[i_Theta_rel] -#define V_reset_rel_ group_param_[i_V_reset_rel] -#define tau_syn_ group_param_[i_tau_syn] -#define t_ref_ group_param_[i_t_ref] - -__global__ void user_m1_Update -( int n_node, int i_node_0, float *var_arr, float *param_arr, int n_var, - int n_param, float Theta_rel, float V_reset_rel, int n_refractory_steps, - float P11, float P22, float P21, float P20 ) +#define I_syn var[ i_I_syn ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] + +#define tau_m_ group_param_[ i_tau_m ] +#define C_m_ group_param_[ i_C_m ] +#define E_L_ group_param_[ i_E_L ] +#define Theta_rel_ group_param_[ i_Theta_rel ] +#define V_reset_rel_ group_param_[ i_V_reset_rel ] +#define tau_syn_ group_param_[ i_tau_syn ] +#define t_ref_ group_param_[ i_t_ref ] + +__global__ void +user_m1_Update( int n_node, + int i_node_0, + float* var_arr, + float* param_arr, + int n_var, + int n_param, + float Theta_rel, + float V_reset_rel, + int n_refractory_steps, + float P11, + float P22, + float P21, + float P20 ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = 
param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn * P21 + I_e * P20; } // exponential decaying PSC I_syn *= P11; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; refractory_step = n_refractory_steps; - } + } } } -double h_propagator_32( double tau_syn, double tau, double C, double h ) +double +h_propagator_32( double tau_syn, double tau, double C, double h ) { - const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h - * ( tau_syn - tau ) * exp( -h / tau ); + const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h * ( tau_syn - tau ) * exp( -h / tau ); const double P32_singular = h / C * exp( -h / tau ); const double P32 = - -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn ) - * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); + -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn ) * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); const double dev_P32 = fabs( P32 - P32_singular ); - if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && dev_P32 > 2.0 - * fabs( P32_linear ) ) ) + if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && dev_P32 > 2.0 * fabs( P32_linear ) ) ) { return P32_singular; } @@ -103,10 +111,10 @@ user_m1::~user_m1() FreeParamArr(); } -int user_m1::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +user_m1::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_user_m1_model; n_scal_var_ = N_SCAL_VAR; @@ -114,46 +122,46 @@ int user_m1::Init(int i_node_0, int n_node, int /*n_port*/, 
n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); - group_param_ = new float[N_GROUP_PARAM]; + group_param_ = new float[ N_GROUP_PARAM ]; scal_var_name_ = user_m1_scal_var_name; scal_param_name_ = user_m1_scal_param_name; group_param_name_ = user_m1_group_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", 0.0 ); // in mV - SetScalVar(0, n_node, "refractory_step", 0 ); + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", 0.0 ); // in mV + SetScalVar( 0, n_node, "refractory_step", 0 ); - SetGroupParam("tau_m", 10.0); - SetGroupParam("C_m", 250.0); - SetGroupParam("E_L", -65.0); - SetGroupParam("Theta_rel", 15.0); - SetGroupParam("V_reset_rel", 0.0); - SetGroupParam("tau_syn", 0.5); - SetGroupParam("t_ref", 2.0); + SetGroupParam( "tau_m", 10.0 ); + SetGroupParam( "C_m", 250.0 ); + SetGroupParam( "E_L", -65.0 ); + SetGroupParam( "Theta_rel", 15.0 ); + SetGroupParam( "V_reset_rel", 0.0 ); + SetGroupParam( "tau_syn", 0.5 ); + SetGroupParam( "t_ref", 2.0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int user_m1::Update(long long it, double t1) +int +user_m1::Update( 
long long it, double t1 ) { // std::cout << "user_m1 neuron update\n"; float h = time_resolution_; @@ -161,21 +169,32 @@ int user_m1::Update(long long it, double t1) float P22 = exp( -h / tau_m_ ); float P21 = h_propagator_32( tau_syn_, tau_m_, C_m_, h ); float P20 = tau_m_ / C_m_ * ( 1.0 - P22 ); - int n_refractory_steps = int(round(t_ref_ / h)); + int n_refractory_steps = int( round( t_ref_ / h ) ); + + user_m1_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( n_node_, + i_node_0_, + var_arr_, + param_arr_, + n_var_, + n_param_, + Theta_rel_, + V_reset_rel_, + n_refractory_steps, + P11, + P22, + P21, + P20 ); + // gpuErrchk( cudaDeviceSynchronize() ); - user_m1_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, - Theta_rel_, V_reset_rel_, n_refractory_steps, P11, P22, P21, P20 ); - //gpuErrchk( cudaDeviceSynchronize() ); - return 0; } -int user_m1::Free() +int +user_m1::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); delete[] group_param_; - + return 0; } diff --git a/src/user_m1_iaf_psc_exp_g.h b/src/user_m1_iaf_psc_exp_g.h index 972d6553e..99e1e13ba 100644 --- a/src/user_m1_iaf_psc_exp_g.h +++ b/src/user_m1_iaf_psc_exp_g.h @@ -20,97 +20,76 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/iaf_psc_exp.h - #ifndef USERM1IAFPSCEXPG_H #define USERM1IAFPSCEXPG_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include namespace user_m1_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m_rel, // membrane potential relative to E_L - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. 
inputs + i_V_m_rel, // membrane potential relative to E_L + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA N_SCAL_PARAM }; -enum GroupParamIndexes { - i_tau_m = 0, // Membrane time constant in ms - i_C_m, // Membrane capacitance in pF - i_E_L, // Resting potential in mV - i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTIAL(!) - // i.e. the real threshold is (E_L_+Theta_rel_) - i_V_reset_rel, // relative reset value of the membrane potential - i_tau_syn, // Time constant of synaptic current in ms - i_t_ref, // Refractory period in ms +enum GroupParamIndexes +{ + i_tau_m = 0, // Membrane time constant in ms + i_C_m, // Membrane capacitance in pF + i_E_L, // Resting potential in mV + i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTIAL(!) + // i.e. the real threshold is (E_L_+Theta_rel_) + i_V_reset_rel, // relative reset value of the membrane potential + i_tau_syn, // Time constant of synaptic current in ms + i_t_ref, // Refractory period in ms N_GROUP_PARAM }; +const std::string user_m1_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m_rel", "refractory_step" }; - -const std::string user_m1_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m_rel", - "refractory_step" -}; - -const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { - "I_e" -}; - -const std::string user_m1_group_param_name[N_GROUP_PARAM] = { - "tau_m", - "C_m", - "E_L", - "Theta_rel", - "V_reset_rel", - "tau_syn", - "t_ref" -}; - -} // namespace - +const std::string user_m1_scal_param_name[ N_SCAL_PARAM ] = { "I_e" }; +const std::string + user_m1_group_param_name[ N_GROUP_PARAM ] = { "tau_m", "C_m", "E_L", "Theta_rel", "V_reset_rel", "tau_syn", "t_ref" }; +} // namespace user_m1_ns class user_m1 : public BaseNeuron { float time_resolution_; - public: +public: ~user_m1(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int 
Calibrate(double /*time_min*/, float time_res) { + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int + Calibrate( double /*time_min*/, float time_res ) + { time_resolution_ = time_res; return 0; } - - int Update(long long it, double t1); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/user_m1_iaf_psc_exp_hc.cu b/src/user_m1_iaf_psc_exp_hc.cu index 5662b5be9..b68c37105 100644 --- a/src/user_m1_iaf_psc_exp_hc.cu +++ b/src/user_m1_iaf_psc_exp_hc.cu @@ -20,51 +20,50 @@ * */ - - - - -#include +#include "spike_buffer.h" +#include "user_m1_hc.h" #include +#include #include -#include "user_m1_hc.h" -#include "spike_buffer.h" using namespace user_m1_hc_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn var[i_I_syn] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] +#define I_syn var[ i_I_syn ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] #include "user_m1_hc_params.h" -__global__ void user_m1_hc_Update(int n_node, int i_node_0, - float *var_arr, float *param_arr, - int n_var, int n_param) +__global__ void +user_m1_hc_Update( int n_node, int i_node_0, float* var_arr, float* param_arr, int n_var, int n_param ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn * P21 + I_e * P20; } // exponential decaying PSC I_syn *= P11; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( 
i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; refractory_step = n_refractory_steps; - } + } } } @@ -74,59 +73,60 @@ user_m1_hc::~user_m1_hc() FreeParamArr(); } -int user_m1_hc::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +user_m1_hc::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_user_m1_hc_model; n_scal_var_ = N_SCAL_VAR; n_var_ = n_scal_var_; n_scal_param_ = N_SCAL_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); scal_var_name_ = user_m1_hc_scal_var_name; scal_param_name_ = user_m1_hc_scal_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", 0.0 ); // in mV - SetScalVar(0, n_node, "refractory_step", 0 ); + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", 0.0 ); // in mV + SetScalVar( 0, n_node, "refractory_step", 0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int user_m1_hc::Update(long long it, double t1) +int +user_m1_hc::Update( long long it, double t1 ) { // std::cout << "user_m1_hc neuron update\n"; - 
user_m1_hc_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_); - //gpuErrchk( cudaDeviceSynchronize() ); - + user_m1_hc_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_ ); + // gpuErrchk( cudaDeviceSynchronize() ); + return 0; } -int user_m1_hc::Free() +int +user_m1_hc::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); - + return 0; } diff --git a/src/user_m1_iaf_psc_exp_hc.h b/src/user_m1_iaf_psc_exp_hc.h index d085962a6..5457ba99b 100644 --- a/src/user_m1_iaf_psc_exp_hc.h +++ b/src/user_m1_iaf_psc_exp_hc.h @@ -20,65 +20,51 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/user_m1.h - #ifndef USERM1IAFPSCEXPHC_H #define USERM1IAFPSCEXPHC_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include namespace user_m1_hc_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m_rel, // membrane potential relative to E_L - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. 
inputs + i_V_m_rel, // membrane potential relative to E_L + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA N_SCAL_PARAM }; - const std::string user_m1_hc_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m_rel", - "refractory_step" -}; +const std::string user_m1_hc_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m_rel", "refractory_step" }; -const std::string user_m1_hc_scal_param_name[N_SCAL_PARAM] = { - "I_e" -}; +const std::string user_m1_hc_scal_param_name[ N_SCAL_PARAM ] = { "I_e" }; -} // namespace - +} // namespace user_m1_hc_ns class user_m1_hc : public BaseNeuron { - public: +public: ~user_m1_hc(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Update(long long it, double t1); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/user_m1_kernel.h b/src/user_m1_kernel.h index 377d2147c..6eba2de2b 100644 --- a/src/user_m1_kernel.h +++ b/src/user_m1_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef USERM1KERNEL_H #define USERM1KERNEL_H -#include -#include -#include "spike_buffer.h" #include "node_group.h" +#include "spike_buffer.h" #include "user_m1.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? 
( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m1_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_g = 0, i_g1, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -69,7 +68,8 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_E_rev = 0, i_tau_rise, i_tau_decay, @@ -77,25 +77,18 @@ enum PortParamIndexes { N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. relative to time resolution N_GROUP_PARAM }; +const std::string user_m1_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m1_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; - -const std::string user_m1_port_var_name[N_PORT_VAR] = { - "g", - "g1" -}; +const std::string user_m1_port_var_name[ N_PORT_VAR ] = { "g", "g1" }; -const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m1_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -108,165 +101,157 @@ const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string user_m1_port_param_name[N_PORT_PARAM] = { - "E_rev", - "tau_rise", - "tau_decay", - "g0" -}; +const std::string user_m1_port_param_name[ N_PORT_PARAM ] = { "E_rev", "tau_rise", "tau_decay", "g0" }; -const std::string user_m1_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string user_m1_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following 
equations much more readable. // For every rule there is some exceptions! // -#define V_m y[i_V_m] -#define w y[i_w] -#define g(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define g1(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dgdt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define dg1dt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define E_rev(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_E_rev] -#define tau_rise(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_rise] -#define tau_decay(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_decay] -#define g0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_g0] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define g( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define g1( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dgdt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define dg1dt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ 
i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define E_rev( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_E_rev ] +#define tau_rise( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_rise ] +#define tau_decay( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_decay ] +#define g0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_g0 ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn = 0.0; - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn += g( i ) * ( E_rev( i ) - V ); } - float V_spike = Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace user_m1_ns template <> -int user_m1::UpdateNR<0>(long long it, double t1); +int user_m1::UpdateNR< 0 >( long long it, double t1 ); -template -int user_m1::UpdateNR(long long it, double t1) +template < int N_PORT > +int +user_m1::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = user_m1_ns::N_SCAL_VAR - + user_m1_ns::N_PORT_VAR*N_PORT; - const int NPARAM = user_m1_ns::N_SCAL_PARAM - + 
user_m1_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = user_m1_ns::N_SCAL_VAR + user_m1_ns::N_PORT_VAR * N_PORT; + const int NPARAM = user_m1_ns::N_SCAL_PARAM + user_m1_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ) { - user_m1_ns::Derivatives(x, y, dydx, param, - data_struct); + user_m1_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - user_m1_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m1_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/user_m1_psc_alpha.cu b/src/user_m1_psc_alpha.cu index 010177733..e050a365f 100644 --- a/src/user_m1_psc_alpha.cu +++ b/src/user_m1_psc_alpha.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include -#include "user_m1_kernel.h" #include "rk5.h" #include "user_m1.h" +#include "user_m1_kernel.h" +#include +#include +#include namespace user_m1_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m1_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = 
threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,56 +49,57 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0.0; refractory_step = 0; - for (int i = 0; i -int user_m1::UpdateNR<0>(long long it, double t1) +int +user_m1::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int user_m1::Update(long long it, double t1) { - UpdateNR(it, t1); +int +user_m1::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/user_m1_psc_alpha.h b/src/user_m1_psc_alpha.h index e1df78865..4593f07f9 100644 --- a/src/user_m1_psc_alpha.h +++ b/src/user_m1_psc_alpha.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM1PSCALPHA_H #define USERM1PSCALPHA_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,30 +40,32 @@ struct user_m1_rk5 class user_m1 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m1_rk5 > rk5_; float h_min_; float h_; user_m1_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int 
n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/user_m1_psc_alpha_kernel.h b/src/user_m1_psc_alpha_kernel.h index e866044ab..9874d2349 100644 --- a/src/user_m1_psc_alpha_kernel.h +++ b/src/user_m1_psc_alpha_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef USERM1PSCALPHAKERNEL_H #define USERM1PSCALPHAKERNEL_H -#include -#include -#include "spike_buffer.h" #include "node_group.h" +#include "spike_buffer.h" #include "user_m1.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m1_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_I_syn = 0, i_I1_syn, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -69,31 +68,25 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_tau_syn = 0, i_I0, N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. 
relative to time resolution N_GROUP_PARAM }; +const std::string user_m1_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m1_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; - -const std::string user_m1_port_var_name[N_PORT_VAR] = { - "I_syn", - "I1_syn" -}; +const std::string user_m1_port_var_name[ N_PORT_VAR ] = { "I_syn", "I1_syn" }; -const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m1_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -106,162 +99,155 @@ const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string user_m1_port_param_name[N_PORT_PARAM] = { - "tau_syn", - "I0" -}; +const std::string user_m1_port_param_name[ N_PORT_PARAM ] = { "tau_syn", "I0" }; -const std::string user_m1_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string user_m1_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define w y[i_w] -#define I_syn(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] -#define I1_syn(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_I1_syn] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dI_syndt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] -#define dI1_syndt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_I1_syn] -#define I0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_I0] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define tau_syn(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_syn] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define I_syn( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] +#define I1_syn( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_I1_syn ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dI_syndt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] +#define dI1_syndt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_I1_syn ] +#define I0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_I0 ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ 
i_den_delay ] + +#define tau_syn( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_syn ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn_tot = 0.0; - - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn_tot += I_syn( i ); } - float V_spike = Delta_T == 0. ? 0. : Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T == 0. ? 0. : Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn_tot - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn_tot - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace user_m1_ns template <> -int user_m1::UpdateNR<0>(long long it, double t1); +int user_m1::UpdateNR< 0 >( long long it, double t1 ); -template -int user_m1::UpdateNR(long long it, double t1) +template < int N_PORT > +int +user_m1::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = user_m1_ns::N_SCAL_VAR - + user_m1_ns::N_PORT_VAR*N_PORT; - const int NPARAM = user_m1_ns::N_SCAL_PARAM - + 
user_m1_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = user_m1_ns::N_SCAL_VAR + user_m1_ns::N_PORT_VAR * N_PORT; + const int NPARAM = user_m1_ns::N_SCAL_PARAM + user_m1_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ) { - user_m1_ns::Derivatives(x, y, dydx, param, - data_struct); + user_m1_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - user_m1_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m1_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/user_m1_psc_alpha_rk5.h b/src/user_m1_psc_alpha_rk5.h index 328ec8532..9854e404e 100644 --- a/src/user_m1_psc_alpha_rk5.h +++ b/src/user_m1_psc_alpha_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM1PSCALPHARK5_H #define USERM1PSCALPHARK5_H struct user_m1_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void ExternalUpdate( 
double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); #endif diff --git a/src/user_m1_psc_delta.cu b/src/user_m1_psc_delta.cu index adee2d8eb..0b476a2fc 100644 --- a/src/user_m1_psc_delta.cu +++ b/src/user_m1_psc_delta.cu @@ -20,25 +20,20 @@ * */ - - - - -#include -#include -#include -#include "user_m1_kernel.h" #include "rk5.h" #include "user_m1.h" +#include "user_m1_kernel.h" +#include +#include +#include namespace user_m1_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m1_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; V_th = -50.4; Delta_T = 2.0; @@ -53,100 +48,100 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - //int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; refractory_step = 0; // set the right threshold depending on 
Delta_T - if (Delta_T <= 0.0) { + if ( Delta_T <= 0.0 ) + { V_peak = V_th; // same as IAF dynamics for spikes if Delta_T == 0. } } -} +} // namespace user_m1_ns -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ) { - user_m1_ns::NodeInit(n_var, n_param, x, y, param, data_struct); + user_m1_ns::NodeInit( n_var, n_param, x, y, param, data_struct ); } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ) { - user_m1_ns::NodeCalibrate(n_var, n_param, x, y, param, data_struct); + user_m1_ns::NodeCalibrate( n_var, n_param, x, y, param, data_struct ); } using namespace user_m1_ns; -int user_m1::Init(int i_node_0, int n_node, int n_port, - int i_group) { - BaseNeuron::Init(i_node_0, n_node, n_port, i_group); +int +user_m1::Init( int i_node_0, int n_node, int n_port, int i_group ) +{ + BaseNeuron::Init( i_node_0, n_node, n_port, i_group ); node_type_ = i_user_m1_model; n_scal_var_ = N_SCAL_VAR; n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; - n_var_ = n_scal_var_ + n_port_var_*n_port; - n_param_ = n_scal_param_ + n_port_param_*n_port; + n_var_ = n_scal_var_ + n_port_var_ * n_port; + n_param_ = n_scal_param_ + n_port_param_ * n_port; + + group_param_ = new float[ N_GROUP_PARAM ]; - group_param_ = new float[N_GROUP_PARAM]; - scal_var_name_ = user_m1_scal_var_name; scal_param_name_ = user_m1_scal_param_name; group_param_name_ = user_m1_group_param_name; - //rk5_data_struct_.node_type_ = i_user_m1_model; + // rk5_data_struct_.node_type_ = i_user_m1_model; rk5_data_struct_.i_node_0_ = i_node_0_; - SetGroupParam("h_min_rel", 1.0e-3); - SetGroupParam("h0_rel", 1.0e-2); - h_ = h0_rel_* 0.1; - - rk5_.Init(n_node, n_var_, 
n_param_, 0.0, h_, rk5_data_struct_); + SetGroupParam( "h_min_rel", 1.0e-3 ); + SetGroupParam( "h0_rel", 1.0e-2 ); + h_ = h0_rel_ * 0.1; + + rk5_.Init( n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_ ); var_arr_ = rk5_.GetYArr(); param_arr_ = rk5_.GetParamArr(); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - port_input_arr_ = GetVarArr() + GetScalVarIdx("V_m"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "V_m" ); port_input_arr_step_ = n_var_; port_input_port_step_ = n_port_var_; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); return 0; } -int user_m1::Calibrate(double time_min, float time_resolution) +int +user_m1::Calibrate( double time_min, float time_resolution ) { - h_min_ = h_min_rel_* time_resolution; - h_ = h0_rel_* time_resolution; - rk5_.Calibrate(time_min, h_, rk5_data_struct_); - + h_min_ = h_min_rel_ * time_resolution; + h_ = h0_rel_ * time_resolution; + rk5_.Calibrate( time_min, h_, rk5_data_struct_ ); + return 0; } -int user_m1::Update(long long it, double t1) +int +user_m1::Update( long long it, double t1 ) { - rk5_.Update(t1, h_min_, rk5_data_struct_); - + rk5_.Update< N_SCAL_VAR, N_SCAL_PARAM >( t1, h_min_, rk5_data_struct_ ); + return 0; } diff --git a/src/user_m1_psc_delta.h b/src/user_m1_psc_delta.h index 92cbdf5aa..9cb4de780 100644 --- a/src/user_m1_psc_delta.h +++ b/src/user_m1_psc_delta.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM1PSCDELTA_H #define USERM1PSCDELTA_H -#include -#include -#include 
"cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,27 +40,29 @@ struct user_m1_rk5 class user_m1 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m1_rk5 > rk5_; float h_min_; float h_; user_m1_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - }; #endif diff --git a/src/user_m1_psc_delta_kernel.h b/src/user_m1_psc_delta_kernel.h index 212c79a1d..bfc60979a 100644 --- a/src/user_m1_psc_delta_kernel.h +++ b/src/user_m1_psc_delta_kernel.h @@ -20,36 +20,35 @@ * */ - - - - #ifndef USERM1PSCDELTAKERNEL_H #define USERM1PSCDELTAKERNEL_H -#include -#include -#include "spike_buffer.h" #include "node_group.h" +#include "spike_buffer.h" #include "user_m1.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? 
( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m1_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ N_PORT_VAR = 0 }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -67,20 +66,16 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. relative to time resolution N_GROUP_PARAM }; +const std::string user_m1_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m1_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; - -const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m1_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -93,121 +88,110 @@ const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; - -const std::string user_m1_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; + "den_delay" }; +const std::string user_m1_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define w y[i_w] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ) { - - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - float V_spike = Delta_T == 0. ? 0. : Delta_T*exp((V - V_th)/Delta_T); + float V = ( refractory_step > 0 ) ? V_reset : MIN( V_m, V_peak ); + + float V_spike = Delta_T == 0. ? 0. : Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 
0 : ( -g_L * ( V - E_L - V_spike ) - w + I_e ) / C_m; // Adaptation current w. - dwdt = (a*(V - E_L) - w) / tau_w; + dwdt = ( a * ( V - E_L ) - w ) / tau_w; } - template //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } +}; // namespace user_m1_ns -}; - - -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ) { - user_m1_ns::Derivatives(x, y, dydx, param, - data_struct); + 
user_m1_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - user_m1_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m1_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/user_m1_psc_delta_rk5.h b/src/user_m1_psc_delta_rk5.h index 0ea128b9d..eed3a0a03 100644 --- a/src/user_m1_psc_delta_rk5.h +++ b/src/user_m1_psc_delta_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM1PSCDELTARK5_H #define USERM1PSCDELTARK5_H struct user_m1_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); #endif diff --git a/src/user_m1_psc_exp.cu b/src/user_m1_psc_exp.cu index 948e68d11..7844abdce 100644 --- a/src/user_m1_psc_exp.cu +++ b/src/user_m1_psc_exp.cu @@ -20,26 +20,21 @@ * */ - - - - 
-#include -#include -#include -#include "user_m1_kernel.h" #include "rk5.h" #include "user_m1.h" +#include "user_m1_kernel.h" +#include +#include +#include namespace user_m1_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m1_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,52 +49,52 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; - for (int i = 0; i -int user_m1::UpdateNR<0>(long long it, double t1) +int +user_m1::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int user_m1::Update(long long it, double t1) { - UpdateNR(it, t1); +int +user_m1::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/user_m1_psc_exp.h b/src/user_m1_psc_exp.h index 3293f6690..78e4dbe38 100644 --- a/src/user_m1_psc_exp.h +++ b/src/user_m1_psc_exp.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM1PSCEXP_H #define USERM1PSCEXP_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,30 +40,32 @@ struct user_m1_rk5 class user_m1 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m1_rk5 > rk5_; float h_min_; float h_; user_m1_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - 
int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/user_m1_psc_exp_g.cu b/src/user_m1_psc_exp_g.cu index a084fd59a..82160f541 100644 --- a/src/user_m1_psc_exp_g.cu +++ b/src/user_m1_psc_exp_g.cu @@ -20,74 +20,82 @@ * */ - - - - -#include +#include "spike_buffer.h" +#include "user_m1.h" #include +#include #include -#include "user_m1.h" -#include "spike_buffer.h" using namespace user_m1_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn var[i_I_syn] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] - -#define tau_m_ group_param_[i_tau_m] -#define C_m_ group_param_[i_C_m] -#define E_L_ group_param_[i_E_L] -#define Theta_rel_ group_param_[i_Theta_rel] -#define V_reset_rel_ group_param_[i_V_reset_rel] -#define tau_syn_ group_param_[i_tau_syn] -#define t_ref_ group_param_[i_t_ref] - -__global__ void user_m1_Update -( int n_node, int i_node_0, float *var_arr, float *param_arr, int n_var, - int n_param, float Theta_rel, float V_reset_rel, int n_refractory_steps, - float P11, float P22, float P21, float P20 ) +#define I_syn var[ i_I_syn ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] + +#define tau_m_ group_param_[ i_tau_m ] +#define C_m_ group_param_[ i_C_m 
] +#define E_L_ group_param_[ i_E_L ] +#define Theta_rel_ group_param_[ i_Theta_rel ] +#define V_reset_rel_ group_param_[ i_V_reset_rel ] +#define tau_syn_ group_param_[ i_tau_syn ] +#define t_ref_ group_param_[ i_t_ref ] + +__global__ void +user_m1_Update( int n_node, + int i_node_0, + float* var_arr, + float* param_arr, + int n_var, + int n_param, + float Theta_rel, + float V_reset_rel, + int n_refractory_steps, + float P11, + float P22, + float P21, + float P20 ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn * P21 + I_e * P20; } // exponential decaying PSC I_syn *= P11; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; refractory_step = n_refractory_steps; - } + } } } -double h_propagator_32( double tau_syn, double tau, double C, double h ) +double +h_propagator_32( double tau_syn, double tau, double C, double h ) { - const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h - * ( tau_syn - tau ) * exp( -h / tau ); + const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h * ( tau_syn - tau ) * exp( -h / tau ); const double P32_singular = h / C * exp( -h / tau ); const double P32 = - -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn ) - * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); + -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn ) * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); const double dev_P32 = fabs( P32 - P32_singular ); - if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && 
dev_P32 > 2.0 - * fabs( P32_linear ) ) ) + if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && dev_P32 > 2.0 * fabs( P32_linear ) ) ) { return P32_singular; } @@ -103,10 +111,10 @@ user_m1::~user_m1() FreeParamArr(); } -int user_m1::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +user_m1::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_user_m1_model; n_scal_var_ = N_SCAL_VAR; @@ -114,46 +122,46 @@ int user_m1::Init(int i_node_0, int n_node, int /*n_port*/, n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); - group_param_ = new float[N_GROUP_PARAM]; + group_param_ = new float[ N_GROUP_PARAM ]; scal_var_name_ = user_m1_scal_var_name; scal_param_name_ = user_m1_scal_param_name; group_param_name_ = user_m1_group_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", 0.0 ); // in mV - SetScalVar(0, n_node, "refractory_step", 0 ); + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", 0.0 ); // in mV + SetScalVar( 0, n_node, "refractory_step", 0 ); - SetGroupParam("tau_m", 10.0); - SetGroupParam("C_m", 250.0); - SetGroupParam("E_L", -65.0); - SetGroupParam("Theta_rel", 15.0); - SetGroupParam("V_reset_rel", 0.0); - SetGroupParam("tau_syn", 0.5); - SetGroupParam("t_ref", 2.0); + SetGroupParam( "tau_m", 10.0 ); + SetGroupParam( "C_m", 250.0 ); + SetGroupParam( "E_L", -65.0 ); + SetGroupParam( "Theta_rel", 15.0 ); + SetGroupParam( "V_reset_rel", 0.0 ); + SetGroupParam( "tau_syn", 0.5 ); + SetGroupParam( "t_ref", 2.0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, 
sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int user_m1::Update(long long it, double t1) +int +user_m1::Update( long long it, double t1 ) { // std::cout << "user_m1 neuron update\n"; float h = time_resolution_; @@ -161,21 +169,32 @@ int user_m1::Update(long long it, double t1) float P22 = exp( -h / tau_m_ ); float P21 = h_propagator_32( tau_syn_, tau_m_, C_m_, h ); float P20 = tau_m_ / C_m_ * ( 1.0 - P22 ); - int n_refractory_steps = int(round(t_ref_ / h)); + int n_refractory_steps = int( round( t_ref_ / h ) ); + + user_m1_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( n_node_, + i_node_0_, + var_arr_, + param_arr_, + n_var_, + n_param_, + Theta_rel_, + V_reset_rel_, + n_refractory_steps, + P11, + P22, + P21, + P20 ); + // gpuErrchk( cudaDeviceSynchronize() ); - user_m1_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, - Theta_rel_, V_reset_rel_, n_refractory_steps, P11, P22, P21, P20 ); - //gpuErrchk( cudaDeviceSynchronize() ); - return 0; } -int user_m1::Free() +int +user_m1::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); delete[] group_param_; - + return 0; } diff --git a/src/user_m1_psc_exp_g.h b/src/user_m1_psc_exp_g.h index 62a071ec9..222217a19 100644 --- a/src/user_m1_psc_exp_g.h +++ b/src/user_m1_psc_exp_g.h @@ -20,97 +20,76 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/iaf_psc_exp.h - #ifndef USERM1PSCEXPG_H #define 
USERM1PSCEXPG_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include namespace user_m1_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m_rel, // membrane potential relative to E_L - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. inputs + i_V_m_rel, // membrane potential relative to E_L + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA N_SCAL_PARAM }; -enum GroupParamIndexes { - i_tau_m = 0, // Membrane time constant in ms - i_C_m, // Membrane capacitance in pF - i_E_L, // Resting potential in mV - i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTIAL(!) - // i.e. the real threshold is (E_L_+Theta_rel_) - i_V_reset_rel, // relative reset value of the membrane potential - i_tau_syn, // Time constant of synaptic current in ms - i_t_ref, // Refractory period in ms +enum GroupParamIndexes +{ + i_tau_m = 0, // Membrane time constant in ms + i_C_m, // Membrane capacitance in pF + i_E_L, // Resting potential in mV + i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTIAL(!) + // i.e. 
the real threshold is (E_L_+Theta_rel_) + i_V_reset_rel, // relative reset value of the membrane potential + i_tau_syn, // Time constant of synaptic current in ms + i_t_ref, // Refractory period in ms N_GROUP_PARAM }; +const std::string user_m1_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m_rel", "refractory_step" }; - -const std::string user_m1_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m_rel", - "refractory_step" -}; - -const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { - "I_e" -}; - -const std::string user_m1_group_param_name[N_GROUP_PARAM] = { - "tau_m", - "C_m", - "E_L", - "Theta_rel", - "V_reset_rel", - "tau_syn", - "t_ref" -}; - -} // namespace - +const std::string user_m1_scal_param_name[ N_SCAL_PARAM ] = { "I_e" }; +const std::string + user_m1_group_param_name[ N_GROUP_PARAM ] = { "tau_m", "C_m", "E_L", "Theta_rel", "V_reset_rel", "tau_syn", "t_ref" }; +} // namespace user_m1_ns class user_m1 : public BaseNeuron { float time_resolution_; - public: +public: ~user_m1(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double /*time_min*/, float time_res) { + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int + Calibrate( double /*time_min*/, float time_res ) + { time_resolution_ = time_res; return 0; } - - int Update(long long it, double t1); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/user_m1_psc_exp_hc.cu b/src/user_m1_psc_exp_hc.cu index 83a9fa0d4..e5565ddd5 100644 --- a/src/user_m1_psc_exp_hc.cu +++ b/src/user_m1_psc_exp_hc.cu @@ -20,51 +20,50 @@ * */ - - - - -#include +#include "spike_buffer.h" +#include "user_m1_hc.h" #include +#include #include -#include "user_m1_hc.h" -#include "spike_buffer.h" using namespace user_m1_hc_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn var[i_I_syn] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] +#define I_syn var[ 
i_I_syn ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] #include "user_m1_hc_params.h" -__global__ void user_m1_hc_Update(int n_node, int i_node_0, - float *var_arr, float *param_arr, - int n_var, int n_param) +__global__ void +user_m1_hc_Update( int n_node, int i_node_0, float* var_arr, float* param_arr, int n_var, int n_param ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn * P21 + I_e * P20; } // exponential decaying PSC I_syn *= P11; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; refractory_step = n_refractory_steps; - } + } } } @@ -74,59 +73,60 @@ user_m1_hc::~user_m1_hc() FreeParamArr(); } -int user_m1_hc::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +user_m1_hc::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_user_m1_hc_model; n_scal_var_ = N_SCAL_VAR; n_var_ = n_scal_var_; n_scal_param_ = N_SCAL_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); scal_var_name_ = user_m1_hc_scal_var_name; scal_param_name_ = user_m1_hc_scal_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", 0.0 ); // in mV - SetScalVar(0, n_node, 
"refractory_step", 0 ); + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", 0.0 ); // in mV + SetScalVar( 0, n_node, "refractory_step", 0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int user_m1_hc::Update(long long it, double t1) +int +user_m1_hc::Update( long long it, double t1 ) { // std::cout << "user_m1_hc neuron update\n"; - user_m1_hc_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_); - //gpuErrchk( cudaDeviceSynchronize() ); - + user_m1_hc_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_ ); + // gpuErrchk( cudaDeviceSynchronize() ); + return 0; } -int user_m1_hc::Free() +int +user_m1_hc::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); - + return 0; } diff --git a/src/user_m1_psc_exp_hc.h b/src/user_m1_psc_exp_hc.h index 6450e425b..672bbb1f1 100644 --- a/src/user_m1_psc_exp_hc.h +++ b/src/user_m1_psc_exp_hc.h @@ -20,65 +20,51 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/user_m1.h - #ifndef USERM1PSCEXPHC_H #define USERM1PSCEXPHC_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include 
"node_group.h" +#include +#include namespace user_m1_hc_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m_rel, // membrane potential relative to E_L - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. inputs + i_V_m_rel, // membrane potential relative to E_L + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA N_SCAL_PARAM }; - const std::string user_m1_hc_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m_rel", - "refractory_step" -}; +const std::string user_m1_hc_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m_rel", "refractory_step" }; -const std::string user_m1_hc_scal_param_name[N_SCAL_PARAM] = { - "I_e" -}; +const std::string user_m1_hc_scal_param_name[ N_SCAL_PARAM ] = { "I_e" }; -} // namespace - +} // namespace user_m1_hc_ns class user_m1_hc : public BaseNeuron { - public: +public: ~user_m1_hc(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Update(long long it, double t1); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/user_m1_psc_exp_hc_params.h b/src/user_m1_psc_exp_hc_params.h index cd5bd480f..bc34c5b6e 100644 --- a/src/user_m1_psc_exp_hc_params.h +++ b/src/user_m1_psc_exp_hc_params.h @@ -1,7 +1,6 @@ #ifndef USERM1PSCEXPHCPARAMS_H #define USERM1PSCEXPHCPARAMS_H - #define P11 8.1873075E-01 #define P22 9.9004983E-01 #define P21 3.6067175E-04 diff --git a/src/user_m1_psc_exp_kernel.h b/src/user_m1_psc_exp_kernel.h index 7133ff8c4..d67dd9af9 100644 --- a/src/user_m1_psc_exp_kernel.h +++ b/src/user_m1_psc_exp_kernel.h @@ -20,37 +20,36 @@ * */ - - - - #ifndef USERM1PSCEXPKERNEL_H #define USERM1PSCEXPKERNEL_H -#include -#include -#include "spike_buffer.h" #include 
"node_group.h" +#include "spike_buffer.h" #include "user_m1.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m1_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_I_syn = 0, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -68,28 +67,24 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_tau_syn = 0, N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. relative to time resolution N_GROUP_PARAM }; -const std::string user_m1_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; +const std::string user_m1_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m1_port_var_name[N_PORT_VAR] = { - "I_syn" -}; +const std::string user_m1_port_var_name[ N_PORT_VAR ] = { "I_syn" }; -const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m1_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -102,157 +97,153 @@ const std::string user_m1_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string user_m1_port_param_name[N_PORT_PARAM] = { +const std::string user_m1_port_param_name[ N_PORT_PARAM ] = { "tau_syn", }; -const std::string user_m1_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string user_m1_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // 
following equations much more readable. // For every rule there is some exceptions! // -#define V_m y[i_V_m] -#define w y[i_w] -#define I_syn(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dI_syndt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define tau_syn(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_syn] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define I_syn( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dI_syndt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define tau_syn( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_syn ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* 
dydx, float* param, user_m1_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn_tot = 0.0; - - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn_tot += I_syn( i ); } - float V_spike = Delta_T == 0. ? 0. : Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T == 0. ? 0. : Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn_tot - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn_tot - w + I_e ) / C_m; // Adaptation current w. - dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = 
V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace user_m1_ns template <> -int user_m1::UpdateNR<0>(long long it, double t1); +int user_m1::UpdateNR< 0 >( long long it, double t1 ); -template -int user_m1::UpdateNR(long long it, double t1) +template < int N_PORT > +int +user_m1::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = user_m1_ns::N_SCAL_VAR - + user_m1_ns::N_PORT_VAR*N_PORT; - const int NPARAM = user_m1_ns::N_SCAL_PARAM - + user_m1_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = user_m1_ns::N_SCAL_VAR + user_m1_ns::N_PORT_VAR * N_PORT; + const int NPARAM = user_m1_ns::N_SCAL_PARAM + user_m1_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ) { - user_m1_ns::Derivatives(x, y, dydx, param, - data_struct); + user_m1_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ) { - user_m1_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m1_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif 
diff --git a/src/user_m1_psc_exp_rk5.h b/src/user_m1_psc_exp_rk5.h index 48c513bbd..8ca1e174e 100644 --- a/src/user_m1_psc_exp_rk5.h +++ b/src/user_m1_psc_exp_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM1PSCEXPRK5_H #define USERM1PSCEXPRK5_H struct user_m1_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m1_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); #endif diff --git a/src/user_m1_rk5.h b/src/user_m1_rk5.h index 2692c5b69..9669de8cb 100644 --- a/src/user_m1_rk5.h +++ b/src/user_m1_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM1RK5_H #define USERM1RK5_H struct user_m1_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m1_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m1_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m1_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void ExternalUpdate( double x, float* y, float* param, bool end_time_step, 
user_m1_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m1_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m1_rk5 data_struct ); #endif diff --git a/src/user_m2.cu b/src/user_m2.cu index f4ea09d90..42d7d23be 100644 --- a/src/user_m2.cu +++ b/src/user_m2.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include -#include "user_m2_kernel.h" #include "rk5.h" #include "user_m2.h" +#include "user_m2_kernel.h" +#include +#include +#include namespace user_m2_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m2_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,72 +49,73 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; - for (int i = 0; i -int user_m2::UpdateNR<0>(long long it, double t1) +int +user_m2::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int user_m2::Update(long long it, double t1) { - UpdateNR(it, t1); +int +user_m2::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/user_m2.h b/src/user_m2.h index 28afd73a9..eeb939d50 100644 --- a/src/user_m2.h +++ b/src/user_m2.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM2_H #define USERM2_H -#include -#include 
-#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,30 +40,32 @@ struct user_m2_rk5 class user_m2 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m2_rk5 > rk5_; float h_min_; float h_; user_m2_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/user_m2_cond_alpha.cu b/src/user_m2_cond_alpha.cu index b6bead719..705f51922 100644 --- a/src/user_m2_cond_alpha.cu +++ b/src/user_m2_cond_alpha.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include -#include "user_m2_kernel.h" #include "rk5.h" #include "user_m2.h" +#include "user_m2_kernel.h" +#include +#include +#include namespace user_m2_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m2_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int 
n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,54 +49,54 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; - for (int i = 0; i -int user_m2::UpdateNR<0>(long long it, double t1) +int +user_m2::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int user_m2::Update(long long it, double t1) { - UpdateNR(it, t1); +int +user_m2::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/user_m2_cond_alpha.h b/src/user_m2_cond_alpha.h index be2e684c2..9ab2dd038 100644 --- a/src/user_m2_cond_alpha.h +++ b/src/user_m2_cond_alpha.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM2CONDALPHA_H #define USERM2CONDALPHA_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,30 +40,32 @@ struct user_m2_rk5 class user_m2 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m2_rk5 > rk5_; float h_min_; float h_; user_m2_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, 
i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/user_m2_cond_alpha_kernel.h b/src/user_m2_cond_alpha_kernel.h index 67163e4bd..6d7ceb2b5 100644 --- a/src/user_m2_cond_alpha_kernel.h +++ b/src/user_m2_cond_alpha_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef USERM2CONDALPHAKERNEL_H #define USERM2CONDALPHAKERNEL_H #include - //#include -#include "spike_buffer.h" +// #include #include "node_group.h" +#include "spike_buffer.h" #include "user_m2.h" -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m2_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_g = 0, i_g1, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -69,31 +68,26 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_E_rev = 0, i_tau_syn, i_g0, N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. 
relative to time resolution N_GROUP_PARAM }; -const std::string user_m2_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; +const std::string user_m2_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m2_port_var_name[N_PORT_VAR] = { - "g", - "g1" -}; +const std::string user_m2_port_var_name[ N_PORT_VAR ] = { "g", "g1" }; -const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m2_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -106,164 +100,157 @@ const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string user_m2_port_param_name[N_PORT_PARAM] = { - "E_rev", - "tau_syn", - "g0" -}; +const std::string user_m2_port_param_name[ N_PORT_PARAM ] = { "E_rev", "tau_syn", "g0" }; -const std::string user_m2_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string user_m2_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define w y[i_w] -#define g(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define g1(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dgdt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define dg1dt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define E_rev(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_E_rev] -#define tau_syn(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_syn] -#define g0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_g0] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define g( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define g1( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dgdt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define dg1dt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define E_rev( i ) param[ N_SCAL_PARAM + 
N_PORT_PARAM * i + i_E_rev ] +#define tau_syn( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_syn ] +#define g0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_g0 ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn = 0.0; - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn += g( i ) * ( E_rev( i ) - V ); } - float V_spike = Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)::round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) ::round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace user_m2_ns template <> -int user_m2::UpdateNR<0>(long long it, double t1); +int user_m2::UpdateNR< 0 >( long long it, double t1 ); -template -int user_m2::UpdateNR(long long it, double t1) +template < int N_PORT > +int +user_m2::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = user_m2_ns::N_SCAL_VAR - + user_m2_ns::N_PORT_VAR*N_PORT; - const int NPARAM = user_m2_ns::N_SCAL_PARAM - + 
user_m2_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = user_m2_ns::N_SCAL_VAR + user_m2_ns::N_PORT_VAR * N_PORT; + const int NPARAM = user_m2_ns::N_SCAL_PARAM + user_m2_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - user_m2_ns::Derivatives(x, y, dydx, param, - data_struct); + user_m2_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - user_m2_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m2_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/user_m2_cond_alpha_rk5.h b/src/user_m2_cond_alpha_rk5.h index 818673aa3..f64058e80 100644 --- a/src/user_m2_cond_alpha_rk5.h +++ b/src/user_m2_cond_alpha_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM2CONDALPHARK5_H #define USERM2CONDALPHARK5_H struct user_m2_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void 
ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); #endif diff --git a/src/user_m2_cond_beta.cu b/src/user_m2_cond_beta.cu index 56ec547b6..994864934 100644 --- a/src/user_m2_cond_beta.cu +++ b/src/user_m2_cond_beta.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include -#include "user_m2_kernel.h" #include "rk5.h" #include "user_m2.h" +#include "user_m2_kernel.h" +#include +#include +#include namespace user_m2_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m2_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,72 +49,73 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; - for (int i = 0; i -int user_m2::UpdateNR<0>(long long it, double t1) +int +user_m2::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int user_m2::Update(long long it, double t1) { - UpdateNR(it, t1); +int +user_m2::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/user_m2_cond_beta.h b/src/user_m2_cond_beta.h index e7c819c57..e1be6bc33 100644 
--- a/src/user_m2_cond_beta.h +++ b/src/user_m2_cond_beta.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM2CONDBETA_H #define USERM2CONDBETA_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,30 +40,32 @@ struct user_m2_rk5 class user_m2 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m2_rk5 > rk5_; float h_min_; float h_; user_m2_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/user_m2_cond_beta_kernel.h b/src/user_m2_cond_beta_kernel.h index 914adb95c..9c345a1c8 100644 --- a/src/user_m2_cond_beta_kernel.h +++ b/src/user_m2_cond_beta_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef USERM2CONDBETAKERNEL_H #define USERM2CONDBETAKERNEL_H -#include -#include -#include "spike_buffer.h" #include "node_group.h" +#include "spike_buffer.h" #include "user_m2.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? 
( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m2_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_g = 0, i_g1, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -69,7 +68,8 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_E_rev = 0, i_tau_rise, i_tau_decay, @@ -77,25 +77,18 @@ enum PortParamIndexes { N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. relative to time resolution N_GROUP_PARAM }; +const std::string user_m2_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m2_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; - -const std::string user_m2_port_var_name[N_PORT_VAR] = { - "g", - "g1" -}; +const std::string user_m2_port_var_name[ N_PORT_VAR ] = { "g", "g1" }; -const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m2_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -108,165 +101,157 @@ const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string user_m2_port_param_name[N_PORT_PARAM] = { - "E_rev", - "tau_rise", - "tau_decay", - "g0" -}; +const std::string user_m2_port_param_name[ N_PORT_PARAM ] = { "E_rev", "tau_rise", "tau_decay", "g0" }; -const std::string user_m2_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string user_m2_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following 
equations much more readable. // For every rule there is some exceptions! // -#define V_m y[i_V_m] -#define w y[i_w] -#define g(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define g1(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dgdt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define dg1dt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define E_rev(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_E_rev] -#define tau_rise(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_rise] -#define tau_decay(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_decay] -#define g0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_g0] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define g( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define g1( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dgdt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define dg1dt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ 
i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define E_rev( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_E_rev ] +#define tau_rise( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_rise ] +#define tau_decay( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_decay ] +#define g0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_g0 ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn = 0.0; - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn += g( i ) * ( E_rev( i ) - V ); } - float V_spike = Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace user_m2_ns template <> -int user_m2::UpdateNR<0>(long long it, double t1); +int user_m2::UpdateNR< 0 >( long long it, double t1 ); -template -int user_m2::UpdateNR(long long it, double t1) +template < int N_PORT > +int +user_m2::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = user_m2_ns::N_SCAL_VAR - + user_m2_ns::N_PORT_VAR*N_PORT; - const int NPARAM = user_m2_ns::N_SCAL_PARAM - + 
user_m2_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = user_m2_ns::N_SCAL_VAR + user_m2_ns::N_PORT_VAR * N_PORT; + const int NPARAM = user_m2_ns::N_SCAL_PARAM + user_m2_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - user_m2_ns::Derivatives(x, y, dydx, param, - data_struct); + user_m2_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - user_m2_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m2_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/user_m2_cond_beta_rk5.h b/src/user_m2_cond_beta_rk5.h index b45fc9610..417f92701 100644 --- a/src/user_m2_cond_beta_rk5.h +++ b/src/user_m2_cond_beta_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM2CONDBETARK5_H #define USERM2CONDBETARK5_H struct user_m2_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void ExternalUpdate( 
double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); #endif diff --git a/src/user_m2_iaf_psc_exp.cu b/src/user_m2_iaf_psc_exp.cu index 133d96113..0e04b4b89 100644 --- a/src/user_m2_iaf_psc_exp.cu +++ b/src/user_m2_iaf_psc_exp.cu @@ -20,93 +20,91 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/user_m2.cpp -#include -#include -#include -#include "user_m2.h" #include "propagator_stability.h" #include "spike_buffer.h" +#include "user_m2.h" +#include +#include +#include using namespace user_m2_ns; extern __constant__ float NESTGPUTimeResolution; -extern __device__ double propagator_32(double, double, double, double); - -#define I_syn_ex var[i_I_syn_ex] -#define I_syn_in var[i_I_syn_in] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] - -#define tau_m param[i_tau_m] -#define C_m param[i_C_m] -#define E_L param[i_E_L] -#define I_e param[i_I_e] -#define Theta_rel param[i_Theta_rel] -#define V_reset_rel param[i_V_reset_rel] -#define tau_ex param[i_tau_ex] -#define tau_in param[i_tau_in] -//#define rho param[i_rho] -//#define delta param[i_delta] -#define t_ref param[i_t_ref] -#define den_delay param[i_den_delay] - -#define P20 param[i_P20] -#define P11ex param[i_P11ex] -#define P11in param[i_P11in] -#define P21ex param[i_P21ex] -#define P21in param[i_P21in] -#define P22 param[i_P22] - - - -__global__ void user_m2_Calibrate(int n_node, float *param_arr, - int n_param, float h) +extern __device__ double propagator_32( 
double, double, double, double ); + +#define I_syn_ex var[ i_I_syn_ex ] +#define I_syn_in var[ i_I_syn_in ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] + +#define tau_m param[ i_tau_m ] +#define C_m param[ i_C_m ] +#define E_L param[ i_E_L ] +#define I_e param[ i_I_e ] +#define Theta_rel param[ i_Theta_rel ] +#define V_reset_rel param[ i_V_reset_rel ] +#define tau_ex param[ i_tau_ex ] +#define tau_in param[ i_tau_in ] +// #define rho param[i_rho] +// #define delta param[i_delta] +#define t_ref param[ i_t_ref ] +#define den_delay param[ i_den_delay ] + +#define P20 param[ i_P20 ] +#define P11ex param[ i_P11ex ] +#define P11in param[ i_P11in ] +#define P21ex param[ i_P21ex ] +#define P21in param[ i_P21in ] +#define P22 param[ i_P22 ] + +__global__ void +user_m2_Calibrate( int n_node, float* param_arr, int n_param, float h ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn_ex * P21ex + I_syn_in * P21in + I_e * P20; } // exponential decaying PSCs I_syn_ex *= P11ex; I_syn_in *= P11in; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - } + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + } } } @@ -116,87 +114,87 @@ user_m2::~user_m2() FreeParamArr(); } -int user_m2::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +user_m2::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - 
BaseNeuron::Init(i_node_0, n_node, 2 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 2 /*n_port*/, i_group ); node_type_ = i_user_m2_model; n_scal_var_ = N_SCAL_VAR; n_var_ = n_scal_var_; n_scal_param_ = N_SCAL_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); scal_var_name_ = user_m2_scal_var_name; scal_param_name_ = user_m2_scal_param_name; - SetScalParam(0, n_node, "tau_m", 10.0 ); // in ms - SetScalParam(0, n_node, "C_m", 250.0 ); // in pF - SetScalParam(0, n_node, "E_L", -70.0 ); // in mV - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA - SetScalParam(0, n_node, "Theta_rel", -55.0 - (-70.0) ); // relative to E_L_ - SetScalParam(0, n_node, "V_reset_rel", -70.0 - (-70.0) ); // relative to E_L_ - SetScalParam(0, n_node, "tau_ex", 2.0 ); // in ms - SetScalParam(0, n_node, "tau_in", 2.0 ); // in ms + SetScalParam( 0, n_node, "tau_m", 10.0 ); // in ms + SetScalParam( 0, n_node, "C_m", 250.0 ); // in pF + SetScalParam( 0, n_node, "E_L", -70.0 ); // in mV + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "Theta_rel", -55.0 - ( -70.0 ) ); // relative to E_L_ + SetScalParam( 0, n_node, "V_reset_rel", -70.0 - ( -70.0 ) ); // relative to E_L_ + SetScalParam( 0, n_node, "tau_ex", 2.0 ); // in ms + SetScalParam( 0, n_node, "tau_in", 2.0 ); // in ms // SetScalParam(0, n_node, "rho", 0.01 ); // in 1/s // SetScalParam(0, n_node, "delta", 0.0 ); // in mV - SetScalParam(0, n_node, "t_ref", 2.0 ); // in ms - SetScalParam(0, n_node, "den_delay", 0.0); // in ms - SetScalParam(0, n_node, "P20", 0.0); - SetScalParam(0, n_node, "P11ex", 0.0); - SetScalParam(0, n_node, "P11in", 0.0); - SetScalParam(0, n_node, "P21ex", 0.0); - SetScalParam(0, n_node, "P21in", 0.0); - SetScalParam(0, n_node, "P22", 0.0); - - SetScalVar(0, n_node, "I_syn_ex", 0.0 ); - SetScalVar(0, n_node, "I_syn_in", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", -70.0 - (-70.0) ); // in mV, relative to E_L - SetScalVar(0, n_node, "refractory_step", 0 ); + 
SetScalParam( 0, n_node, "t_ref", 2.0 ); // in ms + SetScalParam( 0, n_node, "den_delay", 0.0 ); // in ms + SetScalParam( 0, n_node, "P20", 0.0 ); + SetScalParam( 0, n_node, "P11ex", 0.0 ); + SetScalParam( 0, n_node, "P11in", 0.0 ); + SetScalParam( 0, n_node, "P21ex", 0.0 ); + SetScalParam( 0, n_node, "P21in", 0.0 ); + SetScalParam( 0, n_node, "P22", 0.0 ); + + SetScalVar( 0, n_node, "I_syn_ex", 0.0 ); + SetScalVar( 0, n_node, "I_syn_in", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", -70.0 - ( -70.0 ) ); // in mV, relative to E_L + SetScalVar( 0, n_node, "refractory_step", 0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn_ex, I_syn_in - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn_ex"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn_ex" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 1; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); - + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); + return 0; } -int user_m2::Update(long long it, double t1) +int +user_m2::Update( long long it, double t1 ) { // std::cout << "user_m2 neuron update\n"; - user_m2_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_); + user_m2_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_ ); // gpuErrchk( cudaDeviceSynchronize() ); - + return 0; } -int user_m2::Free() +int +user_m2::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); - + 
return 0; } -int user_m2::Calibrate(double, float time_resolution) +int +user_m2::Calibrate( double, float time_resolution ) { - user_m2_Calibrate<<<(n_node_+1023)/1024, 1024>>> - (n_node_, param_arr_, n_param_, time_resolution); + user_m2_Calibrate<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( n_node_, param_arr_, n_param_, time_resolution ); return 0; } diff --git a/src/user_m2_iaf_psc_exp.h b/src/user_m2_iaf_psc_exp.h index 726487412..c5f05b44a 100644 --- a/src/user_m2_iaf_psc_exp.h +++ b/src/user_m2_iaf_psc_exp.h @@ -20,48 +20,44 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/user_m2.h - #ifndef USERM2IAFPSCEXP_H #define USERM2IAFPSCEXP_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include namespace user_m2_ns { -enum ScalVarIndexes { - i_I_syn_ex = 0, // postsynaptic current for exc. inputs - i_I_syn_in, // postsynaptic current for inh. inputs - i_V_m_rel, // membrane potential - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn_ex = 0, // postsynaptic current for exc. inputs + i_I_syn_in, // postsynaptic current for inh. inputs + i_V_m_rel, // membrane potential + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_tau_m = 0, // Membrane time constant in ms - i_C_m, // Membrane capacitance in pF - i_E_L, // Resting potential in mV - i_I_e, // External current in pA - i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTAIL(!) - // i.e. 
the real threshold is (E_L_+Theta_rel_) - i_V_reset_rel, // relative reset value of the membrane potential - i_tau_ex, // Time constant of excitatory synaptic current in ms - i_tau_in, // Time constant of inhibitory synaptic current in ms +enum ScalParamIndexes +{ + i_tau_m = 0, // Membrane time constant in ms + i_C_m, // Membrane capacitance in pF + i_E_L, // Resting potential in mV + i_I_e, // External current in pA + i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTAIL(!) + // i.e. the real threshold is (E_L_+Theta_rel_) + i_V_reset_rel, // relative reset value of the membrane potential + i_tau_ex, // Time constant of excitatory synaptic current in ms + i_tau_in, // Time constant of inhibitory synaptic current in ms // i_rho, // Stochastic firing intensity at threshold in 1/s // i_delta, // Width of threshold region in mV - i_t_ref, // Refractory period in ms + i_t_ref, // Refractory period in ms i_den_delay, // dendritic backpropagation delay // time evolution operator i_P20, @@ -73,17 +69,9 @@ enum ScalParamIndexes { N_SCAL_PARAM }; - -const std::string user_m2_scal_var_name[N_SCAL_VAR] = { - "I_syn_ex", - "I_syn_in", - "V_m_rel", - "refractory_step" -}; - +const std::string user_m2_scal_var_name[ N_SCAL_VAR ] = { "I_syn_ex", "I_syn_in", "V_m_rel", "refractory_step" }; -const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { - "tau_m", +const std::string user_m2_scal_param_name[ N_SCAL_PARAM ] = { "tau_m", "C_m", "E_L", "I_e", @@ -100,26 +88,22 @@ const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { "P11in", "P21ex", "P21in", - "P22" -}; + "P22" }; + +} // namespace user_m2_ns -} // namespace - class user_m2 : public BaseNeuron { - public: +public: ~user_m2(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double, float time_resolution); - - int Update(long long it, double t1); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); - int Free(); + int Calibrate( double, float time_resolution ); 
-}; + int Update( long long it, double t1 ); + int Free(); +}; #endif diff --git a/src/user_m2_iaf_psc_exp_g.cu b/src/user_m2_iaf_psc_exp_g.cu index 72034e2b6..74c691612 100644 --- a/src/user_m2_iaf_psc_exp_g.cu +++ b/src/user_m2_iaf_psc_exp_g.cu @@ -20,74 +20,82 @@ * */ - - - - -#include +#include "spike_buffer.h" +#include "user_m2.h" #include +#include #include -#include "user_m2.h" -#include "spike_buffer.h" using namespace user_m2_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn var[i_I_syn] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] - -#define tau_m_ group_param_[i_tau_m] -#define C_m_ group_param_[i_C_m] -#define E_L_ group_param_[i_E_L] -#define Theta_rel_ group_param_[i_Theta_rel] -#define V_reset_rel_ group_param_[i_V_reset_rel] -#define tau_syn_ group_param_[i_tau_syn] -#define t_ref_ group_param_[i_t_ref] - -__global__ void user_m2_Update -( int n_node, int i_node_0, float *var_arr, float *param_arr, int n_var, - int n_param, float Theta_rel, float V_reset_rel, int n_refractory_steps, - float P11, float P22, float P21, float P20 ) +#define I_syn var[ i_I_syn ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] + +#define tau_m_ group_param_[ i_tau_m ] +#define C_m_ group_param_[ i_C_m ] +#define E_L_ group_param_[ i_E_L ] +#define Theta_rel_ group_param_[ i_Theta_rel ] +#define V_reset_rel_ group_param_[ i_V_reset_rel ] +#define tau_syn_ group_param_[ i_tau_syn ] +#define t_ref_ group_param_[ i_t_ref ] + +__global__ void +user_m2_Update( int n_node, + int i_node_0, + float* var_arr, + float* param_arr, + int n_var, + int n_param, + float Theta_rel, + float V_reset_rel, + int n_refractory_steps, + float P11, + float P22, + float P21, + float P20 ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = 
param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn * P21 + I_e * P20; } // exponential decaying PSC I_syn *= P11; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; refractory_step = n_refractory_steps; - } + } } } -double h_propagator_32( double tau_syn, double tau, double C, double h ) +double +h_propagator_32( double tau_syn, double tau, double C, double h ) { - const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h - * ( tau_syn - tau ) * exp( -h / tau ); + const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h * ( tau_syn - tau ) * exp( -h / tau ); const double P32_singular = h / C * exp( -h / tau ); const double P32 = - -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn ) - * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); + -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn ) * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); const double dev_P32 = fabs( P32 - P32_singular ); - if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && dev_P32 > 2.0 - * fabs( P32_linear ) ) ) + if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && dev_P32 > 2.0 * fabs( P32_linear ) ) ) { return P32_singular; } @@ -103,10 +111,10 @@ user_m2::~user_m2() FreeParamArr(); } -int user_m2::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +user_m2::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_user_m2_model; n_scal_var_ = N_SCAL_VAR; @@ -114,46 +122,46 @@ int user_m2::Init(int i_node_0, int n_node, int /*n_port*/, 
n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); - group_param_ = new float[N_GROUP_PARAM]; + group_param_ = new float[ N_GROUP_PARAM ]; scal_var_name_ = user_m2_scal_var_name; scal_param_name_ = user_m2_scal_param_name; group_param_name_ = user_m2_group_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", 0.0 ); // in mV - SetScalVar(0, n_node, "refractory_step", 0 ); + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", 0.0 ); // in mV + SetScalVar( 0, n_node, "refractory_step", 0 ); - SetGroupParam("tau_m", 10.0); - SetGroupParam("C_m", 250.0); - SetGroupParam("E_L", -65.0); - SetGroupParam("Theta_rel", 15.0); - SetGroupParam("V_reset_rel", 0.0); - SetGroupParam("tau_syn", 0.5); - SetGroupParam("t_ref", 2.0); + SetGroupParam( "tau_m", 10.0 ); + SetGroupParam( "C_m", 250.0 ); + SetGroupParam( "E_L", -65.0 ); + SetGroupParam( "Theta_rel", 15.0 ); + SetGroupParam( "V_reset_rel", 0.0 ); + SetGroupParam( "tau_syn", 0.5 ); + SetGroupParam( "t_ref", 2.0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int user_m2::Update(long long it, double t1) +int +user_m2::Update( 
long long it, double t1 ) { // std::cout << "user_m2 neuron update\n"; float h = time_resolution_; @@ -161,21 +169,32 @@ int user_m2::Update(long long it, double t1) float P22 = exp( -h / tau_m_ ); float P21 = h_propagator_32( tau_syn_, tau_m_, C_m_, h ); float P20 = tau_m_ / C_m_ * ( 1.0 - P22 ); - int n_refractory_steps = int(round(t_ref_ / h)); + int n_refractory_steps = int( round( t_ref_ / h ) ); + + user_m2_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( n_node_, + i_node_0_, + var_arr_, + param_arr_, + n_var_, + n_param_, + Theta_rel_, + V_reset_rel_, + n_refractory_steps, + P11, + P22, + P21, + P20 ); + // gpuErrchk( cudaDeviceSynchronize() ); - user_m2_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, - Theta_rel_, V_reset_rel_, n_refractory_steps, P11, P22, P21, P20 ); - //gpuErrchk( cudaDeviceSynchronize() ); - return 0; } -int user_m2::Free() +int +user_m2::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); delete[] group_param_; - + return 0; } diff --git a/src/user_m2_iaf_psc_exp_g.h b/src/user_m2_iaf_psc_exp_g.h index 602a294a9..fb6800401 100644 --- a/src/user_m2_iaf_psc_exp_g.h +++ b/src/user_m2_iaf_psc_exp_g.h @@ -20,97 +20,76 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/iaf_psc_exp.h - #ifndef USERM2IAFPSCEXPG_H #define USERM2IAFPSCEXPG_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include namespace user_m2_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m_rel, // membrane potential relative to E_L - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. 
inputs + i_V_m_rel, // membrane potential relative to E_L + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA N_SCAL_PARAM }; -enum GroupParamIndexes { - i_tau_m = 0, // Membrane time constant in ms - i_C_m, // Membrane capacitance in pF - i_E_L, // Resting potential in mV - i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTIAL(!) - // i.e. the real threshold is (E_L_+Theta_rel_) - i_V_reset_rel, // relative reset value of the membrane potential - i_tau_syn, // Time constant of synaptic current in ms - i_t_ref, // Refractory period in ms +enum GroupParamIndexes +{ + i_tau_m = 0, // Membrane time constant in ms + i_C_m, // Membrane capacitance in pF + i_E_L, // Resting potential in mV + i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTIAL(!) + // i.e. the real threshold is (E_L_+Theta_rel_) + i_V_reset_rel, // relative reset value of the membrane potential + i_tau_syn, // Time constant of synaptic current in ms + i_t_ref, // Refractory period in ms N_GROUP_PARAM }; +const std::string user_m2_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m_rel", "refractory_step" }; - -const std::string user_m2_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m_rel", - "refractory_step" -}; - -const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { - "I_e" -}; - -const std::string user_m2_group_param_name[N_GROUP_PARAM] = { - "tau_m", - "C_m", - "E_L", - "Theta_rel", - "V_reset_rel", - "tau_syn", - "t_ref" -}; - -} // namespace - +const std::string user_m2_scal_param_name[ N_SCAL_PARAM ] = { "I_e" }; +const std::string + user_m2_group_param_name[ N_GROUP_PARAM ] = { "tau_m", "C_m", "E_L", "Theta_rel", "V_reset_rel", "tau_syn", "t_ref" }; +} // namespace user_m2_ns class user_m2 : public BaseNeuron { float time_resolution_; - public: +public: ~user_m2(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int 
Calibrate(double /*time_min*/, float time_res) { + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int + Calibrate( double /*time_min*/, float time_res ) + { time_resolution_ = time_res; return 0; } - - int Update(long long it, double t1); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/user_m2_iaf_psc_exp_hc.cu b/src/user_m2_iaf_psc_exp_hc.cu index 59bed4f17..3bec194a9 100644 --- a/src/user_m2_iaf_psc_exp_hc.cu +++ b/src/user_m2_iaf_psc_exp_hc.cu @@ -20,51 +20,50 @@ * */ - - - - -#include +#include "spike_buffer.h" +#include "user_m2_hc.h" #include +#include #include -#include "user_m2_hc.h" -#include "spike_buffer.h" using namespace user_m2_hc_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn var[i_I_syn] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] +#define I_syn var[ i_I_syn ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] #include "user_m2_hc_params.h" -__global__ void user_m2_hc_Update(int n_node, int i_node_0, - float *var_arr, float *param_arr, - int n_var, int n_param) +__global__ void +user_m2_hc_Update( int n_node, int i_node_0, float* var_arr, float* param_arr, int n_var, int n_param ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn * P21 + I_e * P20; } // exponential decaying PSC I_syn *= P11; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( 
i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; refractory_step = n_refractory_steps; - } + } } } @@ -74,59 +73,60 @@ user_m2_hc::~user_m2_hc() FreeParamArr(); } -int user_m2_hc::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +user_m2_hc::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_user_m2_hc_model; n_scal_var_ = N_SCAL_VAR; n_var_ = n_scal_var_; n_scal_param_ = N_SCAL_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); scal_var_name_ = user_m2_hc_scal_var_name; scal_param_name_ = user_m2_hc_scal_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", 0.0 ); // in mV - SetScalVar(0, n_node, "refractory_step", 0 ); + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", 0.0 ); // in mV + SetScalVar( 0, n_node, "refractory_step", 0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int user_m2_hc::Update(long long it, double t1) +int +user_m2_hc::Update( long long it, double t1 ) { // std::cout << "user_m2_hc neuron update\n"; - 
user_m2_hc_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_); - //gpuErrchk( cudaDeviceSynchronize() ); - + user_m2_hc_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_ ); + // gpuErrchk( cudaDeviceSynchronize() ); + return 0; } -int user_m2_hc::Free() +int +user_m2_hc::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); - + return 0; } diff --git a/src/user_m2_iaf_psc_exp_hc.h b/src/user_m2_iaf_psc_exp_hc.h index ad5f01115..1d13b7a89 100644 --- a/src/user_m2_iaf_psc_exp_hc.h +++ b/src/user_m2_iaf_psc_exp_hc.h @@ -20,65 +20,51 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/user_m2.h - #ifndef USERM2IAFPSCEXPHC_H #define USERM2IAFPSCEXPHC_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include namespace user_m2_hc_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m_rel, // membrane potential relative to E_L - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. 
inputs + i_V_m_rel, // membrane potential relative to E_L + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA N_SCAL_PARAM }; - const std::string user_m2_hc_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m_rel", - "refractory_step" -}; +const std::string user_m2_hc_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m_rel", "refractory_step" }; -const std::string user_m2_hc_scal_param_name[N_SCAL_PARAM] = { - "I_e" -}; +const std::string user_m2_hc_scal_param_name[ N_SCAL_PARAM ] = { "I_e" }; -} // namespace - +} // namespace user_m2_hc_ns class user_m2_hc : public BaseNeuron { - public: +public: ~user_m2_hc(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Update(long long it, double t1); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/user_m2_kernel.h b/src/user_m2_kernel.h index 4648b3a84..2ca81ec51 100644 --- a/src/user_m2_kernel.h +++ b/src/user_m2_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef USERM2KERNEL_H #define USERM2KERNEL_H -#include -#include -#include "spike_buffer.h" #include "node_group.h" +#include "spike_buffer.h" #include "user_m2.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? 
( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m2_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_g = 0, i_g1, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -69,7 +68,8 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_E_rev = 0, i_tau_rise, i_tau_decay, @@ -77,25 +77,18 @@ enum PortParamIndexes { N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. relative to time resolution N_GROUP_PARAM }; +const std::string user_m2_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m2_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; - -const std::string user_m2_port_var_name[N_PORT_VAR] = { - "g", - "g1" -}; +const std::string user_m2_port_var_name[ N_PORT_VAR ] = { "g", "g1" }; -const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m2_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -108,165 +101,157 @@ const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string user_m2_port_param_name[N_PORT_PARAM] = { - "E_rev", - "tau_rise", - "tau_decay", - "g0" -}; +const std::string user_m2_port_param_name[ N_PORT_PARAM ] = { "E_rev", "tau_rise", "tau_decay", "g0" }; -const std::string user_m2_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string user_m2_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following 
equations much more readable. // For every rule there is some exceptions! // -#define V_m y[i_V_m] -#define w y[i_w] -#define g(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define g1(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dgdt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g] -#define dg1dt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_g1] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define E_rev(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_E_rev] -#define tau_rise(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_rise] -#define tau_decay(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_decay] -#define g0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_g0] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define g( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define g1( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dgdt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g ] +#define dg1dt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_g1 ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ 
i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define E_rev( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_E_rev ] +#define tau_rise( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_rise ] +#define tau_decay( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_decay ] +#define g0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_g0 ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn = 0.0; - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn += g( i ) * ( E_rev( i ) - V ); } - float V_spike = Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace user_m2_ns template <> -int user_m2::UpdateNR<0>(long long it, double t1); +int user_m2::UpdateNR< 0 >( long long it, double t1 ); -template -int user_m2::UpdateNR(long long it, double t1) +template < int N_PORT > +int +user_m2::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = user_m2_ns::N_SCAL_VAR - + user_m2_ns::N_PORT_VAR*N_PORT; - const int NPARAM = user_m2_ns::N_SCAL_PARAM - + 
user_m2_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = user_m2_ns::N_SCAL_VAR + user_m2_ns::N_PORT_VAR * N_PORT; + const int NPARAM = user_m2_ns::N_SCAL_PARAM + user_m2_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - user_m2_ns::Derivatives(x, y, dydx, param, - data_struct); + user_m2_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - user_m2_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m2_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/user_m2_psc_alpha.cu b/src/user_m2_psc_alpha.cu index d508233fe..2df842e39 100644 --- a/src/user_m2_psc_alpha.cu +++ b/src/user_m2_psc_alpha.cu @@ -20,26 +20,21 @@ * */ - - - - -#include -#include -#include -#include "user_m2_kernel.h" #include "rk5.h" #include "user_m2.h" +#include "user_m2_kernel.h" +#include +#include +#include namespace user_m2_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m2_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = 
threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,56 +49,57 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0.0; refractory_step = 0; - for (int i = 0; i -int user_m2::UpdateNR<0>(long long it, double t1) +int +user_m2::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int user_m2::Update(long long it, double t1) { - UpdateNR(it, t1); +int +user_m2::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/user_m2_psc_alpha.h b/src/user_m2_psc_alpha.h index 0fb28d7ed..7dc148270 100644 --- a/src/user_m2_psc_alpha.h +++ b/src/user_m2_psc_alpha.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM2PSCALPHA_H #define USERM2PSCALPHA_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,30 +40,32 @@ struct user_m2_rk5 class user_m2 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m2_rk5 > rk5_; float h_min_; float h_; user_m2_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int 
n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/user_m2_psc_alpha_kernel.h b/src/user_m2_psc_alpha_kernel.h index a0a1cc129..d286d44e9 100644 --- a/src/user_m2_psc_alpha_kernel.h +++ b/src/user_m2_psc_alpha_kernel.h @@ -20,38 +20,37 @@ * */ - - - - #ifndef USERM2PSCALPHAKERNEL_H #define USERM2PSCALPHAKERNEL_H -#include -#include -#include "spike_buffer.h" #include "node_group.h" +#include "spike_buffer.h" #include "user_m2.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m2_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_I_syn = 0, i_I1_syn, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -69,31 +68,25 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_tau_syn = 0, i_I0, N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. 
relative to time resolution N_GROUP_PARAM }; +const std::string user_m2_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m2_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; - -const std::string user_m2_port_var_name[N_PORT_VAR] = { - "I_syn", - "I1_syn" -}; +const std::string user_m2_port_var_name[ N_PORT_VAR ] = { "I_syn", "I1_syn" }; -const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m2_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -106,162 +99,155 @@ const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string user_m2_port_param_name[N_PORT_PARAM] = { - "tau_syn", - "I0" -}; +const std::string user_m2_port_param_name[ N_PORT_PARAM ] = { "tau_syn", "I0" }; -const std::string user_m2_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string user_m2_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define w y[i_w] -#define I_syn(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] -#define I1_syn(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_I1_syn] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dI_syndt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] -#define dI1_syndt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_I1_syn] -#define I0(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_I0] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define tau_syn(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_syn] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define I_syn( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] +#define I1_syn( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_I1_syn ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dI_syndt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] +#define dI1_syndt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_I1_syn ] +#define I0( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_I0 ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ 
i_den_delay ] + +#define tau_syn( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_syn ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - enum { n_port = (NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn_tot = 0.0; - - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn_tot += I_syn( i ); } - float V_spike = Delta_T == 0. ? 0. : Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T == 0. ? 0. : Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn_tot - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn_tot - w + I_e ) / C_m; // Adaptation current w. 
- dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace user_m2_ns template <> -int user_m2::UpdateNR<0>(long long it, double t1); +int user_m2::UpdateNR< 0 >( long long it, double t1 ); -template -int user_m2::UpdateNR(long long it, double t1) +template < int N_PORT > +int +user_m2::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = user_m2_ns::N_SCAL_VAR - + user_m2_ns::N_PORT_VAR*N_PORT; - const int NPARAM = user_m2_ns::N_SCAL_PARAM - + 
user_m2_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = user_m2_ns::N_SCAL_VAR + user_m2_ns::N_PORT_VAR * N_PORT; + const int NPARAM = user_m2_ns::N_SCAL_PARAM + user_m2_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - user_m2_ns::Derivatives(x, y, dydx, param, - data_struct); + user_m2_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - user_m2_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m2_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/user_m2_psc_alpha_rk5.h b/src/user_m2_psc_alpha_rk5.h index 81e485e43..b63b13c9f 100644 --- a/src/user_m2_psc_alpha_rk5.h +++ b/src/user_m2_psc_alpha_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM2PSCALPHARK5_H #define USERM2PSCALPHARK5_H struct user_m2_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void ExternalUpdate( 
double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); #endif diff --git a/src/user_m2_psc_delta.cu b/src/user_m2_psc_delta.cu index de5daedde..c7752ab47 100644 --- a/src/user_m2_psc_delta.cu +++ b/src/user_m2_psc_delta.cu @@ -20,25 +20,20 @@ * */ - - - - -#include -#include -#include -#include "user_m2_kernel.h" #include "rk5.h" #include "user_m2.h" +#include "user_m2_kernel.h" +#include +#include +#include namespace user_m2_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m2_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; V_th = -50.4; Delta_T = 2.0; @@ -53,100 +48,100 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - //int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + // int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; refractory_step = 0; // set the right threshold depending on 
Delta_T - if (Delta_T <= 0.0) { + if ( Delta_T <= 0.0 ) + { V_peak = V_th; // same as IAF dynamics for spikes if Delta_T == 0. } } -} +} // namespace user_m2_ns -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ) { - user_m2_ns::NodeInit(n_var, n_param, x, y, param, data_struct); + user_m2_ns::NodeInit( n_var, n_param, x, y, param, data_struct ); } -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct) +__device__ void +NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ) { - user_m2_ns::NodeCalibrate(n_var, n_param, x, y, param, data_struct); + user_m2_ns::NodeCalibrate( n_var, n_param, x, y, param, data_struct ); } using namespace user_m2_ns; -int user_m2::Init(int i_node_0, int n_node, int n_port, - int i_group) { - BaseNeuron::Init(i_node_0, n_node, n_port, i_group); +int +user_m2::Init( int i_node_0, int n_node, int n_port, int i_group ) +{ + BaseNeuron::Init( i_node_0, n_node, n_port, i_group ); node_type_ = i_user_m2_model; n_scal_var_ = N_SCAL_VAR; n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; - n_var_ = n_scal_var_ + n_port_var_*n_port; - n_param_ = n_scal_param_ + n_port_param_*n_port; + n_var_ = n_scal_var_ + n_port_var_ * n_port; + n_param_ = n_scal_param_ + n_port_param_ * n_port; + + group_param_ = new float[ N_GROUP_PARAM ]; - group_param_ = new float[N_GROUP_PARAM]; - scal_var_name_ = user_m2_scal_var_name; scal_param_name_ = user_m2_scal_param_name; group_param_name_ = user_m2_group_param_name; - //rk5_data_struct_.node_type_ = i_user_m2_model; + // rk5_data_struct_.node_type_ = i_user_m2_model; rk5_data_struct_.i_node_0_ = i_node_0_; - SetGroupParam("h_min_rel", 1.0e-3); - SetGroupParam("h0_rel", 1.0e-2); - h_ = h0_rel_* 0.1; - - rk5_.Init(n_node, n_var_, 
n_param_, 0.0, h_, rk5_data_struct_); + SetGroupParam( "h_min_rel", 1.0e-3 ); + SetGroupParam( "h0_rel", 1.0e-2 ); + h_ = h0_rel_ * 0.1; + + rk5_.Init( n_node, n_var_, n_param_, 0.0, h_, rk5_data_struct_ ); var_arr_ = rk5_.GetYArr(); param_arr_ = rk5_.GetParamArr(); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - port_input_arr_ = GetVarArr() + GetScalVarIdx("V_m"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "V_m" ); port_input_arr_step_ = n_var_; port_input_port_step_ = n_port_var_; - den_delay_arr_ = GetParamArr() + GetScalParamIdx("den_delay"); + den_delay_arr_ = GetParamArr() + GetScalParamIdx( "den_delay" ); return 0; } -int user_m2::Calibrate(double time_min, float time_resolution) +int +user_m2::Calibrate( double time_min, float time_resolution ) { - h_min_ = h_min_rel_* time_resolution; - h_ = h0_rel_* time_resolution; - rk5_.Calibrate(time_min, h_, rk5_data_struct_); - + h_min_ = h_min_rel_ * time_resolution; + h_ = h0_rel_ * time_resolution; + rk5_.Calibrate( time_min, h_, rk5_data_struct_ ); + return 0; } -int user_m2::Update(long long it, double t1) +int +user_m2::Update( long long it, double t1 ) { - rk5_.Update(t1, h_min_, rk5_data_struct_); - + rk5_.Update< N_SCAL_VAR, N_SCAL_PARAM >( t1, h_min_, rk5_data_struct_ ); + return 0; } diff --git a/src/user_m2_psc_delta.h b/src/user_m2_psc_delta.h index 54ff75de7..ede553581 100644 --- a/src/user_m2_psc_delta.h +++ b/src/user_m2_psc_delta.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM2PSCDELTA_H #define USERM2PSCDELTA_H -#include -#include -#include 
"cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,27 +40,29 @@ struct user_m2_rk5 class user_m2 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m2_rk5 > rk5_; float h_min_; float h_; user_m2_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - }; #endif diff --git a/src/user_m2_psc_delta_kernel.h b/src/user_m2_psc_delta_kernel.h index 4b505c0fc..21f4879ce 100644 --- a/src/user_m2_psc_delta_kernel.h +++ b/src/user_m2_psc_delta_kernel.h @@ -20,36 +20,35 @@ * */ - - - - #ifndef USERM2PSCDELTAKERNEL_H #define USERM2PSCDELTAKERNEL_H -#include -#include -#include "spike_buffer.h" #include "node_group.h" +#include "spike_buffer.h" #include "user_m2.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? 
( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m2_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ N_PORT_VAR = 0 }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -67,20 +66,16 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. relative to time resolution N_GROUP_PARAM }; +const std::string user_m2_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m2_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; - -const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m2_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -93,121 +88,110 @@ const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; - -const std::string user_m2_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; + "den_delay" }; +const std::string user_m2_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define w y[i_w] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - float V_spike = Delta_T == 0. ? 0. : Delta_T*exp((V - V_th)/Delta_T); + float V = ( refractory_step > 0 ) ? V_reset : MIN( V_m, V_peak ); + + float V_spike = Delta_T == 0. ? 0. : Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 
0 : ( -g_L * ( V - E_L - V_spike ) - w + I_e ) / C_m; // Adaptation current w. - dwdt = (a*(V - E_L) - w) / tau_w; + dwdt = ( a * ( V - E_L ) - w ) / tau_w; } - template //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = (int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } +}; // namespace user_m2_ns -}; - - -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - user_m2_ns::Derivatives(x, y, dydx, param, - data_struct); + 
user_m2_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - user_m2_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m2_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/user_m2_psc_delta_rk5.h b/src/user_m2_psc_delta_rk5.h index a0dafdaf4..15ec45656 100644 --- a/src/user_m2_psc_delta_rk5.h +++ b/src/user_m2_psc_delta_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM2PSCDELTARK5_H #define USERM2PSCDELTARK5_H struct user_m2_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); #endif diff --git a/src/user_m2_psc_exp.cu b/src/user_m2_psc_exp.cu index 4411a6bee..0842a37ec 100644 --- a/src/user_m2_psc_exp.cu +++ b/src/user_m2_psc_exp.cu @@ -20,26 +20,21 @@ * */ - - - - 
-#include -#include -#include -#include "user_m2_kernel.h" #include "rk5.h" #include "user_m2.h" +#include "user_m2_kernel.h" +#include +#include +#include namespace user_m2_ns { -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, float *param, - user_m2_rk5 data_struct) +__device__ void +NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ) { - //int array_idx = threadIdx.x + blockIdx.x * blockDim.x; - int n_port = (n_var-N_SCAL_VAR)/N_PORT_VAR; + // int array_idx = threadIdx.x + blockIdx.x * blockDim.x; + int n_port = ( n_var - N_SCAL_VAR ) / N_PORT_VAR; V_th = -50.4; Delta_T = 2.0; @@ -54,52 +49,52 @@ void NodeInit(int n_var, int n_param, double x, float *y, float *param, V_reset = -60.0; t_ref = 0.0; den_delay = 0.0; - + V_m = E_L; w = 0; refractory_step = 0; - for (int i = 0; i -int user_m2::UpdateNR<0>(long long it, double t1) +int +user_m2::UpdateNR< 0 >( long long it, double t1 ) { return 0; } -int user_m2::Update(long long it, double t1) { - UpdateNR(it, t1); +int +user_m2::Update( long long it, double t1 ) +{ + UpdateNR< MAX_PORT_NUM >( it, t1 ); return 0; } diff --git a/src/user_m2_psc_exp.h b/src/user_m2_psc_exp.h index b6ccc53eb..77878b1d8 100644 --- a/src/user_m2_psc_exp.h +++ b/src/user_m2_psc_exp.h @@ -20,20 +20,16 @@ * */ - - - - #ifndef USERM2PSCEXP_H #define USERM2PSCEXP_H -#include -#include -#include "cuda_error.h" -#include "rk5.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" +#include "node_group.h" +#include "rk5.h" +#include +#include #define MAX_PORT_NUM 20 @@ -44,30 +40,32 @@ struct user_m2_rk5 class user_m2 : public BaseNeuron { - public: - RungeKutta5 rk5_; +public: + RungeKutta5< user_m2_rk5 > rk5_; float h_min_; float h_; user_m2_rk5 rk5_data_struct_; - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double time_min, float time_resolution); - - int Update(long long it, double t1); - - 
int GetX(int i_neuron, int n_node, double *x) { - return rk5_.GetX(i_neuron, n_node, x); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int Calibrate( double time_min, float time_resolution ); + + int Update( long long it, double t1 ); + + int + GetX( int i_neuron, int n_node, double* x ) + { + return rk5_.GetX( i_neuron, n_node, x ); } - - int GetY(int i_var, int i_neuron, int n_node, float *y) { - return rk5_.GetY(i_var, i_neuron, n_node, y); + + int + GetY( int i_var, int i_neuron, int n_node, float* y ) + { + return rk5_.GetY( i_var, i_neuron, n_node, y ); } - - template - int UpdateNR(long long it, double t1); + template < int N_PORT > + int UpdateNR( long long it, double t1 ); }; #endif diff --git a/src/user_m2_psc_exp_g.cu b/src/user_m2_psc_exp_g.cu index 6240d73b9..5237b1879 100644 --- a/src/user_m2_psc_exp_g.cu +++ b/src/user_m2_psc_exp_g.cu @@ -20,74 +20,82 @@ * */ - - - - -#include +#include "spike_buffer.h" +#include "user_m2.h" #include +#include #include -#include "user_m2.h" -#include "spike_buffer.h" using namespace user_m2_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn var[i_I_syn] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] - -#define tau_m_ group_param_[i_tau_m] -#define C_m_ group_param_[i_C_m] -#define E_L_ group_param_[i_E_L] -#define Theta_rel_ group_param_[i_Theta_rel] -#define V_reset_rel_ group_param_[i_V_reset_rel] -#define tau_syn_ group_param_[i_tau_syn] -#define t_ref_ group_param_[i_t_ref] - -__global__ void user_m2_Update -( int n_node, int i_node_0, float *var_arr, float *param_arr, int n_var, - int n_param, float Theta_rel, float V_reset_rel, int n_refractory_steps, - float P11, float P22, float P21, float P20 ) +#define I_syn var[ i_I_syn ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] + +#define tau_m_ group_param_[ i_tau_m ] +#define C_m_ group_param_[ i_C_m 
] +#define E_L_ group_param_[ i_E_L ] +#define Theta_rel_ group_param_[ i_Theta_rel ] +#define V_reset_rel_ group_param_[ i_V_reset_rel ] +#define tau_syn_ group_param_[ i_tau_syn ] +#define t_ref_ group_param_[ i_t_ref ] + +__global__ void +user_m2_Update( int n_node, + int i_node_0, + float* var_arr, + float* param_arr, + int n_var, + int n_param, + float Theta_rel, + float V_reset_rel, + int n_refractory_steps, + float P11, + float P22, + float P21, + float P20 ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn * P21 + I_e * P20; } // exponential decaying PSC I_syn *= P11; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; refractory_step = n_refractory_steps; - } + } } } -double h_propagator_32( double tau_syn, double tau, double C, double h ) +double +h_propagator_32( double tau_syn, double tau, double C, double h ) { - const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h - * ( tau_syn - tau ) * exp( -h / tau ); + const double P32_linear = 1.0 / ( 2.0 * C * tau * tau ) * h * h * ( tau_syn - tau ) * exp( -h / tau ); const double P32_singular = h / C * exp( -h / tau ); const double P32 = - -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn ) - * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); + -tau / ( C * ( 1.0 - tau / tau_syn ) ) * exp( -h / tau_syn ) * expm1( h * ( 1.0 / tau_syn - 1.0 / tau ) ); const double dev_P32 = fabs( P32 - P32_singular ); - if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && 
dev_P32 > 2.0 - * fabs( P32_linear ) ) ) + if ( tau == tau_syn || ( fabs( tau - tau_syn ) < 0.1 && dev_P32 > 2.0 * fabs( P32_linear ) ) ) { return P32_singular; } @@ -103,10 +111,10 @@ user_m2::~user_m2() FreeParamArr(); } -int user_m2::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +user_m2::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_user_m2_model; n_scal_var_ = N_SCAL_VAR; @@ -114,46 +122,46 @@ int user_m2::Init(int i_node_0, int n_node, int /*n_port*/, n_scal_param_ = N_SCAL_PARAM; n_group_param_ = N_GROUP_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); - group_param_ = new float[N_GROUP_PARAM]; + group_param_ = new float[ N_GROUP_PARAM ]; scal_var_name_ = user_m2_scal_var_name; scal_param_name_ = user_m2_scal_param_name; group_param_name_ = user_m2_group_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", 0.0 ); // in mV - SetScalVar(0, n_node, "refractory_step", 0 ); + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", 0.0 ); // in mV + SetScalVar( 0, n_node, "refractory_step", 0 ); - SetGroupParam("tau_m", 10.0); - SetGroupParam("C_m", 250.0); - SetGroupParam("E_L", -65.0); - SetGroupParam("Theta_rel", 15.0); - SetGroupParam("V_reset_rel", 0.0); - SetGroupParam("tau_syn", 0.5); - SetGroupParam("t_ref", 2.0); + SetGroupParam( "tau_m", 10.0 ); + SetGroupParam( "C_m", 250.0 ); + SetGroupParam( "E_L", -65.0 ); + SetGroupParam( "Theta_rel", 15.0 ); + SetGroupParam( "V_reset_rel", 0.0 ); + SetGroupParam( "tau_syn", 0.5 ); + SetGroupParam( "t_ref", 2.0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, 
sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int user_m2::Update(long long it, double t1) +int +user_m2::Update( long long it, double t1 ) { // std::cout << "user_m2 neuron update\n"; float h = time_resolution_; @@ -161,21 +169,32 @@ int user_m2::Update(long long it, double t1) float P22 = exp( -h / tau_m_ ); float P21 = h_propagator_32( tau_syn_, tau_m_, C_m_, h ); float P20 = tau_m_ / C_m_ * ( 1.0 - P22 ); - int n_refractory_steps = int(round(t_ref_ / h)); + int n_refractory_steps = int( round( t_ref_ / h ) ); + + user_m2_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( n_node_, + i_node_0_, + var_arr_, + param_arr_, + n_var_, + n_param_, + Theta_rel_, + V_reset_rel_, + n_refractory_steps, + P11, + P22, + P21, + P20 ); + // gpuErrchk( cudaDeviceSynchronize() ); - user_m2_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_, - Theta_rel_, V_reset_rel_, n_refractory_steps, P11, P22, P21, P20 ); - //gpuErrchk( cudaDeviceSynchronize() ); - return 0; } -int user_m2::Free() +int +user_m2::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); delete[] group_param_; - + return 0; } diff --git a/src/user_m2_psc_exp_g.h b/src/user_m2_psc_exp_g.h index 8515fb415..1128da7be 100644 --- a/src/user_m2_psc_exp_g.h +++ b/src/user_m2_psc_exp_g.h @@ -20,97 +20,76 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/iaf_psc_exp.h - #ifndef USERM2PSCEXPG_H #define 
USERM2PSCEXPG_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include "node_group.h" +#include +#include namespace user_m2_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m_rel, // membrane potential relative to E_L - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. inputs + i_V_m_rel, // membrane potential relative to E_L + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA N_SCAL_PARAM }; -enum GroupParamIndexes { - i_tau_m = 0, // Membrane time constant in ms - i_C_m, // Membrane capacitance in pF - i_E_L, // Resting potential in mV - i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTIAL(!) - // i.e. the real threshold is (E_L_+Theta_rel_) - i_V_reset_rel, // relative reset value of the membrane potential - i_tau_syn, // Time constant of synaptic current in ms - i_t_ref, // Refractory period in ms +enum GroupParamIndexes +{ + i_tau_m = 0, // Membrane time constant in ms + i_C_m, // Membrane capacitance in pF + i_E_L, // Resting potential in mV + i_Theta_rel, // Threshold, RELATIVE TO RESTING POTENTIAL(!) + // i.e. 
the real threshold is (E_L_+Theta_rel_) + i_V_reset_rel, // relative reset value of the membrane potential + i_tau_syn, // Time constant of synaptic current in ms + i_t_ref, // Refractory period in ms N_GROUP_PARAM }; +const std::string user_m2_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m_rel", "refractory_step" }; - -const std::string user_m2_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m_rel", - "refractory_step" -}; - -const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { - "I_e" -}; - -const std::string user_m2_group_param_name[N_GROUP_PARAM] = { - "tau_m", - "C_m", - "E_L", - "Theta_rel", - "V_reset_rel", - "tau_syn", - "t_ref" -}; - -} // namespace - +const std::string user_m2_scal_param_name[ N_SCAL_PARAM ] = { "I_e" }; +const std::string + user_m2_group_param_name[ N_GROUP_PARAM ] = { "tau_m", "C_m", "E_L", "Theta_rel", "V_reset_rel", "tau_syn", "t_ref" }; +} // namespace user_m2_ns class user_m2 : public BaseNeuron { float time_resolution_; - public: +public: ~user_m2(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Calibrate(double /*time_min*/, float time_res) { + + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); + + int + Calibrate( double /*time_min*/, float time_res ) + { time_resolution_ = time_res; return 0; } - - int Update(long long it, double t1); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/user_m2_psc_exp_hc.cu b/src/user_m2_psc_exp_hc.cu index be9be6c65..3100fb986 100644 --- a/src/user_m2_psc_exp_hc.cu +++ b/src/user_m2_psc_exp_hc.cu @@ -20,51 +20,50 @@ * */ - - - - -#include +#include "spike_buffer.h" +#include "user_m2_hc.h" #include +#include #include -#include "user_m2_hc.h" -#include "spike_buffer.h" using namespace user_m2_hc_ns; extern __constant__ float NESTGPUTimeResolution; -#define I_syn var[i_I_syn] -#define V_m_rel var[i_V_m_rel] -#define refractory_step var[i_refractory_step] -#define I_e param[i_I_e] +#define I_syn var[ 
i_I_syn ] +#define V_m_rel var[ i_V_m_rel ] +#define refractory_step var[ i_refractory_step ] +#define I_e param[ i_I_e ] #include "user_m2_hc_params.h" -__global__ void user_m2_hc_Update(int n_node, int i_node_0, - float *var_arr, float *param_arr, - int n_var, int n_param) +__global__ void +user_m2_hc_Update( int n_node, int i_node_0, float* var_arr, float* param_arr, int n_var, int n_param ) { int i_neuron = threadIdx.x + blockIdx.x * blockDim.x; - if (i_neuron 0.0 ) { + if ( i_neuron < n_node ) + { + float* var = var_arr + n_var * i_neuron; + float* param = param_arr + n_param * i_neuron; + + if ( refractory_step > 0.0 ) + { // neuron is absolute refractory refractory_step -= 1.0; } - else { // neuron is not refractory, so evolve V + else + { // neuron is not refractory, so evolve V V_m_rel = V_m_rel * P22 + I_syn * P21 + I_e * P20; } // exponential decaying PSC I_syn *= P11; - - if (V_m_rel >= Theta_rel ) { // threshold crossing - PushSpike(i_node_0 + i_neuron, 1.0); + + if ( V_m_rel >= Theta_rel ) + { // threshold crossing + PushSpike( i_node_0 + i_neuron, 1.0 ); V_m_rel = V_reset_rel; refractory_step = n_refractory_steps; - } + } } } @@ -74,59 +73,60 @@ user_m2_hc::~user_m2_hc() FreeParamArr(); } -int user_m2_hc::Init(int i_node_0, int n_node, int /*n_port*/, - int i_group) +int +user_m2_hc::Init( int i_node_0, int n_node, int /*n_port*/, int i_group ) { - BaseNeuron::Init(i_node_0, n_node, 1 /*n_port*/, i_group); + BaseNeuron::Init( i_node_0, n_node, 1 /*n_port*/, i_group ); node_type_ = i_user_m2_hc_model; n_scal_var_ = N_SCAL_VAR; n_var_ = n_scal_var_; n_scal_param_ = N_SCAL_PARAM; n_param_ = n_scal_param_; - + AllocParamArr(); AllocVarArr(); scal_var_name_ = user_m2_hc_scal_var_name; scal_param_name_ = user_m2_hc_scal_param_name; - SetScalParam(0, n_node, "I_e", 0.0 ); // in pA + SetScalParam( 0, n_node, "I_e", 0.0 ); // in pA - SetScalVar(0, n_node, "I_syn", 0.0 ); - SetScalVar(0, n_node, "V_m_rel", 0.0 ); // in mV - SetScalVar(0, n_node, 
"refractory_step", 0 ); + SetScalVar( 0, n_node, "I_syn", 0.0 ); + SetScalVar( 0, n_node, "V_m_rel", 0.0 ); // in mV + SetScalVar( 0, n_node, "refractory_step", 0 ); // multiplication factor of input signal is always 1 for all nodes float input_weight = 1.0; - CUDAMALLOCCTRL("&port_weight_arr_",&port_weight_arr_, sizeof(float)); - gpuErrchk(cudaMemcpy(port_weight_arr_, &input_weight, - sizeof(float), cudaMemcpyHostToDevice)); + CUDAMALLOCCTRL( "&port_weight_arr_", &port_weight_arr_, sizeof( float ) ); + gpuErrchk( cudaMemcpy( port_weight_arr_, &input_weight, sizeof( float ), cudaMemcpyHostToDevice ) ); port_weight_arr_step_ = 0; port_weight_port_step_ = 0; - + // input spike signal is stored in I_syn - port_input_arr_ = GetVarArr() + GetScalVarIdx("I_syn"); + port_input_arr_ = GetVarArr() + GetScalVarIdx( "I_syn" ); port_input_arr_step_ = n_var_; port_input_port_step_ = 0; return 0; } -int user_m2_hc::Update(long long it, double t1) +int +user_m2_hc::Update( long long it, double t1 ) { // std::cout << "user_m2_hc neuron update\n"; - user_m2_hc_Update<<<(n_node_+1023)/1024, 1024>>> - (n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_); - //gpuErrchk( cudaDeviceSynchronize() ); - + user_m2_hc_Update<<< ( n_node_ + 1023 ) / 1024, 1024 >>>( + n_node_, i_node_0_, var_arr_, param_arr_, n_var_, n_param_ ); + // gpuErrchk( cudaDeviceSynchronize() ); + return 0; } -int user_m2_hc::Free() +int +user_m2_hc::Free() { - FreeVarArr(); + FreeVarArr(); FreeParamArr(); - + return 0; } diff --git a/src/user_m2_psc_exp_hc.h b/src/user_m2_psc_exp_hc.h index 3c772deb8..f2f7216a6 100644 --- a/src/user_m2_psc_exp_hc.h +++ b/src/user_m2_psc_exp_hc.h @@ -20,65 +20,51 @@ * */ - - - - // adapted from: // https://github.com/nest/nest-simulator/blob/master/models/user_m2.h - #ifndef USERM2PSCEXPHC_H #define USERM2PSCEXPHC_H -#include -#include -#include "cuda_error.h" -#include "node_group.h" #include "base_neuron.h" +#include "cuda_error.h" #include "neuron_models.h" - +#include 
"node_group.h" +#include +#include namespace user_m2_hc_ns { -enum ScalVarIndexes { - i_I_syn = 0, // postsynaptic current for exc. inputs - i_V_m_rel, // membrane potential relative to E_L - i_refractory_step, // refractory step counter +enum ScalVarIndexes +{ + i_I_syn = 0, // postsynaptic current for exc. inputs + i_V_m_rel, // membrane potential relative to E_L + i_refractory_step, // refractory step counter N_SCAL_VAR }; -enum ScalParamIndexes { - i_I_e = 0, // External current in pA +enum ScalParamIndexes +{ + i_I_e = 0, // External current in pA N_SCAL_PARAM }; - const std::string user_m2_hc_scal_var_name[N_SCAL_VAR] = { - "I_syn", - "V_m_rel", - "refractory_step" -}; +const std::string user_m2_hc_scal_var_name[ N_SCAL_VAR ] = { "I_syn", "V_m_rel", "refractory_step" }; -const std::string user_m2_hc_scal_param_name[N_SCAL_PARAM] = { - "I_e" -}; +const std::string user_m2_hc_scal_param_name[ N_SCAL_PARAM ] = { "I_e" }; -} // namespace - +} // namespace user_m2_hc_ns class user_m2_hc : public BaseNeuron { - public: +public: ~user_m2_hc(); - - int Init(int i_node_0, int n_neuron, int n_port, int i_group); - - int Update(long long it, double t1); + int Init( int i_node_0, int n_neuron, int n_port, int i_group ); - int Free(); + int Update( long long it, double t1 ); + int Free(); }; - #endif diff --git a/src/user_m2_psc_exp_kernel.h b/src/user_m2_psc_exp_kernel.h index 55fc7356a..775a02a8f 100644 --- a/src/user_m2_psc_exp_kernel.h +++ b/src/user_m2_psc_exp_kernel.h @@ -20,37 +20,36 @@ * */ - - - - #ifndef USERM2PSCEXPKERNEL_H #define USERM2PSCEXPKERNEL_H -#include -#include -#include "spike_buffer.h" #include "node_group.h" +#include "spike_buffer.h" #include "user_m2.h" +#include +#include -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? 
( a ) : ( b ) ) extern __constant__ float NESTGPUTimeResolution; namespace user_m2_ns { -enum ScalVarIndexes { +enum ScalVarIndexes +{ i_V_m = 0, i_w, N_SCAL_VAR }; -enum PortVarIndexes { +enum PortVarIndexes +{ i_I_syn = 0, N_PORT_VAR }; -enum ScalParamIndexes { +enum ScalParamIndexes +{ i_V_th = 0, i_Delta_T, i_g_L, @@ -68,28 +67,24 @@ enum ScalParamIndexes { N_SCAL_PARAM }; -enum PortParamIndexes { +enum PortParamIndexes +{ i_tau_syn = 0, N_PORT_PARAM }; -enum GroupParamIndexes { - i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution - i_h0_rel, // Starting step in ODE integr. relative to time resolution +enum GroupParamIndexes +{ + i_h_min_rel = 0, // Min. step in ODE integr. relative to time resolution + i_h0_rel, // Starting step in ODE integr. relative to time resolution N_GROUP_PARAM }; -const std::string user_m2_scal_var_name[N_SCAL_VAR] = { - "V_m", - "w" -}; +const std::string user_m2_scal_var_name[ N_SCAL_VAR ] = { "V_m", "w" }; -const std::string user_m2_port_var_name[N_PORT_VAR] = { - "I_syn" -}; +const std::string user_m2_port_var_name[ N_PORT_VAR ] = { "I_syn" }; -const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { - "V_th", +const std::string user_m2_scal_param_name[ N_SCAL_PARAM ] = { "V_th", "Delta_T", "g_L", "E_L", @@ -102,157 +97,153 @@ const std::string user_m2_scal_param_name[N_SCAL_PARAM] = { "V_reset", "t_ref", "refractory_step", - "den_delay" -}; + "den_delay" }; -const std::string user_m2_port_param_name[N_PORT_PARAM] = { +const std::string user_m2_port_param_name[ N_PORT_PARAM ] = { "tau_syn", }; -const std::string user_m2_group_param_name[N_GROUP_PARAM] = { - "h_min_rel", - "h0_rel" -}; +const std::string user_m2_group_param_name[ N_GROUP_PARAM ] = { "h_min_rel", "h0_rel" }; // // I know that defines are "bad", but the defines below make the // following equations much more readable. // For every rule there is some exceptions! 
// -#define V_m y[i_V_m] -#define w y[i_w] -#define I_syn(i) y[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] - -#define dVdt dydx[i_V_m] -#define dwdt dydx[i_w] -#define dI_syndt(i) dydx[N_SCAL_VAR + N_PORT_VAR*i + i_I_syn] - -#define V_th param[i_V_th] -#define Delta_T param[i_Delta_T] -#define g_L param[i_g_L] -#define E_L param[i_E_L] -#define C_m param[i_C_m] -#define a param[i_a] -#define b param[i_b] -#define tau_w param[i_tau_w] -#define I_e param[i_I_e] -#define V_peak param[i_V_peak] -#define V_reset param[i_V_reset] -#define t_ref param[i_t_ref] -#define refractory_step param[i_refractory_step] -#define den_delay param[i_den_delay] - -#define tau_syn(i) param[N_SCAL_PARAM + N_PORT_PARAM*i + i_tau_syn] - -#define h_min_rel_ group_param_[i_h_min_rel] -#define h0_rel_ group_param_[i_h0_rel] - - - template //, class DataStruct> -__device__ - void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +#define V_m y[ i_V_m ] +#define w y[ i_w ] +#define I_syn( i ) y[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] + +#define dVdt dydx[ i_V_m ] +#define dwdt dydx[ i_w ] +#define dI_syndt( i ) dydx[ N_SCAL_VAR + N_PORT_VAR * i + i_I_syn ] + +#define V_th param[ i_V_th ] +#define Delta_T param[ i_Delta_T ] +#define g_L param[ i_g_L ] +#define E_L param[ i_E_L ] +#define C_m param[ i_C_m ] +#define a param[ i_a ] +#define b param[ i_b ] +#define tau_w param[ i_tau_w ] +#define I_e param[ i_I_e ] +#define V_peak param[ i_V_peak ] +#define V_reset param[ i_V_reset ] +#define t_ref param[ i_t_ref ] +#define refractory_step param[ i_refractory_step ] +#define den_delay param[ i_den_delay ] + +#define tau_syn( i ) param[ N_SCAL_PARAM + N_PORT_PARAM * i + i_tau_syn ] + +#define h_min_rel_ group_param_[ i_h_min_rel ] +#define h0_rel_ group_param_[ i_h0_rel ] + +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - enum { n_port = 
(NVAR-N_SCAL_VAR)/N_PORT_VAR }; + enum + { + n_port = ( NVAR - N_SCAL_VAR ) / N_PORT_VAR + }; float I_syn_tot = 0.0; - - float V = ( refractory_step > 0 ) ? V_reset : MIN(V_m, V_peak); - for (int i = 0; i 0 ) ? V_reset : MIN( V_m, V_peak ); + for ( int i = 0; i < n_port; i++ ) + { + I_syn_tot += I_syn( i ); } - float V_spike = Delta_T == 0. ? 0. : Delta_T*exp((V - V_th)/Delta_T); + float V_spike = Delta_T == 0. ? 0. : Delta_T * exp( ( V - V_th ) / Delta_T ); - dVdt = ( refractory_step > 0 ) ? 0 : - ( -g_L*(V - E_L - V_spike) + I_syn_tot - w + I_e) / C_m; + dVdt = ( refractory_step > 0 ) ? 0 : ( -g_L * ( V - E_L - V_spike ) + I_syn_tot - w + I_e ) / C_m; // Adaptation current w. - dwdt = (a*(V - E_L) - w) / tau_w; - for (int i=0; i //, class DataStruct> -__device__ - void ExternalUpdate - (double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > //, class DataStruct> +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - if ( V_m < -1.0e3) { // numerical instability - printf("V_m out of lower bound\n"); + if ( V_m < -1.0e3 ) + { // numerical instability + printf( "V_m out of lower bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if ( w < -1.0e6 || w > 1.0e6) { // numerical instability - printf("w out of bound\n"); + if ( w < -1.0e6 || w > 1.0e6 ) + { // numerical instability + printf( "w out of bound\n" ); V_m = V_reset; - w=0; + w = 0; return; } - if (refractory_step > 0.0) { + if ( refractory_step > 0.0 ) + { V_m = V_reset; - if (end_time_step) { + if ( end_time_step ) + { refractory_step -= 1.0; } } - else { - if ( V_m >= V_peak ) { // send spike + else + { + if ( V_m >= V_peak ) + { // send spike int neuron_idx = threadIdx.x + blockIdx.x * blockDim.x; - PushSpike(data_struct.i_node_0_ + neuron_idx, 1.0); + PushSpike( data_struct.i_node_0_ + neuron_idx, 1.0 ); V_m = V_reset; w += b; // spike-driven adaptation - refractory_step = 
(int)round(t_ref/NESTGPUTimeResolution); - if (refractory_step<0) { - refractory_step = 0; + refractory_step = ( int ) round( t_ref / NESTGPUTimeResolution ); + if ( refractory_step < 0 ) + { + refractory_step = 0; } } } } - -}; +}; // namespace user_m2_ns template <> -int user_m2::UpdateNR<0>(long long it, double t1); +int user_m2::UpdateNR< 0 >( long long it, double t1 ); -template -int user_m2::UpdateNR(long long it, double t1) +template < int N_PORT > +int +user_m2::UpdateNR( long long it, double t1 ) { - if (N_PORT == n_port_) { - const int NVAR = user_m2_ns::N_SCAL_VAR - + user_m2_ns::N_PORT_VAR*N_PORT; - const int NPARAM = user_m2_ns::N_SCAL_PARAM - + user_m2_ns::N_PORT_PARAM*N_PORT; + if ( N_PORT == n_port_ ) + { + const int NVAR = user_m2_ns::N_SCAL_VAR + user_m2_ns::N_PORT_VAR * N_PORT; + const int NPARAM = user_m2_ns::N_SCAL_PARAM + user_m2_ns::N_PORT_PARAM * N_PORT; - rk5_.Update(t1, h_min_, rk5_data_struct_); + rk5_.Update< NVAR, NPARAM >( t1, h_min_, rk5_data_struct_ ); } - else { - UpdateNR(it, t1); + else + { + UpdateNR< N_PORT - 1 >( it, t1 ); } return 0; } -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ) { - user_m2_ns::Derivatives(x, y, dydx, param, - data_struct); + user_m2_ns::Derivatives< NVAR, NPARAM >( x, y, dydx, param, data_struct ); } -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct) +template < int NVAR, int NPARAM > +__device__ void +ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ) { - user_m2_ns::ExternalUpdate(x, y, param, - end_time_step, - data_struct); + user_m2_ns::ExternalUpdate< NVAR, NPARAM >( x, y, param, end_time_step, data_struct ); } - #endif diff --git a/src/user_m2_psc_exp_rk5.h 
b/src/user_m2_psc_exp_rk5.h index d4e18656a..69ed2befc 100644 --- a/src/user_m2_psc_exp_rk5.h +++ b/src/user_m2_psc_exp_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM2PSCEXPRK5_H #define USERM2PSCEXPRK5_H struct user_m2_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ); -__device__ -void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); #endif diff --git a/src/user_m2_rk5.h b/src/user_m2_rk5.h index 3c1557591..aa2aa9a1b 100644 --- a/src/user_m2_rk5.h +++ b/src/user_m2_rk5.h @@ -20,32 +20,19 @@ * */ - - - - #ifndef USERM2RK5_H #define USERM2RK5_H struct user_m2_rk5; +template < int NVAR, int NPARAM > +__device__ void Derivatives( double x, float* y, float* dydx, float* param, user_m2_rk5 data_struct ); -template -__device__ -void Derivatives(double x, float *y, float *dydx, float *param, - user_m2_rk5 data_struct); - -template -__device__ -void ExternalUpdate(double x, float *y, float *param, bool end_time_step, - user_m2_rk5 data_struct); +template < int NVAR, int NPARAM > +__device__ void ExternalUpdate( double x, float* y, float* param, bool end_time_step, user_m2_rk5 data_struct ); -__device__ 
-void NodeInit(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeInit( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); -__device__ -void NodeCalibrate(int n_var, int n_param, double x, float *y, - float *param, user_m2_rk5 data_struct); +__device__ void NodeCalibrate( int n_var, int n_param, double x, float* y, float* param, user_m2_rk5 data_struct ); #endif diff --git a/src/utilities.cu b/src/utilities.cu index 2f50e524a..6fdd17ba2 100644 --- a/src/utilities.cu +++ b/src/utilities.cu @@ -20,12 +20,25 @@ * */ -int IntPow(int x, unsigned int p) +int64_t +IntPow( int64_t x, unsigned int p ) { - if (p == 0) return 1; - if (p == 1) return x; - - int tmp = IntPow(x, p/2); - if (p%2 == 0) return tmp * tmp; - else return x * tmp * tmp; + if ( p == 0 ) + { + return 1; + } + if ( p == 1 ) + { + return x; + } + + int64_t tmp = IntPow( x, p / 2 ); + if ( p % 2 == 0 ) + { + return tmp * tmp; + } + else + { + return x * tmp * tmp; + } } diff --git a/src/utilities.h b/src/utilities.h index 5067ff7db..3f9bc1ae8 100644 --- a/src/utilities.h +++ b/src/utilities.h @@ -23,36 +23,82 @@ #ifndef UTILITIES_H #define UTILITIES_H -__device__ __forceinline__ double atomicAddDouble(double* address, double val) +#include "cuda_error.h" + +//// +#include +//// + +__device__ __forceinline__ double +atomicAddDouble( double* address, double val ) { - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); + unsigned long long int* address_as_ull = ( unsigned long long int* ) address; + unsigned long long int old = *address_as_ull, assumed; + do + { + assumed = old; + old = atomicCAS( address_as_ull, assumed, __double_as_longlong( val + 
__longlong_as_double( assumed ) ) ); + } while ( assumed != old ); + return __longlong_as_double( old ); } -template -__device__ __forceinline__ T2 locate(T1 val, T1 *data, T2 n) +template < class T1, class T2 > +__device__ __forceinline__ T2 +locate( T1 val, T1* data, T2 n ) { T2 i_left = 0; T2 i_right = n; - T2 i = (i_left+i_right)/2; - while(i_right-i_left>1) { - if (data[i] > val) i_right = i; - else if (data[i] 1 ) + { + if ( data[ i ] > val ) + { + i_right = i; + } + else if ( data[ i ] < val ) + { + i_left = i; + } + else + { + break; + } + i = ( i_left + i_right ) / 2; } return i; } -int IntPow(int x, unsigned int p); - +int64_t IntPow( int64_t x, unsigned int p ); + +template < class T > +T* +sortArray( T* h_arr, int n_elem ) +{ + // allocate unsorted and sorted array in device memory + T* d_arr_unsorted; + T* d_arr_sorted; + CUDAMALLOCCTRL( "&d_arr_unsorted", &d_arr_unsorted, n_elem * sizeof( T ) ); + CUDAMALLOCCTRL( "&d_arr_sorted", &d_arr_sorted, n_elem * sizeof( T ) ); + gpuErrchk( cudaMemcpy( d_arr_unsorted, h_arr, n_elem * sizeof( T ), cudaMemcpyHostToDevice ) ); + void* d_storage = NULL; + size_t storage_bytes = 0; + // Determine temporary storage requirements for sorting source indexes + //// + cub::DeviceRadixSort::SortKeys( d_storage, storage_bytes, d_arr_unsorted, d_arr_sorted, n_elem ); + //// + + // Allocate temporary storage for sorting + CUDAMALLOCCTRL( "&d_storage", &d_storage, storage_bytes ); + // Run radix sort + //// + cub::DeviceRadixSort::SortKeys( d_storage, storage_bytes, d_arr_unsorted, d_arr_sorted, n_elem ); + //// + + CUDAFREECTRL( "d_storage", d_storage ); + CUDAFREECTRL( "d_arr_unsorted", d_arr_unsorted ); + + return d_arr_sorted; +} + #endif diff --git a/tmp1.sh b/tmp1.sh new file mode 100644 index 000000000..c7a73f886 --- /dev/null +++ b/tmp1.sh @@ -0,0 +1,2 @@ +export NESTGPU_LIB=/home/golosio/nest-gpu-git/golosio/nest-gpu/lib1/libnestgpu.so +export 
PYTHONPATH=/home/golosio/nest-gpu-git/golosio/nest-gpu/pythonlib/:$PYTHONPATH