set -e

# Absolute path of the directory containing this script.
# Assignment is kept separate from `export` so that a failing `cd`
# is not masked by the exit status of `export` (it aborts under `set -e`).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"
export SCRIPT_DIR

# REALM_DIR may be supplied by the caller's environment; when it is unset or
# empty it defaults to the directory two levels above SCRIPT_DIR.
export REALM_DIR="${REALM_DIR:-$(dirname "$(dirname "$SCRIPT_DIR")")}"

# Load the helpers that live next to this script.
source "$SCRIPT_DIR/common.sh"
2525
2626# Prepare output directory
# mk_output: create today's dated output directory and export its location.
# Globals:
#   REALM_DIR       (read)     fallback root when REALM_HOST_DIR is unset/empty
#   REALM_HOST_DIR  (exported) root under which the dated directory is created
#   DATE            (written)  current date formatted as YYYY/MM/DD
#   HOST_OUT_DIR    (exported) "$REALM_HOST_DIR/$DATE"; created if missing
#   CMD_OUT_DIR     (exported) same path as HOST_OUT_DIR
# Outputs:
#   One line on stdout naming the directory that will receive the output.
function mk_output() {
  # Fall back to REALM_DIR, but refuse an empty root: that would silently
  # create the dated tree directly under "/".
  export REALM_HOST_DIR="${REALM_HOST_DIR:-${REALM_DIR:?REALM_DIR must be set}}"
  DATE="$(date +%Y/%m/%d)"
  mkdir -p "$REALM_HOST_DIR/$DATE"
  export HOST_OUT_DIR="$REALM_HOST_DIR/$DATE"
  echo "Redirecting stdout, stderr and logs to $HOST_OUT_DIR"
  export CMD_OUT_DIR="$HOST_OUT_DIR"
}
@@ -47,6 +48,7 @@ if [[ $# -lt 2 || ! "$1" =~ ^(1/)?[0-9]+(:[0-9]+)?$ ]]; then
4748 echo " TIMELIMIT : how much time to request for the job, in minutes (defaut: 60)"
4849 echo " NUM_GPUS : number of gpus per node"
4950 echo " ACCOUNT : account name"
51+ echo " IMAGE: path to container image in container based clusteres"
5052 exit
5153fi
5254
@@ -132,6 +134,42 @@ elif [[ "$PLATFORM" == "computelab" || "$PLATFORM" == "oberon" ]]; then
132134
133135 sbatch_cmd+=(" ${slurm_cmd[@]} " )
134136 submit " ${sbatch_cmd[@]} "
137+ elif [[ " $PLATFORM " == " eos" ]]; then
138+ mk_output
139+ NODE_RATIO=" $RATIO_OF_NODE_USED / $RANKS_PER_NODE "
140+ NUMAS_PER_NODE=2
141+ RAM_PER_NUMA=950000
142+ # Calculate available resources per OpenMP group
143+ NUM_OMPS=$(( NUMAS_PER_NODE * $NODE_RATIO ))
144+ if [[ $NUM_OMPS -lt 1 ]]; then
145+ NUM_OMPS=1
146+ RAM_PER_OMP=$(( RAM_PER_NUMA * NUMAS_PER_NODE * $NODE_RATIO ))
147+ else
148+ RAM_PER_OMP=" $RAM_PER_NUMA "
149+ fi
150+ WORK_RAM=$(( NUM_OMPS * RAM_PER_OMP))
151+
152+ TIME=" $( date +%H%M%S) "
153+ NUM_GPUS_PER_RANK=$(( $NUM_GPUS / $RANKS_PER_NODE ))
154+
155+ export GASNET_AM_CREDITS_PP=16
156+ export GASNET_IBV_PORTS=mlx5_0+mlx5_3+mlx5_4+mlx5_5+mlx5_6+mlx5_9+mlx5_10+mlx5_11
157+
158+ # this is set in the docker image
159+ unset REALM_UCP_BOOTSTRAP_PLUGIN
160+
161+ slurm_cmd=(" $SCRIPT_DIR /job.slurm" srun -N " $NUM_RANKS " --mpi=pmix -n " $NUM_RANKS " --ntasks-per-node " $RANKS_PER_NODE " --mem " $(( WORK_RAM + 4000 )) " M --container-image " $IMAGE " " $@ " )
162+
163+ if [[ -z " ${NODE_LIST} " ]]; then
164+ sbatch_cmd=(sbatch -p " $QUEUE " -t " $TIMELIMIT " --exclusive -N " $NUM_NODES "
165+ -o " $HOST_OUT_DIR /$JOB_NAME -$TIME .txt" -A " $ACCOUNT " -J " $JOB " )
166+ else
167+ sbatch_cmd=(sbatch -p " $QUEUE " -t " $TIMELIMIT " --exclusive -w " $NODE_LIST " -N " $NUM_NODES "
168+ -o " $HOST_OUT_DIR /$JOB_NAME -$TIME .txt" -A " $ACCOUNT " -J " $JOB " )
169+ fi
170+
171+ sbatch_cmd+=(" ${slurm_cmd[@]} " )
172+ submit " ${sbatch_cmd[@]} "
135173fi
136174
137175# Wait for batch job to start
0 commit comments